Skip to main content

dbmd_core/
stats.rs

1//! `stats` — store overview, **computed on demand** (a SWEEP, like `du` —
2//! never a maintained or precomputed cache).
3//!
4//! Serves both the human (how big is my brain, what's the shape) and the agent
5//! (orientation). Deliberately excludes graph density / degree / top-linked
6//! analytics — low agent value, and a human who wants graph metrics opens the
7//! store in Obsidian, so we never build the full graph just for stats.
8
9use std::collections::{BTreeMap, HashSet};
10use std::path::{Path, PathBuf};
11
12use regex::Regex;
13
14use crate::store::{Layer, Store};
15
16/// A point-in-time overview of a store. Pure data; the CLI formats it to text
17/// or JSON.
18#[derive(Debug, Clone, Default, PartialEq)]
19pub struct Stats {
20    /// Total content-file count across all layers.
21    pub total_files: usize,
22    /// File count per layer.
23    pub files_per_layer: BTreeMap<Layer, usize>,
24    /// Total size on disk, in bytes.
25    pub total_size_bytes: u64,
26    /// Count per `type:` value (the type distribution).
27    pub type_distribution: BTreeMap<String, usize>,
28    /// Number of orphan files (no incoming and no outgoing wiki-links).
29    pub orphan_count: usize,
30    /// Number of broken wiki-links (target file doesn't exist).
31    pub broken_link_count: usize,
32    /// Top types by count, descending (ties broken by type name ascending).
33    pub top_types: Vec<(String, usize)>,
34}
35
/// Maximum number of entries kept in [`Stats::top_types`].
const TOP_TYPES_LIMIT: usize = 10;
38
39/// One content file discovered by the SWEEP, with everything `stats` needs:
40/// where it lives, how big it is, its declared `type`, and the wiki-link
41/// targets it emits (store-relative, `.md` stripped, short-form excluded).
42struct FileFacts {
43    /// Store-relative path *without* the `.md` extension — the node id used to
44    /// resolve wiki-links and detect orphans.
45    node_id: PathBuf,
46    /// The layer this file lives under.
47    layer: Layer,
48    /// File size on disk, in bytes.
49    size_bytes: u64,
50    /// The declared `type:`, if the frontmatter has one.
51    type_: Option<String>,
52    /// Every wiki-link target this file emits, store-relative with any trailing
53    /// `.md` stripped, in source order (not deduped, short-form included).
54    /// Resolved against the complete node set in a second pass.
55    raw_targets: Vec<PathBuf>,
56}
57
58impl FileFacts {
59    /// The subset of [`raw_targets`](FileFacts::raw_targets) that could resolve
60    /// to a store node: full store-relative paths. Short-form targets (no `/`)
61    /// are dropped — they're a `WIKI_LINK_SHORT_FORM` validation error, not a
62    /// graph edge, so stats neither counts them as broken nor lets them wire a
63    /// file out of orphan status.
64    fn resolvable_targets(&self) -> impl Iterator<Item = &PathBuf> {
65        self.raw_targets.iter().filter(|t| is_full_path(t))
66    }
67}
68
69/// **SWEEP.** Walk the store once and compute its [`Stats`]. Run occasionally
70/// (overview / orientation), never on the interactive loop.
71pub fn compute(store: &Store) -> crate::Result<Stats> {
72    let link_re = wiki_link_regex();
73
74    // First pass: walk every layer once, recording per-file facts and the set
75    // of node ids that exist on disk. Link resolution waits for the second
76    // pass, once every node's existence is known.
77    let mut existing_nodes: HashSet<PathBuf> = HashSet::new();
78    let mut facts: Vec<FileFacts> = Vec::new();
79
80    for layer in Layer::all() {
81        let layer_root = store.root.join(layer_dir_name(layer));
82        for abs in walk_layer_content_files(&layer_root)? {
83            let rel = abs.strip_prefix(&store.root).unwrap_or(&abs).to_path_buf();
84            let node_id = strip_md(&rel);
85            existing_nodes.insert(node_id.clone());
86
87            let size_bytes = std::fs::metadata(&abs).map(|m| m.len()).unwrap_or(0);
88            let text = std::fs::read_to_string(&abs).unwrap_or_default();
89            let type_ = parse_type(&text);
90            let raw_targets = extract_link_targets(&text, &link_re);
91
92            facts.push(FileFacts {
93                node_id,
94                layer,
95                size_bytes,
96                type_,
97                raw_targets,
98            });
99        }
100    }
101
102    // Second pass: classify every file's links against the complete node set,
103    // counting broken links (full-path targets with no file on disk) and
104    // recording which nodes receive an incoming edge. Short-form targets are a
105    // validation error elsewhere, not a stats edge, so they're skipped here:
106    // they neither wire a file in nor count as broken.
107    let mut stats = Stats::default();
108    let mut linked_to: HashSet<PathBuf> = HashSet::new();
109    for file in &facts {
110        for target in file.resolvable_targets() {
111            // A self-link is not a graph edge — skip it (matches `graph::orphans`,
112            // so the two surfaces agree on whether a self-only-linking file is an
113            // orphan). It is neither incoming nor broken.
114            if target == &file.node_id {
115                continue;
116            }
117            if existing_nodes.contains(target) {
118                linked_to.insert(target.clone());
119            } else if target_resolves_on_disk(&store.root, target) {
120                // A link to an existing non-`.md` source artifact (a `.eml`,
121                // `.pdf`, …) is a live edge, not a broken one — `sources/` holds
122                // such files by design and `graph` resolves them on disk. The
123                // target has no `.md` node, so it can't be `linked_to` (no `.md`
124                // file is un-orphaned by it), but it must NOT be counted broken.
125            } else {
126                // Broken links count occurrences, not distinct targets.
127                stats.broken_link_count += 1;
128            }
129        }
130    }
131
132    // Third pass: roll the per-file facts up into the aggregate Stats. A file is
133    // an orphan iff it has neither a resolvable outgoing edge nor an incoming one.
134    for file in &facts {
135        stats.total_files += 1;
136        *stats.files_per_layer.entry(file.layer).or_insert(0) += 1;
137        stats.total_size_bytes += file.size_bytes;
138
139        if let Some(t) = &file.type_ {
140            *stats.type_distribution.entry(t.clone()).or_insert(0) += 1;
141        }
142
143        let has_outgoing = file.resolvable_targets().any(|t| {
144            t != &file.node_id
145                && (existing_nodes.contains(t) || target_resolves_on_disk(&store.root, t))
146        });
147        let has_incoming = linked_to.contains(&file.node_id);
148        if !has_outgoing && !has_incoming {
149            stats.orphan_count += 1;
150        }
151    }
152
153    stats.top_types = top_types(&stats.type_distribution, TOP_TYPES_LIMIT);
154
155    Ok(stats)
156}
157
158/// On-disk folder name for a layer. Local copy so `stats` doesn't couple to
159/// [`Layer::dir_name`].
160fn layer_dir_name(layer: Layer) -> &'static str {
161    match layer {
162        Layer::Sources => "sources",
163        Layer::Records => "records",
164    }
165}
166
167/// Recursively collect the `.md` **content** files under one layer root,
168/// skipping hidden entries (`.git`, dotfiles), the layer's immediate `log/`
169/// archive directory, and the `index.md` catalog meta files. Returns absolute
170/// paths. A missing layer root yields an empty list (a store need not have
171/// both layers).
172///
173/// Only an immediate child of the layer named `log` (`sources/log/`) is the
174/// rotation-archive directory and skipped — matching `render::tree`, which
175/// skips `log` only as an immediate layer child, and the indexer, which indexes
176/// `log` dirs nested deeper. A directory named `log` nested under a type-folder
177/// (`sources/emails/log/`) is ordinary content and is counted, so stats agrees
178/// with `tree` / `index` / `query` instead of making the subtree invisible.
179fn walk_layer_content_files(layer_root: &Path) -> crate::Result<Vec<PathBuf>> {
180    let mut out = Vec::new();
181    if !layer_root.is_dir() {
182        return Ok(out);
183    }
184    let walker = walkdir::WalkDir::new(layer_root)
185        .into_iter()
186        .filter_entry(|e| {
187            // Skip hidden dirs/files. `depth()` is relative to the layer root
188            // (root = 0), so the layer's immediate `log/` archive is depth 1.
189            let name = e.file_name().to_string_lossy();
190            if name.starts_with('.') {
191                return false;
192            }
193            if e.file_type().is_dir() && name == "log" && e.depth() == 1 {
194                return false;
195            }
196            true
197        });
198    for entry in walker {
199        let entry = entry.map_err(|e| {
200            crate::Error::Io(
201                e.into_io_error()
202                    .unwrap_or_else(|| std::io::Error::other("walk error")),
203            )
204        })?;
205        if !entry.file_type().is_file() {
206            continue;
207        }
208        let path = entry.path();
209        let name = entry.file_name().to_string_lossy();
210        // Content files are `.md`; `index.md` is a meta catalog file, not
211        // content, and `index.jsonl` / other sidecars aren't `.md` at all.
212        if !name.ends_with(".md") || name == "index.md" {
213            continue;
214        }
215        out.push(path.to_path_buf());
216    }
217    out.sort();
218    Ok(out)
219}
220
221/// The wiki-link matcher: `[[target]]` or `[[target|display]]`. Captures the
222/// target (group 1), excluding `]` and `|`. Anchored on the literal brackets so
223/// it ignores `[markdown](links)`.
224fn wiki_link_regex() -> Regex {
225    // `[^\[\]|]+` keeps the target free of brackets and the display pipe.
226    Regex::new(r"\[\[([^\[\]|]+)(?:\|[^\]]*)?\]\]").expect("static wiki-link regex is valid")
227}
228
229/// Every wiki-link target in a file (frontmatter + body), trimmed, with any
230/// trailing `.md` removed. Order-preserving (frontmatter targets first, then
231/// body); not deduped. stats deliberately counts links in BOTH regions as edges.
232///
233/// The frontmatter block and the body are scanned **separately** so fenced-code
234/// state can never leak between them. YAML frontmatter has no markdown code
235/// fences, so every `[[...]]` there is a real edge and is extracted with no
236/// fence tracking. The body is scanned with fresh fence tracking (started from
237/// no open fence), so a `[[...]]` that lives only inside a body code fence is
238/// still ignored — it is illustrative syntax, not a graph edge, mirroring
239/// `validate::extract_wiki_links` / `store::extract_edge_targets`.
240///
241/// The old single-pass-over-whole-file scan wrongly assumed "frontmatter never
242/// carries code fences": a stray ``` (or `~~~`) line inside a frontmatter value
243/// (e.g. a block-scalar field) opened a fence that swallowed every subsequent
244/// body `[[...]]`, dropping real edges and mis-marking files as orphans.
245fn extract_link_targets(text: &str, re: &Regex) -> Vec<PathBuf> {
246    let (frontmatter, body) = split_frontmatter_and_body(text);
247    let mut out = Vec::new();
248    // (a) Frontmatter: every `[[...]]` is a real edge — no fence tracking. YAML
249    // has no markdown code fences, so a ``` line here is just text, never a fence.
250    if let Some(fm) = frontmatter {
251        for line in fm.lines() {
252            collect_links_on_line(line, re, &mut out);
253        }
254    }
255    // (b) Body: fence-aware, started fresh so no state is inherited from the
256    // frontmatter block above. Track the open fence as `(fence byte, run length)`,
257    // not a single boolean: an inner fence of the *other* character (a `~~~` line
258    // inside an open ``` block, or vice versa) — or a shorter run — is content,
259    // and must NOT close the block. A naive toggle inverts the fence state on
260    // such a line and then mis-classifies every link for the rest of the body.
261    // Mirrors `render`'s `opening_fence` / `is_closing_fence`.
262    let mut fence: Option<(u8, usize)> = None;
263    for line in body.lines() {
264        let content = line.trim_end_matches(['\n', '\r']);
265        if let Some(f) = fence {
266            if is_closing_fence(content, f) {
267                fence = None;
268            }
269            continue;
270        }
271        if let Some(opened) = opening_fence(content) {
272            fence = Some(opened);
273            continue;
274        }
275        collect_links_on_line(line, re, &mut out);
276    }
277    out
278}
279
280/// Push every wiki-link target found on one line into `out` (trimmed,
281/// `.md`-stripped). Shared by the frontmatter and body scans in
282/// [`extract_link_targets`].
283fn collect_links_on_line(line: &str, re: &Regex, out: &mut Vec<PathBuf>) {
284    for cap in re.captures_iter(line) {
285        if let Some(m) = cap.get(1) {
286            let raw = m.as_str().trim();
287            out.push(strip_md(Path::new(raw)));
288        }
289    }
290}
291
292/// Split a file into `(frontmatter YAML, body)`. The frontmatter is the text
293/// between a leading `---` line (the very first line, the universal frontmatter
294/// contract) and its closing `---`; the body is everything after that closing
295/// fence. A file with no valid leading frontmatter block yields `(None, text)` —
296/// the whole text is body — so files with no frontmatter (and a literal `---`
297/// that is content, not a fence) keep their prior whole-file body scan.
298///
299/// Operates on byte offsets into the original `text` so both returned slices
300/// borrow it; the frontmatter slice is offset-equivalent to
301/// [`frontmatter_block`]'s string, and the body picks up immediately after the
302/// closing `---` line's newline.
303fn split_frontmatter_and_body(text: &str) -> (Option<&str>, &str) {
304    // Normalize away a leading BOM, but require `---` as the first line.
305    let stripped = text.strip_prefix('\u{feff}').unwrap_or(text);
306    // The opening fence must be the very first line: a line whose trimmed form is
307    // `---` (trailing whitespace tolerated, matching the prior `frontmatter_block`
308    // and the universal contract), followed by a line break (or EOF). A bare
309    // `---` body line that isn't the first line is not frontmatter.
310    let (first_line, after_first) = match stripped.find('\n') {
311        Some(nl) => (&stripped[..nl], &stripped[nl + 1..]),
312        None => (stripped, ""),
313    };
314    if first_line.trim_end_matches('\r').trim_end() != "---" {
315        return (None, text);
316    }
317    let after_open = after_first;
318    // Scan lines from just after the opening fence for the closing `---`.
319    let mut cursor = after_open;
320    let fm_start = after_open;
321    loop {
322        let (line, tail, had_newline) = match cursor.find('\n') {
323            Some(nl) => (&cursor[..nl], &cursor[nl + 1..], true),
324            None => (cursor, "", false),
325        };
326        if line.trim_end_matches('\r').trim_end() == "---" {
327            // Frontmatter is everything from `fm_start` up to (not including)
328            // this closing `---` line; the body is everything after it.
329            let fm_len = line_offset(fm_start, line);
330            return (Some(&fm_start[..fm_len]), tail);
331        }
332        if !had_newline {
333            // Reached EOF with no closing fence: not a valid frontmatter block.
334            return (None, text);
335        }
336        cursor = tail;
337    }
338}
339
/// Number of bytes between the start of `base` and the start of `line`.
/// `line` must be a sub-slice of the same buffer as `base`, starting at or
/// after it. Used to measure the frontmatter span from its first byte to the
/// closing-fence line.
fn line_offset(base: &str, line: &str) -> usize {
    let base_addr = base.as_ptr() as usize;
    let line_addr = line.as_ptr() as usize;
    line_addr - base_addr
}
346
/// Decide whether `line` opens a fenced code block and, if it does, return the
/// fence as `(fence byte, run length)`.
///
/// A line opens a fence when, after at most three leading spaces, it starts
/// with a run of three or more backticks or three or more tildes. For a
/// backtick fence the remainder of the line (the info string) must not contain
/// a backtick. Anything else returns `None`.
fn opening_fence(line: &str) -> Option<(u8, usize)> {
    let unindented = line.trim_start_matches(' ');
    if line.len() - unindented.len() > 3 {
        return None;
    }
    let marker = match unindented.as_bytes().first() {
        Some(&byte) if byte == b'`' || byte == b'~' => byte,
        _ => return None,
    };
    // Whatever follows the run of fence characters is the info string.
    let info = unindented.trim_start_matches(marker as char);
    let run = unindented.len() - info.len();
    if run < 3 {
        return None;
    }
    if marker == b'`' && info.contains('`') {
        return None;
    }
    Some((marker, run))
}
370
/// Report whether `line` closes the open fence `fence = (byte, len)`.
///
/// The line closes it when, after at most three leading spaces, it holds a run
/// of the *same* fence character that is at least `len` long, followed by
/// nothing but whitespace. A run of the other fence character, a shorter run,
/// or any trailing text leaves the block open.
fn is_closing_fence(line: &str, fence: (u8, usize)) -> bool {
    let (marker, open_len) = fence;
    let unindented = line.trim_start_matches(' ');
    let indent = line.len() - unindented.len();
    let trailing = unindented.trim_start_matches(marker as char);
    let run = unindented.len() - trailing.len();
    indent <= 3 && run >= open_len && trailing.trim().is_empty()
}
387
/// Return `path` without one trailing `.md`; a path that does not end in
/// `.md` is returned unchanged. The suffix test is made on the path's lossy
/// string form.
fn strip_md(path: &Path) -> PathBuf {
    let lossy = path.to_string_lossy();
    lossy
        .strip_suffix(".md")
        .map(PathBuf::from)
        .unwrap_or_else(|| path.to_path_buf())
}
396
/// True if a wiki-link target is a full store-relative path: its first
/// component is a recognized layer directory (`sources` or `records`) and at
/// least one more component follows it.
///
/// Everything else is false: short-form targets like `sarah-chen`, a bare
/// layer name like `sources`, multi-segment targets whose first segment is not
/// a layer like `contacts/sarah-chen` (a missing layer prefix), and targets
/// that begin with `/`, `./` or `../`. Doctrine: only true store-relative
/// paths resolve to a node.
///
/// This mirrors `validate::is_full_store_path` so `stats.broken_link_count`
/// agrees with `validate`'s `WIKI_LINK_BROKEN` total: a non-layer target like
/// `[[contacts/sarah]]` is a short-form error in `validate` (never broken), and
/// must likewise be excluded here rather than counted as a broken edge.
fn is_full_path(target: &Path) -> bool {
    let mut parts = target.components();
    // The leading component must be a plain name, not `/`, `.` or `..`.
    let Some(std::path::Component::Normal(first)) = parts.next() else {
        return false;
    };
    let has_rest = parts.next().is_some();
    matches!(first.to_string_lossy().as_ref(), "sources" | "records") && has_rest
}
417
/// Report whether `target` stays inside the store when joined onto the store
/// root. It does when no component is `..` (`ParentDir`), an absolute root
/// (`RootDir`) or a platform prefix; plain names and a harmless `.` (`CurDir`)
/// are allowed. [`target_resolves_on_disk`] uses this as its gate before any
/// `join` or on-disk probe.
fn is_within_store_target(target: &Path) -> bool {
    use std::path::Component;

    let escapes = target.components().any(|part| match part {
        Component::Normal(_) | Component::CurDir => false,
        Component::ParentDir | Component::RootDir | Component::Prefix(_) => true,
    });
    !escapes
}
432
433/// True if a full-path wiki-link `target` (already `.md`-stripped, store-
434/// relative) resolves to a real **non-`.md`** file on disk — a source artifact
435/// like a `.eml` or `.pdf` under `sources/`. Called only after the `.md` node
436/// set has already been checked, so this exists to reconcile stats with `graph`
437/// (which resolves on disk) and `validate`: a link to an existing source file
438/// is a live edge, never a broken link or an orphan-maker.
439///
440/// Two on-disk shapes are recognized, mirroring `graph::resolve_existing` plus
441/// the bare-stem case sources use:
442///
443/// - the target as written is itself a real file (`[[sources/emails/msg.eml]]`
444///   → `sources/emails/msg.eml`);
445/// - the target is a bare stem and a sibling file shares that stem with a
446///   non-`.md` extension (`[[sources/emails/msg]]` → `sources/emails/msg.eml`).
447///
448/// A bare `.md` target is *not* handled here (an existing `.md` file is already
449/// a node in `existing_nodes`); this is strictly the non-`.md` source case.
450///
451/// **Containment gate.** A target that escapes the store root (any `..`,
452/// absolute, or platform-prefix component) is never probed: it returns `false`
453/// before any `join`/`is_file`/`read_dir`, so `[[sources/../../secret]]` can
454/// never reach the filesystem as a live edge or existence oracle outside the
455/// store. This mirrors `graph::is_within_store_target` and validate's
456/// `is_safe_store_relative_path` (which reject `..` before any probe), keeping
457/// the broken-link surface in agreement: an escaping target is counted broken
458/// (validate's `WIKI_LINK_BROKEN`), never silently treated as resolved.
459fn target_resolves_on_disk(store_root: &Path, target: &Path) -> bool {
460    // Reject any non-`Normal` component (`..`, RootDir, Prefix) up front — never
461    // let a wiki-link turn a stats probe into a filesystem escape.
462    if !is_within_store_target(target) {
463        return false;
464    }
465    // The target as written points at a real file (e.g. an explicit `.eml`).
466    let literal = store_root.join(target);
467    if literal.is_file() {
468        return true;
469    }
470    // Bare-stem case: look for a sibling `<stem>.<ext>` with a non-`.md`
471    // extension in the target's parent directory. Restricted to the bare form
472    // (no extension on the target) so an explicit but missing `.pdf` link still
473    // reads as broken rather than silently matching a different file.
474    if target.extension().is_some() {
475        return false;
476    }
477    let stem = match target.file_name() {
478        Some(name) => name,
479        None => return false,
480    };
481    let parent_abs = store_root.join(match target.parent() {
482        Some(p) => p,
483        None => return false,
484    });
485    let entries = match std::fs::read_dir(&parent_abs) {
486        Ok(e) => e,
487        Err(_) => return false,
488    };
489    for entry in entries.flatten() {
490        let path = entry.path();
491        if !path.is_file() {
492            continue;
493        }
494        // Same stem, and an extension that is present and not `.md`.
495        if path.file_stem() == Some(stem) {
496            match path.extension().and_then(|e| e.to_str()) {
497                Some("md") | None => continue,
498                Some(_) => return true,
499            }
500        }
501    }
502    false
503}
504
505/// Read the `type:` value from a file's leading YAML frontmatter block, if the
506/// file has one. Returns `None` when there's no frontmatter or no `type` key.
507/// Self-contained (does not route through the crate's parser): split on the
508/// `---` fences, parse the block as a YAML mapping, read `type` as a string.
509fn parse_type(text: &str) -> Option<String> {
510    let yaml = frontmatter_block(text)?;
511    let value: serde_norway::Value = serde_norway::from_str(&yaml).ok()?;
512    let mapping = value.as_mapping()?;
513    let type_val = mapping.get(serde_norway::Value::String("type".to_string()))?;
514    let s = type_val.as_str()?.trim();
515    if s.is_empty() {
516        None
517    } else {
518        Some(s.to_string())
519    }
520}
521
522/// Extract the raw YAML between a leading `---` fence and its closing `---`.
523/// The opening fence must be the very first line of the file (the universal
524/// frontmatter contract: frontmatter is the first thing in the file). Delegates
525/// to [`split_frontmatter_and_body`] so the frontmatter boundary is computed in
526/// exactly one place (the type-parse and the link-scan never disagree on where
527/// frontmatter ends).
528fn frontmatter_block(text: &str) -> Option<String> {
529    split_frontmatter_and_body(text).0.map(str::to_string)
530}
531
/// Rank a type distribution and keep at most `limit` entries: highest count
/// first, with equal counts ordered by type name ascending.
fn top_types(dist: &BTreeMap<String, usize>, limit: usize) -> Vec<(String, usize)> {
    let mut ranked: Vec<(&String, usize)> =
        dist.iter().map(|(name, &count)| (name, count)).collect();
    // Explicit two-key comparison: count descending, then name ascending.
    ranked.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(b.0)));
    ranked
        .into_iter()
        .take(limit)
        .map(|(name, count)| (name.clone(), count))
        .collect()
}
542
543#[cfg(test)]
544mod tests {
545    use super::*;
546    use crate::parser::Config;
547    use std::fs;
548    use tempfile::TempDir;
549
550    /// Build a `Store` rooted at a fresh tempdir with an empty `DB.md` marker.
551    /// Bypasses `Store::open` by constructing the struct directly —
552    /// `stats::compute` only reads `store.root`.
553    fn temp_store() -> (TempDir, Store) {
554        let dir = TempDir::new().expect("tempdir");
555        fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").expect("write DB.md");
556        let store = Store {
557            root: dir.path().to_path_buf(),
558            config: Config::default(),
559        };
560        (dir, store)
561    }
562
563    /// Like [`temp_store`], but roots the store one level *inside* the tempdir
564    /// (`<tempdir>/store`) so `store.root.parent()` is the test's own private
565    /// tempdir rather than the shared OS temp root. Tests that plant a file
566    /// "above the store root" must use this — writing into `store.root.parent()`
567    /// of a top-level `TempDir` lands in `$TMPDIR`, which is shared across every
568    /// parallel test (and across test binaries under `cargo test --workspace`),
569    /// so two such tests collide on the same path and race.
570    fn temp_store_nested() -> (TempDir, Store) {
571        let dir = TempDir::new().expect("tempdir");
572        let root = dir.path().join("store");
573        fs::create_dir_all(&root).expect("create store root");
574        fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n").expect("write DB.md");
575        let store = Store {
576            root,
577            config: Config::default(),
578        };
579        (dir, store)
580    }
581
582    /// Write a content file at a store-relative path, creating parent dirs.
583    fn write_rel(store: &Store, rel: &str, contents: &str) {
584        let abs = store.root.join(rel);
585        if let Some(parent) = abs.parent() {
586            fs::create_dir_all(parent).expect("mkdir parents");
587        }
588        fs::write(abs, contents).expect("write content file");
589    }
590
591    /// A minimal content file body: frontmatter with the given type, no links.
592    fn doc(type_: &str, summary: &str) -> String {
593        format!("---\ntype: {type_}\nsummary: \"{summary}\"\n---\n\nbody\n")
594    }
595
596    #[test]
597    fn empty_store_is_all_zeros() {
598        let (_d, store) = temp_store();
599        let s = compute(&store).expect("compute");
600        assert_eq!(s.total_files, 0);
601        assert_eq!(s.total_size_bytes, 0);
602        assert!(s.files_per_layer.is_empty());
603        assert!(s.type_distribution.is_empty());
604        assert_eq!(s.orphan_count, 0);
605        assert_eq!(s.broken_link_count, 0);
606        assert!(s.top_types.is_empty());
607    }
608
609    #[test]
610    fn counts_files_per_layer_and_total() {
611        let (_d, store) = temp_store();
612        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
613        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
614        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
615        // A conclusion record (former wiki-page) lives in the records layer.
616        write_rel(&store, "records/profiles/p.md", &doc("profile", "p"));
617
618        let s = compute(&store).expect("compute");
619        assert_eq!(s.total_files, 4);
620        assert_eq!(s.files_per_layer.get(&Layer::Sources), Some(&2));
621        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&2));
622    }
623
624    #[test]
625    fn ignores_meta_files_and_non_md_and_dotdirs_and_log() {
626        let (_d, store) = temp_store();
627        // Real content.
628        write_rel(&store, "records/contacts/real.md", &doc("contact", "real"));
629        // Meta + non-content that must NOT be counted.
630        write_rel(
631            &store,
632            "records/contacts/index.md",
633            "---\ntype: index\nscope: type-folder\n---\n",
634        );
635        write_rel(&store, "records/contacts/index.jsonl", "{}\n");
636        write_rel(&store, "records/notes.txt", "not markdown\n");
637        // `log/` archive tree under a layer is skipped wholesale.
638        write_rel(&store, "sources/log/2026-04.md", &doc("email", "archived"));
639        // Hidden dir contents are skipped.
640        write_rel(
641            &store,
642            "records/.obsidian/cache.md",
643            &doc("profile", "hidden"),
644        );
645
646        let s = compute(&store).expect("compute");
647        assert_eq!(s.total_files, 1, "only the one real content file counts");
648        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&1));
649        assert_eq!(s.files_per_layer.get(&Layer::Sources), None);
650    }
651
652    #[test]
653    fn total_size_is_sum_of_content_file_bytes() {
654        let (_d, store) = temp_store();
655        let a = doc("email", "a");
656        let b = "---\ntype: contact\nsummary: x\n---\n\nlonger body text here\n".to_string();
657        write_rel(&store, "sources/emails/a.md", &a);
658        write_rel(&store, "records/contacts/b.md", &b);
659        // A skipped file's bytes must not be included.
660        write_rel(
661            &store,
662            "records/contacts/index.md",
663            "---\ntype: index\n---\nbig meta file padding padding\n",
664        );
665
666        let s = compute(&store).expect("compute");
667        let expected = a.len() as u64 + b.len() as u64;
668        assert_eq!(s.total_size_bytes, expected);
669    }
670
671    #[test]
672    fn type_distribution_counts_each_type_value() {
673        let (_d, store) = temp_store();
674        write_rel(&store, "sources/emails/a.md", &doc("email", "a"));
675        write_rel(&store, "sources/emails/b.md", &doc("email", "b"));
676        write_rel(&store, "sources/emails/c.md", &doc("email", "c"));
677        write_rel(&store, "records/contacts/d.md", &doc("contact", "d"));
678        write_rel(&store, "records/proposals/e.md", &doc("proposal", "e"));
679
680        let s = compute(&store).expect("compute");
681        assert_eq!(s.type_distribution.get("email"), Some(&3));
682        assert_eq!(s.type_distribution.get("contact"), Some(&1));
683        assert_eq!(s.type_distribution.get("proposal"), Some(&1));
684        assert_eq!(s.type_distribution.len(), 3);
685    }
686
687    #[test]
688    fn file_without_type_is_counted_in_totals_but_not_distribution() {
689        let (_d, store) = temp_store();
690        // A content file with frontmatter but no `type:` key.
691        write_rel(
692            &store,
693            "records/themes/x.md",
694            "---\nsummary: no type here\n---\n\nbody\n",
695        );
696        // A content file with no frontmatter at all.
697        write_rel(
698            &store,
699            "records/themes/y.md",
700            "just a body, no frontmatter\n",
701        );
702
703        let s = compute(&store).expect("compute");
704        assert_eq!(s.total_files, 2, "untyped files still count toward totals");
705        assert_eq!(s.files_per_layer.get(&Layer::Records), Some(&2));
706        assert!(
707            s.type_distribution.is_empty(),
708            "no type key => no distribution entry, not an empty-string bucket"
709        );
710    }
711
712    #[test]
713    fn top_types_orders_by_count_desc_then_name_asc() {
714        let (_d, store) = temp_store();
715        // contact x3, email x3 (tie), decision x1.
716        write_rel(&store, "records/contacts/c1.md", &doc("contact", "1"));
717        write_rel(&store, "records/contacts/c2.md", &doc("contact", "2"));
718        write_rel(&store, "records/contacts/c3.md", &doc("contact", "3"));
719        write_rel(&store, "sources/emails/e1.md", &doc("email", "1"));
720        write_rel(&store, "sources/emails/e2.md", &doc("email", "2"));
721        write_rel(&store, "sources/emails/e3.md", &doc("email", "3"));
722        write_rel(&store, "records/decisions/d1.md", &doc("decision", "1"));
723
724        let s = compute(&store).expect("compute");
725        assert_eq!(
726            s.top_types,
727            vec![
728                ("contact".to_string(), 3),
729                ("email".to_string(), 3),
730                ("decision".to_string(), 1),
731            ],
732            "ties (contact, email both 3) break by name ascending; decision trails"
733        );
734    }
735
736    #[test]
737    fn top_types_is_capped_at_ten() {
738        let (_d, store) = temp_store();
739        // 12 distinct custom types, each one file.
740        for i in 0..12 {
741            let t = format!("type{i:02}");
742            write_rel(&store, &format!("records/{t}/f.md"), &doc(&t, "x"));
743        }
744        let s = compute(&store).expect("compute");
745        assert_eq!(s.top_types.len(), 10, "top_types caps at 10");
746        assert_eq!(
747            s.type_distribution.len(),
748            12,
749            "distribution keeps all types"
750        );
751    }
752
753    #[test]
754    fn orphans_are_files_with_no_incoming_and_no_outgoing_links() {
755        let (_d, store) = temp_store();
756        // a -> b (a has outgoing, b has incoming). c is isolated => orphan.
757        write_rel(
758            &store,
759            "records/contacts/a.md",
760            "---\ntype: contact\nsummary: a\n---\n\nSee [[records/contacts/b]].\n",
761        );
762        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
763        write_rel(&store, "records/contacts/c.md", &doc("contact", "c"));
764
765        let s = compute(&store).expect("compute");
766        assert_eq!(s.orphan_count, 1, "only c is an orphan");
767    }
768
769    #[test]
770    fn a_file_with_only_a_self_link_is_an_orphan_matching_graph() {
771        let (_d, store) = temp_store();
772        // A file that links only to ITSELF has no real graph edge, so it must be
773        // an orphan — consistent with `graph::orphans` (which skips self-links).
774        write_rel(
775            &store,
776            "records/contacts/solo.md",
777            "---\ntype: contact\nsummary: solo\n---\n\nSee [[records/contacts/solo]].\n",
778        );
779        let s = compute(&store).expect("compute");
780        assert_eq!(
781            s.orphan_count, 1,
782            "a self-only-linking file is an orphan: {s:?}"
783        );
784    }
785
786    #[test]
787    fn a_file_with_only_an_incoming_link_is_not_an_orphan() {
788        let (_d, store) = temp_store();
789        // b has no outgoing links, but a links to it => b is NOT an orphan.
790        // a itself has an outgoing link => also not an orphan. Zero orphans.
791        write_rel(
792            &store,
793            "records/profiles/a.md",
794            "---\ntype: profile\nsummary: a\n---\n\n[[records/profiles/b]]\n",
795        );
796        write_rel(&store, "records/profiles/b.md", &doc("profile", "b"));
797
798        let s = compute(&store).expect("compute");
799        assert_eq!(s.orphan_count, 0);
800    }
801
802    #[test]
803    fn frontmatter_wiki_links_count_as_edges_for_orphans() {
804        let (_d, store) = temp_store();
805        // The link lives in a frontmatter field, not the body. It must still
806        // wire `contact` -> `company`, so neither is an orphan.
807        write_rel(
808            &store,
809            "records/contacts/sarah.md",
810            "---\ntype: contact\nsummary: s\ncompany: [[records/companies/acme]]\n---\n\nbody\n",
811        );
812        write_rel(&store, "records/companies/acme.md", &doc("company", "acme"));
813
814        let s = compute(&store).expect("compute");
815        assert_eq!(
816            s.orphan_count, 0,
817            "a frontmatter wiki-link is a real edge; neither endpoint is orphaned"
818        );
819    }
820
821    #[test]
822    fn broken_links_count_targets_that_do_not_exist() {
823        let (_d, store) = temp_store();
824        // Two links: one to an existing file, one to a missing file.
825        write_rel(
826            &store,
827            "records/profiles/a.md",
828            "---\ntype: profile\nsummary: a\n---\n\n[[records/profiles/b]] and [[records/contacts/ghost]]\n",
829        );
830        write_rel(&store, "records/profiles/b.md", &doc("profile", "b"));
831
832        let s = compute(&store).expect("compute");
833        assert_eq!(s.broken_link_count, 1, "only the ghost target is broken");
834    }
835
836    #[test]
837    fn broken_link_resolves_with_md_extension_stripped() {
838        let (_d, store) = temp_store();
839        // Link written WITH a `.md` extension still resolves to the real file
840        // (the parser accepts `.md`; validate only warns). Not broken.
841        write_rel(
842            &store,
843            "records/profiles/a.md",
844            "---\ntype: profile\nsummary: a\n---\n\n[[records/profiles/b.md]]\n",
845        );
846        write_rel(&store, "records/profiles/b.md", &doc("profile", "b"));
847
848        let s = compute(&store).expect("compute");
849        assert_eq!(
850            s.broken_link_count, 0,
851            "a `.md`-suffixed target resolves to the same node and is not broken"
852        );
853    }
854
855    #[test]
856    fn short_form_links_are_not_broken_and_do_not_wire_the_graph() {
857        let (_d, store) = temp_store();
858        // `[[b]]` is a short-form (no `/`): a validation error elsewhere, but
859        // for stats it neither counts as broken (it doesn't resolve to a node)
860        // nor wires `a` into the graph. So `a` (no other links) is an orphan.
861        write_rel(
862            &store,
863            "records/contacts/a.md",
864            "---\ntype: contact\nsummary: a\n---\n\n[[b]]\n",
865        );
866        write_rel(&store, "records/contacts/b.md", &doc("contact", "b"));
867
868        let s = compute(&store).expect("compute");
869        assert_eq!(
870            s.broken_link_count, 0,
871            "short-form links are not counted as broken by stats"
872        );
873        // a has only a short-form link (not an edge) => orphan. b has no links
874        // and no real incoming edge => orphan. Both orphaned.
875        assert_eq!(s.orphan_count, 2);
876    }
877
878    #[test]
879    fn display_alias_links_resolve_to_the_target_not_the_alias() {
880        let (_d, store) = temp_store();
881        // `[[records/profiles/b|Bob]]` targets b, displays "Bob". The alias must
882        // be stripped: the edge goes to b (exists), so it's not broken and b is
883        // not an orphan.
884        write_rel(
885            &store,
886            "records/profiles/a.md",
887            "---\ntype: profile\nsummary: a\n---\n\nmet [[records/profiles/b|Bob]] today\n",
888        );
889        write_rel(&store, "records/profiles/b.md", &doc("profile", "b"));
890
891        let s = compute(&store).expect("compute");
892        assert_eq!(s.broken_link_count, 0, "alias target resolves and exists");
893        assert_eq!(s.orphan_count, 0, "a links out, b is linked to");
894    }
895
896    #[test]
897    fn duplicate_links_in_one_file_count_broken_per_occurrence() {
898        let (_d, store) = temp_store();
899        // The same missing target twice => two broken-link occurrences.
900        write_rel(
901            &store,
902            "records/profiles/a.md",
903            "---\ntype: profile\nsummary: a\n---\n\n[[records/contacts/ghost]] [[records/contacts/ghost]]\n",
904        );
905        let s = compute(&store).expect("compute");
906        assert_eq!(
907            s.broken_link_count, 2,
908            "broken links count occurrences, not distinct targets"
909        );
910    }
911
912    #[test]
913    fn markdown_links_are_not_treated_as_wiki_links() {
914        let (_d, store) = temp_store();
915        // A standard markdown link to an external URL must not register as a
916        // wiki edge (so this file stays an orphan) nor as a broken link.
917        write_rel(
918            &store,
919            "records/profiles/a.md",
920            "---\ntype: profile\nsummary: a\n---\n\nSee [Acme](https://acme.io/path).\n",
921        );
922        let s = compute(&store).expect("compute");
923        assert_eq!(s.broken_link_count, 0, "markdown links aren't graph edges");
924        assert_eq!(s.orphan_count, 1, "the file has no wiki-links => orphan");
925    }
926
927    #[test]
928    fn regression_non_layer_multi_segment_link_is_not_broken() {
929        // Finding #20: a target like `[[contacts/sarah-chen]]` omits the layer
930        // prefix. It has a `/` but its first segment (`contacts`) is not a
931        // recognized layer, so it's a short-form error in `validate`, NOT a
932        // broken link. stats must agree: it counts neither as broken nor as an
933        // outgoing edge. Pre-fix `is_full_path` (components().count() > 1)
934        // accepted it and reported broken_link_count = 1.
935        let (_d, store) = temp_store();
936        write_rel(
937            &store,
938            "records/contacts/a.md",
939            "---\ntype: contact\nsummary: a\n---\n\nSee [[contacts/sarah-chen]].\n",
940        );
941        let s = compute(&store).expect("compute");
942        assert_eq!(
943            s.broken_link_count, 0,
944            "a non-layer multi-segment target is a short-form error, not broken"
945        );
946        // The non-layer link is not a graph edge, so `a` has no outgoing edge
947        // and is an orphan — matching how validate/graph treat it.
948        assert_eq!(
949            s.orphan_count, 1,
950            "the non-layer link does not wire `a` out of orphan status"
951        );
952    }
953
954    #[test]
955    fn regression_wiki_links_in_code_fences_are_ignored() {
956        // Finding #21: a wiki-link that appears only inside a fenced code block
957        // is illustrative syntax, not a graph edge. validate skips fenced
958        // regions; stats must too. Pre-fix the regex ran over the whole file
959        // with no fence tracking, so the fenced ghost link inflated
960        // broken_link_count to 1 and the fenced real link un-orphaned the page.
961        let (_d, store) = temp_store();
962        // A howto page whose ONLY wiki-links live inside ``` and ~~~ fences:
963        // one to a missing target, one to an existing target.
964        write_rel(
965            &store,
966            "records/synthesis/howto.md",
967            "---\ntype: synthesis\nsummary: howto\n---\n\
968             \nWrite links like this:\n\
969             \n```\n[[records/contacts/ghost]]\n```\n\
970             \nor this:\n\
971             \n~~~\n[[records/synthesis/real]]\n~~~\n",
972        );
973        write_rel(
974            &store,
975            "records/synthesis/real.md",
976            &doc("synthesis", "real"),
977        );
978        let s = compute(&store).expect("compute");
979        assert_eq!(
980            s.broken_link_count, 0,
981            "a `[[...]]` inside a code fence is not a real (broken) edge"
982        );
983        // howto has no real edges => orphan. real is not linked-to by any real
984        // edge => orphan. Both orphaned (2), proving the fenced link to `real`
985        // did not wire either file out of orphan status.
986        assert_eq!(
987            s.orphan_count, 2,
988            "fenced wiki-links do not wire files out of orphan status: {s:?}"
989        );
990    }
991
992    #[test]
993    fn a_link_to_an_existing_file_in_another_layer_resolves() {
994        let (_d, store) = temp_store();
995        // A records-layer profile links to a source file in the other layer;
996        // cross-layer full-path links resolve like any other.
997        write_rel(
998            &store,
999            "records/profiles/a.md",
1000            "---\ntype: profile\nsummary: a\n---\n\nfrom [[sources/emails/2026/05/m]]\n",
1001        );
1002        write_rel(&store, "sources/emails/2026/05/m.md", &doc("email", "m"));
1003
1004        let s = compute(&store).expect("compute");
1005        assert_eq!(s.broken_link_count, 0);
1006        assert_eq!(s.orphan_count, 0, "both endpoints are wired");
1007    }
1008
1009    #[test]
1010    fn regression_tilde_line_inside_backtick_fence_does_not_invert_state() {
1011        // Finding #44/#11: a `~~~` line inside an open ``` fence (or any inner
1012        // fence of the other char / a shorter run) must NOT close the block.
1013        // Pre-fix a single boolean toggled on it, inverting fence state so the
1014        // fenced ghost link counted broken and the real link after the fence
1015        // was dropped. With (byte, run-length) tracking the block only closes on
1016        // a matching ``` fence.
1017        let (_d, store) = temp_store();
1018        write_rel(&store, "records/profiles/bob.md", &doc("profile", "bob"));
1019        // ```text … ~~~ x (inner tilde line) … [[ghost]] … ``` then a real link.
1020        write_rel(
1021            &store,
1022            "records/concepts/howto.md",
1023            "---\ntype: concept\nsummary: howto\n---\n\
1024             \n```text\n~~~ x\n[[records/profiles/ghost]]\n```\n\
1025             \nReal: [[records/profiles/bob]]\n",
1026        );
1027
1028        let s = compute(&store).expect("compute");
1029        assert_eq!(
1030            s.broken_link_count, 0,
1031            "the fenced ghost link is inside the unbroken ``` block, not broken: {s:?}"
1032        );
1033        // bob is linked from howto (a real edge after the fence closes), and
1034        // howto links out — neither is an orphan.
1035        assert_eq!(
1036            s.orphan_count, 0,
1037            "the real post-fence link wires both files: {s:?}"
1038        );
1039    }
1040
1041    #[test]
1042    fn regression_frontmatter_code_fence_does_not_swallow_body_links() {
1043        // A stray code-fence line INSIDE the frontmatter (here an unbalanced ```
1044        // in a YAML block-scalar value) must not leak fenced-code state into the
1045        // body scan. Pre-fix `extract_link_targets` scanned the whole file with a
1046        // single fence tracker, so the frontmatter ``` opened a fence that
1047        // swallowed every later body `[[...]]`, dropping the real edge and
1048        // mis-marking both endpoints as orphans. The fix splits frontmatter from
1049        // body and starts the body fence tracking fresh.
1050        let (_d, store) = temp_store();
1051        write_rel(
1052            &store,
1053            "records/contacts/alice.md",
1054            &doc("contact", "alice"),
1055        );
1056        // A profile whose frontmatter carries a wiki-link field AND an unbalanced
1057        // ``` line, then a REAL body link plus a genuinely-fenced body link.
1058        write_rel(
1059            &store,
1060            "records/profiles/note.md",
1061            "---\ntype: profile\nsummary: note\n\
1062             refs: \"[[records/contacts/alice]]\"\n\
1063             field: |\n  start of a fence\n  ```\n  never closed in frontmatter\n---\n\
1064             \nReal: [[records/contacts/alice]]\n\
1065             \n```\n[[records/contacts/ghost]]\n```\n",
1066        );
1067
1068        let s = compute(&store).expect("compute");
1069        // The frontmatter ``` no longer hides the body link to alice, and the
1070        // body-fenced ghost link is still ignored (not broken). If the leak
1071        // returned, the body link would be dropped and the ghost would surface.
1072        assert_eq!(
1073            s.broken_link_count, 0,
1074            "the genuinely body-fenced ghost link is not a broken edge, \
1075             and the frontmatter fence did not surface it: {s:?}"
1076        );
1077        // alice is linked from note (via both the frontmatter `refs:` edge and
1078        // the real body link), and note links out — neither is an orphan. Pre-fix
1079        // the frontmatter fence dropped the body link and the orphan count was 2.
1080        assert_eq!(
1081            s.orphan_count, 0,
1082            "the body link survives the frontmatter code fence and wires both files: {s:?}"
1083        );
1084    }
1085
1086    #[test]
1087    fn regression_nested_log_directory_is_counted_not_skipped() {
1088        // Finding #45: only the layer's IMMEDIATE `log/` archive is skipped. A
1089        // directory named `log` nested under a type-folder is ordinary content
1090        // and must be counted, matching tree/index/query. Pre-fix any `log` dir
1091        // at any depth was pruned, making the whole subtree invisible to stats.
1092        let (_d, store) = temp_store();
1093        write_rel(
1094            &store,
1095            "sources/emails/log/maillog.md",
1096            &doc(
1097                "email",
1098                "an archived mail log entry under a log subdirectory",
1099            ),
1100        );
1101        // The layer-immediate `log/` archive is still skipped.
1102        write_rel(&store, "sources/log/2026-04.md", &doc("email", "rotated"));
1103
1104        let s = compute(&store).expect("compute");
1105        assert_eq!(
1106            s.total_files, 1,
1107            "the nested sources/emails/log file counts; the layer-immediate sources/log is skipped: {s:?}"
1108        );
1109        assert_eq!(s.files_per_layer.get(&Layer::Sources), Some(&1));
1110        assert_eq!(s.type_distribution.get("email"), Some(&1));
1111    }
1112
1113    #[test]
1114    fn regression_link_to_existing_non_md_source_is_a_live_edge() {
1115        // Finding (high): a record that wiki-links to an existing non-`.md`
1116        // source artifact (a `.eml`) must read as a LIVE edge, not broken, and
1117        // the record is not an orphan. `sources/` holds such files by design.
1118        let (_d, store) = temp_store();
1119        // A real .eml source file (not a .md content file).
1120        write_rel(
1121            &store,
1122            "sources/emails/msg.eml",
1123            "From: someone@example.com\nSubject: Renewal\n\nBody text.\n",
1124        );
1125        // A record with the SPEC-canonical bare link to that source.
1126        write_rel(
1127            &store,
1128            "records/contacts/sarah.md",
1129            "---\ntype: contact\nsummary: s\n---\n\nLinked source: [[sources/emails/msg]]\n",
1130        );
1131
1132        let s = compute(&store).expect("compute");
1133        assert_eq!(
1134            s.broken_link_count, 0,
1135            "a link to an existing .eml source is live, not broken: {s:?}"
1136        );
1137        assert_eq!(
1138            s.orphan_count, 0,
1139            "the linking record has a resolvable outgoing edge to the source: {s:?}"
1140        );
1141        // The explicit-extension form resolves the same way.
1142        write_rel(
1143            &store,
1144            "records/contacts/sarah.md",
1145            "---\ntype: contact\nsummary: s\n---\n\nLinked source: [[sources/emails/msg.eml]]\n",
1146        );
1147        let s2 = compute(&store).expect("compute");
1148        assert_eq!(s2.broken_link_count, 0, "explicit .eml target resolves too");
1149        assert_eq!(s2.orphan_count, 0);
1150    }
1151
1152    #[test]
1153    fn regression_traversal_target_is_broken_not_a_filesystem_escape() {
1154        // SECURITY regression: a `..`-laden wiki-link target must never turn a
1155        // stats probe into a read of a file OUTSIDE the store. Pre-fix
1156        // `target_resolves_on_disk` joined the raw target onto the store root and
1157        // probed `is_file` / `read_dir` with no containment check, so
1158        // `[[sources/../../outside-secret]]` reached a file above the store and
1159        // was silently counted as a LIVE edge (un-orphaning the linker and never
1160        // counted broken) — diverging from validate (which flags it
1161        // WIKI_LINK_BROKEN) and graph (which drops it). The gate now rejects any
1162        // non-`Normal` component before any join, so it counts broken.
1163        // Nested store: `store.root.parent()` is this test's private tempdir,
1164        // never the shared `$TMPDIR` (which the sibling traversal test would also
1165        // write into, racing on the same filename under `--workspace`).
1166        let (_d, store) = temp_store_nested();
1167        // Every store has a `sources/` dir; the traversal needs its first
1168        // component to be a recognized layer to pass `is_full_path`.
1169        fs::create_dir_all(store.root.join("sources/emails")).unwrap();
1170        // Plant a secret ABOVE the store root (the parent of the store dir).
1171        let outside_dir = store.root.parent().expect("store has a parent");
1172        fs::write(outside_dir.join("outside-secret.txt"), "TOP SECRET\n").unwrap();
1173
1174        // Bare-stem traversal (would hit the `read_dir` parent branch) and the
1175        // explicit-extension traversal (would hit the `is_file` literal branch).
1176        for target in [
1177            "sources/../../outside-secret",
1178            "sources/../../outside-secret.txt",
1179        ] {
1180            write_rel(
1181                &store,
1182                "records/contacts/a.md",
1183                &format!("---\ntype: contact\nsummary: s\n---\n\nEscape: [[{target}]]\n"),
1184            );
1185            let s = compute(&store).expect("compute");
1186            assert_eq!(
1187                s.broken_link_count, 1,
1188                "a `..` target escaping the store must be broken, not a live edge ({target}): {s:?}"
1189            );
1190            assert_eq!(
1191                s.orphan_count, 1,
1192                "an escaping link must NOT wire the linker out of orphan status ({target}): {s:?}"
1193            );
1194        }
1195        // The secret outside the store is untouched (we never followed the link).
1196        assert_eq!(
1197            fs::read_to_string(outside_dir.join("outside-secret.txt")).unwrap(),
1198            "TOP SECRET\n"
1199        );
1200    }
1201
1202    #[test]
1203    fn regression_target_resolves_on_disk_rejects_traversal_before_any_probe() {
1204        // SECURITY regression at the helper level: `target_resolves_on_disk`
1205        // must return `false` for any `..`-laden / absolute / prefix target
1206        // BEFORE it joins, `is_file`s, or `read_dir`s — so a wiki-link can never
1207        // turn a stats existence-probe into a read of a file OUTSIDE the store.
1208        // Pre-fix the helper joined the raw target onto the store root with no
1209        // containment gate, so a real file above the store made it return
1210        // `true`. This asserts the gate directly on the helper (the end-to-end
1211        // `compute()` path is covered separately above), exercising BOTH on-disk
1212        // branches: the literal `is_file` branch (explicit extension) and the
1213        // bare-stem `read_dir` branch.
1214        // Nested store: `store.root.parent()` is this test's private tempdir, so
1215        // the "above the store" files below never land in the shared `$TMPDIR`
1216        // and can never collide with the sibling traversal test's identically
1217        // named planted files when both run in parallel.
1218        let (_d, store) = temp_store_nested();
1219        // A real `sources/` tree exists (the literal/parent joins would have
1220        // something to land near), matching a real store.
1221        fs::create_dir_all(store.root.join("sources/emails")).unwrap();
1222        // Plant matching files ABOVE the store root: one with the exact name the
1223        // explicit-extension target points at, and one whose stem the bare-stem
1224        // target would discover via `read_dir` of the (escaped) parent dir.
1225        let outside_dir = store.root.parent().expect("store has a parent");
1226        fs::write(outside_dir.join("outside-secret.txt"), "TOP SECRET\n").unwrap();
1227        fs::write(outside_dir.join("outside-secret.eml"), "secret mail\n").unwrap();
1228
1229        // Explicit-extension traversal -> would hit the literal `is_file` branch.
1230        assert!(
1231            !target_resolves_on_disk(
1232                &store.root,
1233                &strip_md(Path::new("sources/../../outside-secret.txt"))
1234            ),
1235            "an explicit-extension `..` target escaping the store must not resolve on disk"
1236        );
1237        // Bare-stem traversal -> would hit the `read_dir(parent)` branch, where a
1238        // sibling `outside-secret.eml` (non-`.md`) sits beside the escaped parent.
1239        assert!(
1240            !target_resolves_on_disk(
1241                &store.root,
1242                &strip_md(Path::new("sources/../../outside-secret"))
1243            ),
1244            "a bare-stem `..` target escaping the store must not resolve on disk"
1245        );
1246        // A `..` that stays nominally under a layer prefix is still an escape and
1247        // is rejected before any probe.
1248        assert!(
1249            !target_resolves_on_disk(&store.root, Path::new("records/../records/secret")),
1250            "any `..` component is rejected before a probe, even one re-entering a layer"
1251        );
1252
1253        // Sanity: a legitimate in-store non-`.md` source DOES still resolve, so
1254        // the gate did not over-reject and break the finding #117 behavior.
1255        write_rel(
1256            &store,
1257            "sources/emails/msg.eml",
1258            "From: a@b.com\nSubject: x\n\nbody\n",
1259        );
1260        assert!(
1261            target_resolves_on_disk(&store.root, Path::new("sources/emails/msg")),
1262            "a legitimate in-store bare-stem source link still resolves on disk"
1263        );
1264
1265        // The secrets outside the store are untouched (we never followed a link).
1266        assert_eq!(
1267            fs::read_to_string(outside_dir.join("outside-secret.txt")).unwrap(),
1268            "TOP SECRET\n"
1269        );
1270    }
1271
1272    #[test]
1273    fn regression_link_to_truly_missing_source_is_still_broken() {
1274        // Guard the source-resolution fix doesn't over-resolve: a bare link
1275        // whose target has NO file of any extension on disk is still broken.
1276        let (_d, store) = temp_store();
1277        write_rel(
1278            &store,
1279            "records/contacts/sarah.md",
1280            "---\ntype: contact\nsummary: s\n---\n\nLinked: [[sources/emails/missing]]\n",
1281        );
1282        let s = compute(&store).expect("compute");
1283        assert_eq!(
1284            s.broken_link_count, 1,
1285            "a target with no on-disk file in any form is broken: {s:?}"
1286        );
1287    }
1288}