Skip to main content

dbmd_core/
store.rs

1//! `store` — walk, locate, and shard a db.md store.
2//!
//! A db.md store is one directory marked by an uppercase `DB.md` at its root.
//! [`Store::open_strict`] is the gate normal store-walking subcommands go
//! through, and [`Store::open`] is its lenient twin for `dbmd validate`; in
//! both, a missing `DB.md` is the [`NotAStore`] error (`NOT_A_STORE`). The
//! toolkit never guesses a store root.
7//!
8//! Scale discipline lives here: [`Store::walk`] and the layer/type-folder
9//! walks are **SWEEP** primitives used only by `validate --all`,
10//! `index rebuild`, and `stats`. The interactive loop instead uses
11//! [`Store::find_links_to`] / [`Store::find_links_to_any`] (a single
12//! presence-only content scan) and the `index.jsonl` sidecar readers
13//! ([`Store::find_by_type`] / [`Store::find_by_where`] /
14//! [`Store::read_type_index`]) — never a whole-store parse. The batch
15//! [`Store::find_links_to_any`] is what keeps the working-set validate's
16//! incoming-linker discovery a single store scan rather than one scan per
17//! changed object.
18//!
19//! Link edges are defined once, here, by the shared [`extract_edge_targets`] /
20//! [`canonical_link_target`] / [`link_edge_key`] helpers (fence-aware,
21//! whitespace-trimmed, case-folded to the filesystem), so the forward view
22//! (`graph::forwardlinks`), the backward view ([`Store::find_links_to_any`]),
23//! `rename`, and `validate` all agree on exactly which `[[...]]` is an edge.
24//! [`ensure_path_within_store`] is the within-store containment gate every
25//! caller-influenced path passes through before it is read or traversed.
26
27use std::collections::BTreeMap;
28use std::path::{Path, PathBuf};
29use std::time::{SystemTime, UNIX_EPOCH};
30
31use chrono::{DateTime, Datelike, FixedOffset};
32use ignore::WalkBuilder;
33
34use crate::index::IndexRecord;
35use crate::parser::{parse_db_md, Config, Frontmatter};
36
/// Basenames that are never content files inside a layer: currently just the
/// curator-maintained `index.md` catalog. The store's content walks skip these
/// so a SWEEP over the content layers never mistakes a catalog for a record.
///
/// Only `index.md` is excluded by basename, because the content walks traverse
/// the layer dirs (`sources/`/`records/`) and `index.md` is the only
/// meta file that appears INSIDE them. The root `DB.md` / `log.md` (and the
/// `log/` archive) live at the store root, outside every layer, so they are
/// never reached by these walks — and a content file that merely happens to be
/// named `DB.md` or `log.md` inside a layer (e.g. `records/docs/DB.md`) is real
/// content the SPEC does NOT reserve at type-folder depth.
const NON_CONTENT_BASENAMES: [&str; 1] = ["index.md"];
49
/// File name of the per-type-folder machine-readable sidecar (`index.jsonl`),
/// the complete "machine twin" that backs every structured read.
const TYPE_INDEX_FILE: &str = "index.jsonl";
52
/// Returned when a path is opened as a store but has no `DB.md` at its root.
/// Surfaced as the structured code `NOT_A_STORE` with a non-zero exit.
///
/// Produced by both [`Store::open`] and [`Store::open_strict`] when
/// [`Store::is_db_md_store`] rejects the path.
#[derive(Debug, thiserror::Error)]
#[error("not a db.md store: {path} has no DB.md")]
pub struct NotAStore {
    /// The path that was inspected (as given by the caller, not canonicalized).
    pub path: PathBuf,
}
61
/// Errors from store-level operations (walk, locate, shard, sidecar read).
#[derive(Debug, thiserror::Error)]
pub enum StoreError {
    /// A sidecar `index.jsonl` could not be read or parsed. For parse
    /// failures `message` carries the 1-based line number of the bad line.
    #[error("failed to read type index {path}: {message}")]
    BadTypeIndex {
        /// The sidecar file.
        path: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// A required date field for sharding was absent or unparseable, and there
    /// was no usable fallback.
    #[error("cannot compute shard path for {file}: no usable date field")]
    NoShardDate {
        /// The file being placed (the un-sharded `<folder>/<name>` path).
        file: PathBuf,
    },

    /// A store scan failed to start or run: a directory-walk error, or a file
    /// read failure during the incoming-link content scan.
    #[error("search failed under {root}: {message}")]
    Search {
        /// The root the scan ran under.
        root: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// An underlying I/O failure.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
95
/// The two canonical layers of a db.md store.
///
/// `Ord`/`PartialOrd` are derived (additively) because sibling modules key
/// `BTreeMap`s on `Layer` (e.g. `stats::Stats::files_per_layer`); the canonical
/// declaration order (`Sources` < `Records`) is the sort order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// `sources/` — raw evidence (documentary + testimonial); immutable; date-sharded at scale.
    Sources,
    /// `records/` — everything the agent authors; meta-typed fact/operational/conclusion; entity types flat, event types sharded.
    Records,
}

impl Layer {
    /// Name of the directory that holds this layer on disk: `"sources"` or
    /// `"records"`.
    pub fn dir_name(self) -> &'static str {
        match self {
            Self::Sources => "sources",
            Self::Records => "records",
        }
    }

    /// Inverse of [`Layer::dir_name`]: the layer whose folder is exactly
    /// `name` (case-sensitive), or `None` when no layer uses that name.
    pub fn from_dir_name(name: &str) -> Option<Self> {
        Self::all()
            .iter()
            .copied()
            .find(|layer| layer.dir_name() == name)
    }

    /// All layers in canonical (declaration) order.
    pub fn all() -> [Layer; 2] {
        [Self::Sources, Self::Records]
    }
}
132
/// An opened db.md store: its root path plus the parsed `DB.md` [`Config`].
///
/// Construct via [`Store::open_strict`] (normal commands) or the lenient
/// [`Store::open`] (validation); both check the `DB.md` marker first, so
/// downstream code can assume a real store. Note that the lenient path may
/// carry a default [`Config`] when `DB.md` could not be read or parsed.
#[derive(Debug, Clone)]
pub struct Store {
    /// The store root (the directory containing `DB.md`).
    pub root: PathBuf,
    /// The parsed `DB.md` config (agent instructions, policies, schemas).
    pub config: Config,
}
144
145impl Store {
146    /// True if `path` is a db.md store root: an uppercase `DB.md` file exists
147    /// at `path`. On case-sensitive filesystems a lowercase `db.md` must NOT
148    /// count (the lowercase name refers to the project/spec, not the marker).
149    pub fn is_db_md_store(path: &Path) -> bool {
150        // Read the directory and match the *stored* filename byte-for-byte.
151        // `path.join("DB.md").exists()` would lie on a case-insensitive
152        // filesystem (macOS default), where a lowercase `db.md` answers a
153        // `DB.md` probe. `read_dir` returns the real on-disk name, so the
154        // exact-match check is correct on both case-sensitive (Linux) and
155        // case-insensitive filesystems.
156        let entries = match std::fs::read_dir(path) {
157            Ok(entries) => entries,
158            Err(_) => return false,
159        };
160        for entry in entries.flatten() {
161            if entry.file_name() == "DB.md" {
162                // A directory literally named `DB.md` is not the marker.
163                match entry.file_type() {
164                    Ok(ft) if ft.is_dir() => return false,
165                    Ok(_) => return true,
166                    Err(_) => return false,
167                }
168            }
169        }
170        false
171    }
172
173    /// Open `path` as a db.md store and require `DB.md` to be readable and
174    /// parseable. Normal commands should enter through this strict gate so a
175    /// damaged config cannot silently disable schema or policy rules.
176    pub fn open_strict(path: &Path) -> crate::Result<Store> {
177        if !Store::is_db_md_store(path) {
178            return Err(NotAStore {
179                path: path.to_path_buf(),
180            }
181            .into());
182        }
183        let db_md = path.join("DB.md");
184        let text = std::fs::read_to_string(&db_md)?;
185        let config = parse_db_md(&text, &db_md)?;
186        Ok(Store {
187            root: path.to_path_buf(),
188            config,
189        })
190    }
191
192    /// Open `path` as a db.md store: confirm the `DB.md` marker (else
193    /// [`NotAStore`]) and parse the `DB.md` config when possible. This is the
194    /// lenient validation-oriented open path: a damaged `DB.md` still marks the
195    /// directory as a store so `dbmd validate` can report the config error as an
196    /// issue. Normal CLI commands should use [`Store::open_strict`] instead.
197    pub fn open(path: &Path) -> Result<Store, NotAStore> {
198        if !Store::is_db_md_store(path) {
199            return Err(NotAStore {
200                path: path.to_path_buf(),
201            });
202        }
203        let db_md = path.join("DB.md");
204        // The marker exists; parse its config. A read or parse failure leaves
205        // the store openable with default config rather than masquerading as
206        // NOT_A_STORE — the marker is present, so this *is* a store; a damaged
207        // DB.md is `dbmd validate`'s job to report, not `open`'s.
208        let config = match std::fs::read_to_string(&db_md) {
209            Ok(text) => parse_db_md(&text, &db_md).unwrap_or_default(),
210            Err(_) => Config::default(),
211        };
212        Ok(Store {
213            root: path.to_path_buf(),
214            config,
215        })
216    }
217
218    /// **SWEEP.** Recursively iterate every `.md` content file across
219    /// `sources/` and `records/`, skipping hidden dirs and `log/`.
220    /// Used only by `validate --all`, `index rebuild`, and `stats` — never on
221    /// the interactive loop.
222    pub fn walk(&self) -> Result<Vec<PathBuf>, StoreError> {
223        // Only the three content layers — never root meta files (`DB.md`,
224        // `index.md`, `log.md`) and never `log/`, which live at root and are
225        // outside every layer dir.
226        let mut out = Vec::new();
227        for layer in Layer::all() {
228            out.extend(self.walk_layer(layer)?);
229        }
230        out.sort();
231        Ok(out)
232    }
233
234    /// **SWEEP.** Like [`Store::walk`] but scoped to a single layer.
235    pub fn walk_layer(&self, layer: Layer) -> Result<Vec<PathBuf>, StoreError> {
236        let layer_root = self.root.join(layer.dir_name());
237        if !layer_root.is_dir() {
238            return Ok(Vec::new());
239        }
240        self.walk_content_md(&layer_root)
241    }
242
243    /// Enumerate every `.md` file in a single type-folder, **recursing through
244    /// its date-shards** (`sources/emails/**/*.md`). The unit the index builder
245    /// and per-folder rebuild operate on. SWEEP-class (scoped to one folder).
246    pub fn walk_type_folder(&self, type_folder: &Path) -> Result<Vec<PathBuf>, StoreError> {
247        let abs = self.resolve_under_root(type_folder);
248        if !abs.is_dir() {
249            return Ok(Vec::new());
250        }
251        self.walk_content_md(&abs)
252    }
253
254    /// The ≤`n` most-recent files in a type-folder by frontmatter `updated`
255    /// (descending), ties broken by store-relative path (ascending) — a total
256    /// order, so write-through and rebuild never disagree on #500 vs #501.
257    ///
258    /// Reads `updated` across the folder's shards — a SWEEP cost absorbed into
259    /// `index rebuild`. The write-through path never calls this. The
260    /// cap-selection primitive for the 500-entry `index.md` browse view.
261    pub fn recent_in_type_folder(
262        &self,
263        type_folder: &Path,
264        n: usize,
265    ) -> Result<Vec<PathBuf>, StoreError> {
266        let files = self.walk_type_folder(type_folder)?;
267        // (updated, rel-path) for each file. Files missing/unparseable
268        // `updated` sort *after* dated ones (None last), then by path — so they
269        // are deterministically the lowest-priority candidates for the cap, not
270        // dropped silently. The total order (updated desc, path asc) is what
271        // keeps write-through and rebuild agreeing on #500 vs #501.
272        let mut keyed: Vec<(Option<DateTime<FixedOffset>>, PathBuf)> = files
273            .into_iter()
274            .map(|rel| {
275                let updated = self.read_updated(&self.abs_path(&rel));
276                (updated, rel)
277            })
278            .collect();
279        keyed.sort_by(|a, b| {
280            // `updated` descending: newest first. `None` is treated as the
281            // oldest possible, so dated files always win a cap slot over
282            // undated ones.
283            let by_updated = b.0.cmp(&a.0);
284            by_updated.then_with(|| a.1.cmp(&b.1))
285        });
286        keyed.truncate(n);
287        Ok(keyed.into_iter().map(|(_, rel)| rel).collect())
288    }
289
290    /// The shard/flat predicate: true if the type date-shards, false if it
291    /// stays flat. True for source types and event record types
292    /// (`expense`/`invoice`/`meeting` + custom `order`/`ticket`/`transaction`),
293    /// or when `DB.md ## Schemas` declares `shard: by-date`. False for
294    /// dedup-bounded entity types (`contact`/`company`/`decision`) and
295    /// conclusion records (`profile`/`concept`/`synthesis`).
296    pub fn type_shards(&self, type_: &str) -> bool {
297        // A `DB.md ## Schemas` `### <type>` block with a `shard:` directive is
298        // authoritative — it is the v0.2 generic-model way to declare sharding,
299        // so it overrides the built-in default below (in either direction).
300        if let Some(shard) = self.config.schemas.get(type_).and_then(|s| s.shard) {
301            return shard;
302        }
303        // Built-in default for the example types. Sharding is a property of the
304        // *type*:
305        //  - source types carry a primary date field and shard;
306        //  - event record types track business volume and shard;
307        //  - dedup-bounded entity types and curation-bounded conclusion
308        //    records (`profile`/`concept`/`synthesis`) stay flat.
309        // Any type can override this via a `shard:` directive (above).
310        matches!(
311            type_,
312            // source types (documentary + testimonial)
313            "email" | "transcript" | "pdf-source" | "note"
314            // event record types (canonical)
315            | "expense" | "invoice" | "meeting"
316            // event record types (recognized custom, per the plan)
317            | "order" | "ticket" | "transaction"
318        )
319    }
320
321    /// Compute the canonical write path for a new file. For a sharding type
322    /// (per [`Store::type_shards`]) insert `<YYYY>/<MM>/` from the type's
323    /// primary date field (`email.date`, `expense.date`, … fallback `created`)
324    /// under the type folder; flat types (entity + conclusion records) get no
325    /// shard segment.
326    /// Deterministic + stable: same input → same path, so a record never moves
327    /// once written.
328    pub fn shard_path_for(
329        &self,
330        type_: &str,
331        frontmatter: &Frontmatter,
332        name: &str,
333    ) -> Result<PathBuf, StoreError> {
334        self.shard_path_in(&default_type_folder(type_), type_, frontmatter, name)
335    }
336
337    /// Like [`Store::shard_path_for`], but compute the path under an explicit,
338    /// caller-resolved type-folder rather than the canonical default. This lets a
339    /// write surface honour an agent-supplied conforming sub-folder — e.g. a
340    /// conclusion record filed under `records/profiles/`, `records/concepts/`, or
341    /// `records/synthesis/` (a conclusion record may be filed under ANY
342    /// `records/<folder>/`, not only its canonical one) — while still applying
343    /// date-sharding for sharding types. The folder must be a conforming
344    /// `<layer>/<type-folder>` (2
345    /// components, recognized layer); the caller is responsible for that (see the
346    /// CLI's `resolve_write_path`), so it is taken as given here.
347    ///
348    /// Sharding is still a property of the *type*: a sharding type gets the
349    /// `<YYYY>/<MM>` segment under `folder`; a flat type lands directly in it.
350    pub fn shard_path_in(
351        &self,
352        folder: &Path,
353        type_: &str,
354        frontmatter: &Frontmatter,
355        name: &str,
356    ) -> Result<PathBuf, StoreError> {
357        let folder = folder.to_path_buf();
358        let filename = ensure_md_extension(name);
359
360        if !self.type_shards(type_) {
361            // Flat type (entity records, conclusion records, decisions): no
362            // shard segment.
363            return Ok(folder.join(filename));
364        }
365
366        // Sharding type: derive <YYYY>/<MM> from the primary date field, with
367        // `created` as the universal fallback. Reading the public `Frontmatter`
368        // fields directly (typed `created`/`updated` + raw `extra`) avoids the
369        // not-yet-implemented `Frontmatter::get`/`parse` and keeps this pure.
370        let (year, month) = self
371            .primary_shard_segment(type_, frontmatter)
372            .ok_or_else(|| StoreError::NoShardDate {
373                file: folder.join(&filename),
374            })?;
375
376        Ok(folder.join(year).join(month).join(filename))
377    }
378
379    /// Find files with an incoming wiki-link to `target` via a **single
380    /// presence-only content scan** for an edge to `target` across all layers,
381    /// using the shared fence-aware/whitespace-trimmed/case-folded edge notion
382    /// ([`extract_edge_targets`]). Loop-fast; no whole-graph build. Returns
383    /// store-relative paths.
384    pub fn find_links_to(&self, target: &Path) -> Result<Vec<PathBuf>, StoreError> {
385        // A single target is just the degenerate batch case — one key, one store
386        // scan. Routing through `find_links_to_any` keeps the
387        // pattern construction and the scan loop in exactly one place. The
388        // batch API takes `&[PathBuf]`, so the one-element slice is owned (a
389        // single alloc on this single-target convenience path; the batch path
390        // validate.rs rides is untouched).
391        self.find_links_to_any(&[target.to_path_buf()])
392    }
393
394    /// Find every file with an incoming wiki-link to **any** of `targets`, in a
395    /// **single content pass** over the store (one `.md` walk, one presence-only
396    /// edge scan per file). This is the batch incoming-linker finder the
397    /// working-set [`crate::validate::validate_working_set`] sits on: it must find
398    /// the linkers for the *whole* changed set without paying a full store read
399    /// per changed object. Cost is therefore one store scan (O(store)), NOT
400    /// `targets.len() × store` — calling [`find_links_to`](Self::find_links_to)
401    /// in a loop would reread every `.md` once per target and is the exact
402    /// `O(changed × store)` blow-up this method exists to prevent. Returns
403    /// store-relative paths (deduped, sorted).
404    ///
405    /// **One edge notion with `forwardlinks`/`rename`/`validate`.** A file links
406    /// to a target iff [`extract_edge_targets`] (fence-aware, whitespace-trimmed)
407    /// of its content yields a target whose [`link_edge_key`] equals the target's
408    /// — the *same* definition the forward view and the rename rewriter use. The
409    /// previous implementation used a literal-adjacency ripgrep regex that (a)
410    /// matched `[[...]]` text inside fenced code examples (which validate treats
411    /// as non-edges), (b) missed inner-whitespace padding (`[[ x ]]`), and (c)
412    /// compared case-sensitively even where the filesystem resolves links
413    /// case-insensitively — so backlinks/links/rename silently disagreed with
414    /// forwardlinks and validate. Reading content and routing through the shared
415    /// extractor removes all three divergences.
416    ///
417    /// Why content scan and not the sidecar `links` field: the sidecar projects
418    /// only the frontmatter `links:` array, so it misses edges written in the
419    /// body or in typed fields (`company: [[…]]`). Finding an incoming link to an
420    /// arbitrary path therefore requires reading file content.
421    pub fn find_links_to_any(&self, targets: &[PathBuf]) -> Result<Vec<PathBuf>, StoreError> {
422        // Build the set of comparison keys for the requested targets, in the
423        // canonical (case-folded where the filesystem is case-insensitive) form
424        // the edge extractor emits. An empty key (a target that renders to no
425        // link text, e.g. `""` or `"./"`) contributes nothing — and crucially the
426        // empty set short-circuits below so we never report every file.
427        let want: std::collections::HashSet<String> = targets
428            .iter()
429            .filter_map(|t| {
430                let canonical = canonical_link_target(&t.to_string_lossy());
431                if canonical.is_empty() {
432                    None
433                } else {
434                    Some(link_edge_key(&canonical))
435                }
436            })
437            .collect();
438        if want.is_empty() {
439            return Ok(Vec::new());
440        }
441
442        let mut hits = std::collections::BTreeSet::new();
443        // Scan every `.md` file in the store (skip hidden + `log/`), including
444        // `index.md` catalogs — an incoming reference is wherever the link text
445        // lives; the caller decides relevance. ONE walk for the whole target set;
446        // per file we stop at the first matching edge (presence is all we need),
447        // so a file that links to several targets is read once, not once per
448        // target.
449        for rel in self.walk_all_md()? {
450            let abs = self.abs_path(&rel);
451            // Read lossily: a `.md` verbatim-ingested into `sources/` can carry a
452            // stray non-UTF-8 byte (a mis-decoded Latin-1 import). Decoding
453            // lossily substitutes replacement characters instead of erroring, so
454            // one bad byte on a link-bearing line no longer aborts the whole
455            // store scan (the historical `UTF8`-sink failure). The link syntax is
456            // ASCII, so a replacement char elsewhere on the line never hides a
457            // `[[...]]`. A read error (not a decode error) is genuine I/O trouble
458            // and propagates.
459            let bytes = match std::fs::read(&abs) {
460                Ok(b) => b,
461                Err(e) => {
462                    return Err(StoreError::Search {
463                        root: self.root.clone(),
464                        message: format!("read failed in {}: {e}", abs.display()),
465                    })
466                }
467            };
468            let text = String::from_utf8_lossy(&bytes);
469            for target in extract_edge_targets(&text) {
470                if want.contains(&link_edge_key(&target)) {
471                    hits.insert(rel);
472                    break;
473                }
474            }
475        }
476        Ok(hits.into_iter().collect())
477    }
478
479    /// Candidate set for a `type` query: read every type-folder `index.jsonl`
480    /// sidecar in the type's single layer and return the records of that
481    /// `type`. Complete and cold-cache-proof — NOT a walk-and-parse or a
482    /// frontmatter ripgrep scan, and **never a store-wide read**.
483    ///
484    /// The read is bounded to the type's one layer subtree
485    /// (O(entities-in-layer)): a type lives in exactly one layer, and
486    /// `default_type_folder` always encodes it (recognized → its SPEC layer;
487    /// unrecognized → `records/`), so the walk never fans out across every
488    /// sidecar in the store and stays inside the interactive loop's
489    /// O(entities) contract.
490    ///
491    /// The whole-layer read — rather than reading only the type's canonical
492    /// folder sidecar when it happens to exist — is what makes the result
493    /// *complete*. A single `type` can legitimately be filed across several
494    /// folders within its layer: a conclusion `profile` filed under any
495    /// `records/<folder>/`, or a `contact` filed in `records/clients/` alongside
496    /// the canonical `records/contacts/`. The previous code read only the
497    /// canonical-guess sidecar whenever it was a file, which silently dropped
498    /// those non-canonical records the moment the canonical sidecar existed —
499    /// returning an incomplete set, and a *different* set as the store grew
500    /// (the omission flipped on once one canonical record was added). That
501    /// broke the dedup/enumeration premise this primitive backs and disagreed
502    /// with `find_by_where_in`, which already walks the whole layer. Filtering
503    /// the layer read by `type` keeps the result complete regardless of how the
504    /// type's records are foldered.
505    pub fn find_by_type(&self, type_: &str) -> Result<Vec<IndexRecord>, StoreError> {
506        let canonical_folder = default_type_folder(type_);
507        let records = self.read_all_type_indexes_in(layer_of_folder(&canonical_folder))?;
508        Ok(records.into_iter().filter(|r| r.type_ == type_).collect())
509    }
510
511    /// Candidate set for a `key=value` frontmatter query, **store-wide**: read
512    /// every type-folder `index.jsonl` sidecar and filter their records. The
513    /// unscoped pre-write dedup primitive; prefer [`Store::find_by_where_in`]
514    /// with a layer scope to stay O(entities-in-layer) on the interactive loop.
515    pub fn find_by_where(&self, key: &str, value: &str) -> Result<Vec<IndexRecord>, StoreError> {
516        self.find_by_where_in(key, value, None)
517    }
518
519    /// Candidate set for a `key=value` frontmatter query, **scoped to one
520    /// layer** when `layer` is `Some`: the sidecar walk is confined to that
521    /// layer's subtree (`<root>/<layer>/`), so the I/O is O(entities-in-layer),
522    /// not O(store records). `None` keeps the store-wide read.
523    ///
524    /// This is what makes `--in <layer>` an I/O scope, not just a result
525    /// filter: a `--where`-only query (no `--type`) used to read every sidecar
526    /// in the store and narrow by layer in memory, breaking the O(entities)
527    /// contract the interactive loop depends on. With a layer in hand we walk
528    /// only that layer's sidecars.
529    pub fn find_by_where_in(
530        &self,
531        key: &str,
532        value: &str,
533        layer: Option<Layer>,
534    ) -> Result<Vec<IndexRecord>, StoreError> {
535        // A `key=value` query can target any frontmatter field across any type,
536        // so within the chosen subtree we still read every type-folder sidecar
537        // and filter. The layer (when given) bounds *which* subtree, turning a
538        // whole-store walk into a single-layer walk.
539        let records = self.read_all_type_indexes_in(layer)?;
540        Ok(records
541            .into_iter()
542            .filter(|r| record_matches_field(r, key, value))
543            .collect())
544    }
545
546    /// Every record across the type-folder `index.jsonl` sidecars, scoped to one
547    /// layer when `layer` is `Some` (the walk is confined to `<root>/<layer>/`)
548    /// else store-wide. Sequential, complete sidecar reads — never a
549    /// walk-and-parse of the content tree.
550    ///
551    /// This is the unfiltered sidecar-enumeration primitive the relationship
552    /// loop sits on: [`crate::graph::backlinks_filtered`] uses it to bound its
553    /// candidate set to the relevant layer (or the whole store) without opening
554    /// the content tree, then confirms each candidate's edge by parsing the file.
555    pub fn sidecar_records(&self, layer: Option<Layer>) -> Result<Vec<IndexRecord>, StoreError> {
556        self.read_all_type_indexes_in(layer)
557    }
558
559    /// Parse a type-folder's `index.jsonl` into [`IndexRecord`]s, applying
560    /// last-write-wins by `path` over any un-compacted lines. The sidecar-read
561    /// primitive every structured query sits on.
562    pub fn read_type_index(&self, index_jsonl: &Path) -> Result<Vec<IndexRecord>, StoreError> {
563        let text = std::fs::read_to_string(index_jsonl).map_err(|e| StoreError::BadTypeIndex {
564            path: index_jsonl.to_path_buf(),
565            message: e.to_string(),
566        })?;
567
568        // Last-write-wins by `path` over un-compacted lines: a later line for
569        // the same path supersedes an earlier one (the jsonl is append-mostly
570        // and only compacted on rebuild). Blank lines are skipped; a non-blank
571        // line that is not a valid IndexRecord is a hard parse error.
572        let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
573        for (i, line) in text.lines().enumerate() {
574            let trimmed = line.trim();
575            if trimmed.is_empty() {
576                continue;
577            }
578            let record: IndexRecord =
579                serde_json::from_str(trimmed).map_err(|e| StoreError::BadTypeIndex {
580                    path: index_jsonl.to_path_buf(),
581                    message: format!("line {}: {e}", i + 1),
582                })?;
583            by_path.insert(record.path.clone(), record);
584        }
585        // BTreeMap keyed by path → records emerge sorted by path ascending,
586        // a deterministic order independent of line order in the file.
587        Ok(by_path.into_values().collect())
588    }
589
590    /// Resolve a store-relative path to its absolute on-disk path under
591    /// [`root`](Store::root).
592    pub fn abs_path(&self, store_relative: &Path) -> PathBuf {
593        // `Path::join` returns `store_relative` unchanged if it is already
594        // absolute, so passing an absolute path through is a no-op.
595        self.root.join(store_relative)
596    }
597
598    /// Convert an absolute path under the store into its store-relative form.
599    pub fn rel_path(&self, abs: &Path) -> Option<PathBuf> {
600        abs.strip_prefix(&self.root).ok().map(|p| p.to_path_buf())
601    }
602
603    // ── Private helpers ─────────────────────────────────────────────────────
604
605    /// Resolve a caller-supplied folder path (store-relative or absolute) to an
606    /// absolute path under the store root.
607    fn resolve_under_root(&self, folder: &Path) -> PathBuf {
608        if folder.is_absolute() {
609            folder.to_path_buf()
610        } else {
611            self.root.join(folder)
612        }
613    }
614
615    /// Walk a subtree for content `.md` files (skip hidden dirs, skip `index.md`
616    /// / `DB.md` / `log.md`), returning store-relative paths. Used by the layer
617    /// and type-folder walks.
618    fn walk_content_md(&self, root: &Path) -> Result<Vec<PathBuf>, StoreError> {
619        let mut out = Vec::new();
620        for entry in self.md_walker(root).build() {
621            let entry = entry.map_err(|e| StoreError::Search {
622                root: root.to_path_buf(),
623                message: e.to_string(),
624            })?;
625            if !is_file_entry(&entry) {
626                continue;
627            }
628            let path = entry.path();
629            if !has_md_extension(path) {
630                continue;
631            }
632            if is_non_content_basename(path) {
633                continue;
634            }
635            if let Some(rel) = self.rel_path(path) {
636                out.push(rel);
637            }
638        }
639        out.sort();
640        Ok(out)
641    }
642
643    /// Walk the whole store for **every** `.md` file (including `index.md`),
644    /// skipping hidden dirs and the `log/` archive tree. Used by the backlink
645    /// scan, where the literal link text can live in any markdown file.
646    fn walk_all_md(&self) -> Result<Vec<PathBuf>, StoreError> {
647        let mut out = Vec::new();
648        for entry in self.md_walker(&self.root).build() {
649            let entry = entry.map_err(|e| StoreError::Search {
650                root: self.root.clone(),
651                message: e.to_string(),
652            })?;
653            if !is_file_entry(&entry) {
654                continue;
655            }
656            let path = entry.path();
657            if !has_md_extension(path) {
658                continue;
659            }
660            if self.is_in_log_dir(path) {
661                continue;
662            }
663            if let Some(rel) = self.rel_path(path) {
664                out.push(rel);
665            }
666        }
667        out.sort();
668        Ok(out)
669    }
670
671    /// Read and merge every type-folder `index.jsonl` sidecar under `layer`
672    /// when given, else the whole store (skip hidden + `log/`). Each sidecar is
673    /// read with last-write-wins by path; across sidecars, paths are disjoint by
674    /// construction (one sidecar per folder), so a plain concatenation preserves
675    /// completeness. A layer scope confines the walk to `<root>/<layer>/`, which
676    /// is what keeps `find_by_where_in` O(entities-in-layer).
677    fn read_all_type_indexes_in(
678        &self,
679        layer: Option<Layer>,
680    ) -> Result<Vec<IndexRecord>, StoreError> {
681        let mut out = Vec::new();
682        for sidecar in self.find_type_index_files_in(layer)? {
683            out.extend(self.read_type_index(&self.abs_path(&sidecar))?);
684        }
685        Ok(out)
686    }
687
688    /// Locate every `index.jsonl` sidecar under `layer` (when given) else the
689    /// whole store (skip hidden + `log/`), returning store-relative paths. A
690    /// scoped read walks `<root>/<layer>/`; the store-wide read enumerates the
691    /// two canonical layer subtrees (`sources/`, `records/`) — the
692    /// same store model [`Store::walk`] uses — rather than walking from
693    /// `self.root`. Walking from root would descend into non-layer top-level
694    /// dirs (`EXPECTED/` test goldens, an `archive/` of frozen index copies,
695    /// any sibling folder holding store-relative `path`s), pulling their
696    /// sidecars in and returning every record twice. A non-existent layer
697    /// subtree yields no sidecars rather than walking a missing path.
698    fn find_type_index_files_in(&self, layer: Option<Layer>) -> Result<Vec<PathBuf>, StoreError> {
699        // Store-wide read: union the per-layer scoped reads so only the three
700        // content layers are walked (never root meta files or non-layer dirs),
701        // matching `Store::walk`. The per-layer paths are disjoint by folder, so
702        // a plain concatenation preserves completeness.
703        let Some(layer) = layer else {
704            let mut out = Vec::new();
705            for l in Layer::all() {
706                out.extend(self.find_type_index_files_in(Some(l))?);
707            }
708            out.sort();
709            return Ok(out);
710        };
711        let walk_root = self.root.join(layer.dir_name());
712        // A scoped walk over a layer folder that does not exist yet must be an
713        // empty result, mirroring `walk_layer`'s missing-dir guard — not a walk
714        // error from `ignore` over a nonexistent path.
715        if !walk_root.is_dir() {
716            return Ok(Vec::new());
717        }
718        let mut out = Vec::new();
719        let mut builder = WalkBuilder::new(&walk_root);
720        builder
721            .standard_filters(false)
722            .hidden(true)
723            .follow_links(true);
724        for entry in builder.build() {
725            let entry = entry.map_err(|e| StoreError::Search {
726                root: walk_root.clone(),
727                message: e.to_string(),
728            })?;
729            if !is_file_entry(&entry) {
730                continue;
731            }
732            let path = entry.path();
733            if path.file_name().and_then(|n| n.to_str()) != Some(TYPE_INDEX_FILE) {
734                continue;
735            }
736            if self.is_in_log_dir(path) {
737                continue;
738            }
739            if let Some(rel) = self.rel_path(path) {
740                out.push(rel);
741            }
742        }
743        out.sort();
744        Ok(out)
745    }
746
747    /// A `WalkBuilder` configured for db.md SWEEPs: gitignore/global-ignore are
748    /// OFF (a SWEEP must see every file even if the store is a git repo with a
749    /// `.gitignore`), but hidden files/dirs are skipped. Symlinks are
750    /// **followed** (`follow_links(true)`) so a symlinked `.md` content file or
751    /// a symlinked type folder (e.g. `records/companies -> /other/disk/...`) is
752    /// walked like any other content rather than silently vanishing; a symlinked
753    /// layer dir was already traversed (the walk root is followed), so following
754    /// symlinks one level deeper just removes that inconsistency.
755    fn md_walker(&self, root: &Path) -> WalkBuilder {
756        let mut builder = WalkBuilder::new(root);
757        builder
758            .standard_filters(false)
759            .hidden(true)
760            .follow_links(true);
761        builder
762    }
763
764    /// True if an absolute path lives under the store's root-level `log/`
765    /// rotation-archive directory.
766    fn is_in_log_dir(&self, abs: &Path) -> bool {
767        match self.rel_path(abs) {
768            Some(rel) => rel.components().next().map(|c| c.as_os_str()) == Some("log".as_ref()),
769            None => false,
770        }
771    }
772
773    /// Read a file's frontmatter `updated` field as an RFC3339 timestamp,
774    /// returning `None` when absent/unparseable. A self-contained reader (does
775    /// not depend on the not-yet-implemented `parser::read_file`); parses the
776    /// leading `---`-fenced YAML block with the same engine the parser uses.
777    fn read_updated(&self, abs: &Path) -> Option<DateTime<FixedOffset>> {
778        let text = std::fs::read_to_string(abs).ok()?;
779        let yaml = frontmatter_block(&text)?;
780        let value: serde_norway::Value = serde_norway::from_str(yaml).ok()?;
781        let raw = value.get("updated")?;
782        value_to_datetime(raw)
783    }
784
785    /// The `<YYYY>/<MM>` shard segment for a sharding type, from its primary
786    /// date field with a `created` fallback. Reads the public `Frontmatter`
787    /// fields directly. `None` when no usable date is present.
788    fn primary_shard_segment(&self, type_: &str, fm: &Frontmatter) -> Option<(String, String)> {
789        // Try the type's primary date field first.
790        if let Some(field) = primary_date_field(type_) {
791            if let Some(v) = fm.extra.get(field) {
792                if let Some(seg) = value_to_year_month(v) {
793                    return Some(seg);
794                }
795            }
796        }
797        // Universal fallback: the typed `created` timestamp.
798        fm.created
799            .map(|dt| (format!("{:04}", dt.year()), format!("{:02}", dt.month())))
800    }
801}
802
803// ── Path containment (security) ─────────────────────────────────────────────
804
805/// Canonicalize `candidate` (resolving symlinks; for a not-yet-existing leaf,
806/// canonicalize its existing parent chain and re-append the leaf) and return it
807/// only if it resolves inside `store_root`; otherwise `Err`.
808///
809/// This is the single within-store containment gate. A wiki-link target, a
810/// rename destination, or any other caller-influenced path must pass through
811/// here before it is read or traversed, so a `..`-laden or symlink-escaping
812/// target can never turn a store operation into a read of an arbitrary file
813/// outside the store. `store_root` itself is canonicalized first so the
814/// `starts_with` comparison is symlink-stable on both sides (e.g. macOS's
815/// `/tmp` → `/private/tmp`).
816pub fn ensure_path_within_store(store_root: &Path, candidate: &Path) -> std::io::Result<PathBuf> {
817    reject_parent_components(store_root, candidate)?;
818
819    // Canonicalize the root so both sides of the containment check are in the
820    // same (fully-resolved) namespace. This also resolves any `..` the root
821    // itself carries (the user-supplied `--dir`), which the tail-only lexical
822    // check deliberately leaves in place.
823    let root = store_root.canonicalize()?;
824    resolve_within(&root, store_root, candidate)
825}
826
/// The lexical half of the containment gate: refuse any `..` component in the
/// caller-influenced tail of `candidate`.
///
/// The "tail" is what is left after stripping `store_root` as a lexical
/// prefix. Callers build candidates as `store_root.join(rel)`, so a root such
/// as `../../some/store` legitimately contributes leading `..` components;
/// those belong to the trusted root and are not inspected here. A candidate
/// that does not start with `store_root` has no trusted prefix and is
/// inspected in full.
///
/// # Errors
/// `PermissionDenied` when the inspected part contains a `..` component.
fn reject_parent_components(store_root: &Path, candidate: &Path) -> std::io::Result<()> {
    use std::path::Component;

    let inspected = match candidate.strip_prefix(store_root) {
        Ok(tail) => tail,
        Err(_) => candidate,
    };

    // `..` cannot be normalized away lexically: once a symlink sits earlier in
    // the path, `foo/../bar` is not `bar`, and canonicalizing an existing
    // prefix could collapse `a/b/../../outside` into something that merely
    // looks contained. In-store targets are forward-only paths, so a `..`
    // here is either an escape attempt or a malformed target.
    for part in inspected.components() {
        if part == Component::ParentDir {
            let message = format!(
                "path {} contains a `..` component beyond the store root {} and cannot be contained",
                candidate.display(),
                store_root.display()
            );
            return Err(std::io::Error::new(
                std::io::ErrorKind::PermissionDenied,
                message,
            ));
        }
    }
    Ok(())
}
866
867/// The resolution half of the containment gate, against a pre-canonicalized
868/// `root`: canonicalize `candidate` as far as it exists (peeling a virtual
869/// tail), reassemble, and require the result to stay under `root`.
870fn resolve_within(root: &Path, store_root: &Path, candidate: &Path) -> std::io::Result<PathBuf> {
871    // Resolve the candidate as far as it exists on disk. `canonicalize` fails on
872    // a not-yet-existing leaf, so peel trailing components until the remaining
873    // prefix exists, canonicalize that, then re-append the peeled tail. This
874    // resolves any symlink in the existing parent chain (an escape vector) while
875    // still working for a target that does not exist yet (a rename destination).
876    let mut existing = candidate.to_path_buf();
877    let mut tail: Vec<std::ffi::OsString> = Vec::new();
878    let resolved_prefix = loop {
879        match existing.canonicalize() {
880            Ok(p) => break p,
881            Err(_) => {
882                // No existing prefix left to canonicalize → resolve relative to
883                // the canonical root (the candidate is somewhere under, or
884                // escaping from, the store) and let the containment check below
885                // decide. Pop one component and keep peeling.
886                match existing.file_name() {
887                    Some(name) => {
888                        tail.push(name.to_os_string());
889                        if !existing.pop() {
890                            // Ran out of components without finding an existing
891                            // prefix: anchor the un-resolvable remainder at the
892                            // canonical root so a relative candidate is judged
893                            // against the store, not the process CWD.
894                            break root.to_path_buf();
895                        }
896                    }
897                    None => {
898                        // A root/prefix component with no file name and no
899                        // on-disk existence: anchor at the canonical root.
900                        break root.to_path_buf();
901                    }
902                }
903            }
904        }
905    };
906
907    // Reassemble: canonical existing prefix + the peeled (still-virtual) tail,
908    // in original order (the peel pushed them reversed).
909    let mut resolved = resolved_prefix;
910    for name in tail.into_iter().rev() {
911        resolved.push(name);
912    }
913
914    if resolved.starts_with(root) {
915        Ok(resolved)
916    } else {
917        Err(outside_store_err(candidate, store_root))
918    }
919}
920
/// Build the `PermissionDenied` error reported when `candidate` resolves to a
/// location that is not under `store_root`. Both paths are embedded in the
/// message as given by the caller (not canonicalized).
fn outside_store_err(candidate: &Path, store_root: &Path) -> std::io::Error {
    use std::io::{Error, ErrorKind};

    let message = format!(
        "path {} resolves outside the store root {}",
        candidate.display(),
        store_root.display()
    );
    Error::new(ErrorKind::PermissionDenied, message)
}
931
932/// Hot-loop companion to [`ensure_path_within_store`]: identical per-candidate
933/// semantics, amortized cost. The single-shot gate re-canonicalizes the store
934/// root and walks the candidate's whole parent chain via `canonicalize` on
935/// every call — two realpath(3) chains per candidate, which at a 10k-file scan
936/// set dominates the scan itself. This helper canonicalizes the root ONCE at
937/// construction and memoizes each distinct parent directory's canonical form
938/// (scan candidates cluster into a few dozen type/shard folders), so the
939/// common candidate — an existing, non-symlink file in a known folder — costs
940/// one `lstat(2)` and a prefix check. Symlink leaves, missing files, and other
941/// corners fall back to the same full peel-resolution the single-shot gate
942/// runs, so no candidate gets a weaker check: a poisoned path still resolves
943/// (or fails) exactly as before.
944pub struct StoreContainment {
945    store_root: PathBuf,
946    /// The store root, canonicalized once at construction.
947    root: PathBuf,
948    /// Parent dir → its canonical form (memoized realpath).
949    dirs: BTreeMap<PathBuf, PathBuf>,
950}
951
952impl StoreContainment {
953    /// Canonicalize the store root once. Errs only if the root itself cannot
954    /// resolve (deleted mid-operation) — the same condition that would fail
955    /// every single-shot gate call.
956    pub fn new(store_root: &Path) -> std::io::Result<Self> {
957        Ok(Self {
958            store_root: store_root.to_path_buf(),
959            root: store_root.canonicalize()?,
960            dirs: BTreeMap::new(),
961        })
962    }
963
964    /// [`ensure_path_within_store`], amortized: same acceptance set, same
965    /// rejection set (see the struct doc).
966    pub fn resolve(&mut self, candidate: &Path) -> std::io::Result<PathBuf> {
967        reject_parent_components(&self.store_root, candidate)?;
968
969        // Fast path: an existing, non-symlink leaf under a memoizable parent.
970        // `symlink_metadata` (lstat, no path resolution) both proves existence
971        // and rules out a symlink leaf; the parent's canonical form resolves
972        // every symlink earlier in the chain, so `canonical(parent) + leaf` is
973        // exactly what `canonicalize(candidate)` would return.
974        if let (Ok(meta), Some(parent), Some(name)) = (
975            std::fs::symlink_metadata(candidate),
976            candidate.parent(),
977            candidate.file_name(),
978        ) {
979            if !meta.file_type().is_symlink() {
980                let canon_parent = match self.dirs.get(parent) {
981                    Some(p) => p.clone(),
982                    None => {
983                        let p = parent.canonicalize()?;
984                        self.dirs.insert(parent.to_path_buf(), p.clone());
985                        p
986                    }
987                };
988                let resolved = canon_parent.join(name);
989                return if resolved.starts_with(&self.root) {
990                    Ok(resolved)
991                } else {
992                    Err(outside_store_err(candidate, &self.store_root))
993                };
994            }
995        }
996
997        // Slow path — symlink leaf, missing file, no parent: the full peel,
998        // against the already-canonical root.
999        resolve_within(&self.root, &self.store_root, candidate)
1000    }
1001}
1002
1003// ── The shared wiki-link edge notion (graph / stats / validate / rename) ─────
1004//
1005// One definition of "what `[[...]]` text is a real edge" that every relationship
1006// op keys on, so `forwardlinks`, `backlinks`, `links`, `stats`, and `rename`
1007// never disagree with each other (or with `validate`'s body extractor):
1008//
1009//   1. **Fence-aware.** A `[[...]]` inside a ``` / ~~~ fenced code block is a
1010//      documentation example, not an edge — exactly `validate`'s rule. Counting
1011//      it as an edge over-reports backlinks, falsely un-orphans the page, and
1012//      (worst) lets `rename` rewrite verbatim example text.
1013//   2. **Whitespace-trimmed.** `[[ records/contacts/sarah ]]` is the same edge
1014//      as `[[records/contacts/sarah]]`. The inner padding is cosmetic; both the
1015//      forward and the backward view must resolve it identically.
1016//   3. **Case-folded to the filesystem.** Link *resolution* is `is_file()`,
1017//      which is case-insensitive on macOS/Windows. So on a case-insensitive
1018//      filesystem `[[records/contacts/Sarah-Chen]]` and the on-disk
1019//      `sarah-chen.md` are the SAME edge; the comparison key must case-fold to
1020//      match, or backlinks/rename silently miss the link while validate (which
1021//      resolves via the filesystem) considers it fine.
1022
/// Canonicalize a raw `[[...]]` inner target into the wiki-link key: forward
/// slashes, no leading `./` or `/`, no trailing `.md`, surrounding whitespace
/// trimmed. The single key forward and backward edges are compared on. Pairs
/// with [`link_edge_key`] for the case-fold step.
///
/// Leading `./` and `/` prefixes are stripped in any interleaving
/// (`/./a`, `.//./a`, `././a` all become `a`), so the result never starts
/// with either. A leading `../` is left in place.
pub fn canonical_link_target(raw: &str) -> String {
    let normalized = raw.trim().replace('\\', "/");

    // Strip on a borrowed slice: no reallocation per removed prefix.
    let mut rest = normalized.as_str();
    loop {
        if let Some(stripped) = rest.strip_prefix("./") {
            rest = stripped;
        } else if let Some(stripped) = rest.strip_prefix('/') {
            rest = stripped;
        } else {
            break;
        }
    }

    let rest = rest.strip_suffix(".md").unwrap_or(rest);
    // Removing a prefix or the extension can expose inner padding
    // (`./ a`, `a .md`), hence the second trim.
    rest.trim().to_string()
}
1036
1037/// The comparison key for a canonical link target. Two normalizations, applied
1038/// in order, so the string-keyed edge comparison agrees with how the filesystem
1039/// resolves the same link:
1040///
1041///   1. **Unicode NFC, always.** macOS/APFS folds NFC and NFD forms of a name to
1042///      the same file, so a file `records/contacts/josé.md` written NFC
1043///      (`é` = U+00E9) and a link `[[records/contacts/josé]]` written NFD
1044///      (`e` + U+0301) name the *same* file on disk — yet their raw UTF-8 bytes
1045///      differ. Without normalization the graph keys them as two different
1046///      targets, so `backlinks`/`forwardlinks` miss the edge and `orphans` flags
1047///      a linked-to file as an orphan, while `validate` (which resolves through
1048///      the filesystem) sees the link as live: the surfaces silently disagree.
1049///      Normalizing BOTH sides to NFC here makes the comparison
1050///      normalization-insensitive, matching the filesystem. This lives in the
1051///      comparison key — not in [`canonical_link_target`] — so the canonical
1052///      form stays byte/normalization-preserving (rename REWRITE output is never
1053///      silently re-normalized); both the link target and the file path pass
1054///      through this function, so NFC here is sufficient to unify them.
1055///   2. **ASCII case-fold on a case-insensitive filesystem.** Identity on a
1056///      case-sensitive FS, ASCII-lowercased on macOS/Windows, so the comparison
1057///      also agrees with the filesystem's case-folding `is_file()` resolution.
1058///
1059/// Callers compare `link_edge_key(a) == link_edge_key(b)`.
1060pub fn link_edge_key(canonical_target: &str) -> String {
1061    use unicode_normalization::UnicodeNormalization;
1062    // NFC first — always, on every platform: the graph must agree across hosts,
1063    // and the comparison must be normalization-insensitive regardless of which
1064    // host's filesystem folded the on-disk name.
1065    let nfc: String = canonical_target.nfc().collect();
1066    if fs_is_case_insensitive() {
1067        nfc.to_ascii_lowercase()
1068    } else {
1069        nfc
1070    }
1071}
1072
1073/// Extract every wiki-link edge target from a markdown body, fence-aware and
1074/// whitespace-trimmed, in document order (duplicates kept — callers dedup).
1075/// Returns canonical targets (see [`canonical_link_target`]); the case-fold for
1076/// comparison is applied separately via [`link_edge_key`] so the canonical form
1077/// (used for rewrites/output) stays case-preserving.
1078///
1079/// Scans line-by-line tracking the fence state inline (no whole-body
1080/// allocation), exactly mirroring validate's `extract_wiki_links`: the fence
1081/// state is a `(fence char, run length)` tracked via [`fence_opens`] /
1082/// [`fence_closes`] — NOT a bool toggled on any ``` / `~~~` line. The naive
1083/// toggle inverts mid-block when a `~~~` block legally contains a ```` ``` ````
1084/// line (the standard way to document a backtick fence), or when a `>3`-space-
1085/// indented ``` is mistaken for a fence — both of which would let a fenced
1086/// example `[[…]]` leak out as a live edge (a false dependent for
1087/// backlinks/rename). Fenced lines never yield edges. Within a line, the text
1088/// before the first `|` is the target; a target whose trimmed form starts with
1089/// `[` is the rejected triple-bracket flow-form list mis-encoding
1090/// (`[[[a]], [[b]]]`), not a real link — skipped, matching validate.
1091///
1092/// Accepts a whole file's text *or* a body-only fragment. A leading `---`
1093/// frontmatter block is YAML, not markdown: it has no code fences, and a
1094/// `[[…]]` in any frontmatter field is a real edge. The frontmatter is therefore
1095/// scanned WITHOUT fence tracking, and the body is scanned with a FRESH fence
1096/// state — so a stray ``` / `~~~` inside a frontmatter value can never open a
1097/// fence that swallows the body's real wiki-links. (Callers `search_by_link`,
1098/// `forwardlinks`, and `dbmd graph backlinks` all pass full file text; without this
1099/// boundary reset a fenced frontmatter value silently dropped every subsequent
1100/// body edge — under-reporting backlinks/forwardlinks/`links`.) A fragment with
1101/// no leading frontmatter takes the body path unchanged.
1102pub fn extract_edge_targets(text: &str) -> Vec<String> {
1103    let mut out = Vec::new();
1104    // Split off a leading `---`…`---` frontmatter block (raw — no YAML parse, so
1105    // a malformed file is still fully scanned). Frontmatter links are edges but
1106    // must not participate in code-fence state.
1107    let body = match split_frontmatter_raw(text) {
1108        Some((frontmatter, body)) => {
1109            for line in frontmatter.lines() {
1110                push_edges_in_line(line, &mut out);
1111            }
1112            body
1113        }
1114        None => text,
1115    };
1116    let mut fence: Option<(u8, usize)> = None;
1117    for line in body.lines() {
1118        let content = line.trim_end_matches('\r');
1119        if let Some(f) = fence {
1120            if fence_closes(content, f) {
1121                fence = None;
1122            }
1123            continue;
1124        }
1125        if let Some(opened) = fence_opens(content) {
1126            fence = Some(opened);
1127            continue;
1128        }
1129        push_edges_in_line(line, &mut out);
1130    }
1131    out
1132}
1133
1134/// Push every `[[target]]` on one line into `out`, alias-stripped (`[[a|b]]` →
1135/// `a`), trimmed, and canonicalized. The triple-bracket flow-form mis-encoding
1136/// (`[[[a]], …]`) is skipped, matching validate. Shared by both the frontmatter
1137/// and body scans in [`extract_edge_targets`] so they honor one link grammar.
1138fn push_edges_in_line(line: &str, out: &mut Vec<String>) {
1139    let bytes = line.as_bytes();
1140    let mut i = 0usize;
1141    while i + 1 < bytes.len() {
1142        if bytes[i] == b'[' && bytes[i + 1] == b'[' {
1143            if let Some(close) = line[i + 2..].find("]]") {
1144                let inner = &line[i + 2..i + 2 + close];
1145                let raw_target = inner.split('|').next().unwrap_or(inner).trim();
1146                if !raw_target.is_empty() && !raw_target.starts_with('[') {
1147                    let canonical = canonical_link_target(raw_target);
1148                    if !canonical.is_empty() {
1149                        out.push(canonical);
1150                    }
1151                }
1152                i = i + 2 + close + 2;
1153                continue;
1154            }
1155        }
1156        i += 1;
1157    }
1158}
1159
/// If `line` opens a fenced code block, return `(fence byte, run length)`.
///
/// This is the single fence-open rule the link operations share. A line
/// opens a fence when:
///
/// * it is indented by at most three spaces;
/// * the first character after the indent is a backtick or a tilde, repeated
///   at least three times;
/// * for a backtick fence, the rest of the line (the info string) contains
///   no backtick. A tilde fence has no such restriction.
pub fn fence_opens(line: &str) -> Option<(u8, usize)> {
    let indent = line.bytes().take_while(|&b| b == b' ').count();
    if indent > 3 {
        return None;
    }

    let rest = &line[indent..];
    let marker = match rest.as_bytes().first() {
        Some(&b) if b == b'`' || b == b'~' => b,
        _ => return None,
    };

    let run = rest.bytes().take_while(|&b| b == marker).count();
    if run < 3 {
        return None;
    }

    let info = &rest[run..];
    if marker == b'`' && info.contains('`') {
        return None;
    }
    Some((marker, run))
}
1186
/// True if `line` closes the currently open `fence`.
///
/// `fence` is the `(fence byte, run length)` pair returned by
/// [`fence_opens`]. The line closes it when it is indented by at most three
/// spaces, starts with a run of the *same* fence character at least as long
/// as the opening run, and has nothing but whitespace after that run. A
/// fence line of the other character (a ```` ``` ```` line inside a `~~~`
/// block) therefore does not close the outer fence.
pub fn fence_closes(line: &str, fence: (u8, usize)) -> bool {
    let (marker, open_len) = fence;

    let indent = line.bytes().take_while(|&b| b == b' ').count();
    if indent > 3 {
        return false;
    }

    let rest = &line[indent..];
    let after_run = rest.trim_start_matches(marker as char);
    let run = rest.len() - after_run.len();
    run >= open_len && after_run.trim().is_empty()
}
1205
/// True when the OS temp directory's filesystem resolves paths
/// case-insensitively, used as the answer for the host as a whole.
///
/// Probed at most once per process: a uniquely named lowercase marker file
/// is created in the temp dir, its all-uppercase spelling is stat-ed, and the
/// marker is removed again. If the marker cannot be created the answer is
/// `false` (case-sensitive), so a temp-dir problem never widens matching.
fn fs_is_case_insensitive() -> bool {
    use std::sync::OnceLock;
    static CASE_INSENSITIVE: OnceLock<bool> = OnceLock::new();

    fn probe() -> bool {
        let dir = std::env::temp_dir();
        let pid = std::process::id();
        // pid + nanosecond stamp keeps concurrent processes off each other's marker.
        let nanos = SystemTime::now()
            .duration_since(UNIX_EPOCH)
            .map_or(0, |elapsed| elapsed.as_nanos());
        let lower = dir.join(format!(".dbmd-case-probe-{pid}-{nanos}"));
        let upper = dir.join(format!(".DBMD-CASE-PROBE-{pid}-{nanos}"));

        // The uppercase spelling resolving to a file means the case was folded.
        let folded = std::fs::File::create(&lower).is_ok() && upper.is_file();
        // Best-effort cleanup; a leftover marker is harmless.
        let _ = std::fs::remove_file(&lower);
        folded
    }

    *CASE_INSENSITIVE.get_or_init(probe)
}
1233
1234// ── Free helpers (no `self`) ────────────────────────────────────────────────
1235
1236/// True if a walk entry is a regular file, **following symlinks** so a
1237/// symlinked `.md` content file (or a file inside a symlinked type folder) is
1238/// counted like any other content file.
1239///
1240/// The store walks enable `follow_links(true)`, so a symlink entry's
1241/// `file_type()` still reports `is_symlink()` (the `ignore` walker does not
1242/// rewrite the entry's own type), not the followed target's type. Treat a
1243/// symlink whose target is a regular file as a file: `stat` (follow) the path
1244/// and check. A broken symlink (no target) is not a file.
1245fn is_file_entry(entry: &ignore::DirEntry) -> bool {
1246    match entry.file_type() {
1247        Some(ft) if ft.is_file() => true,
1248        Some(ft) if ft.is_symlink() => std::fs::metadata(entry.path())
1249            .map(|m| m.is_file())
1250            .unwrap_or(false),
1251        // A `None` file type (the walk root itself) or a non-file/non-symlink
1252        // entry is not a content file.
1253        _ => false,
1254    }
1255}
1256
/// True if the path's extension is exactly `md`. The comparison is
/// case-sensitive (`.MD` does not count), and a path with no extension or a
/// non-UTF-8 extension is not a match.
fn has_md_extension(path: &Path) -> bool {
    matches!(path.extension().and_then(|ext| ext.to_str()), Some("md"))
}
1262
1263/// True if the basename is a non-content meta file (`DB.md`, `index.md`,
1264/// `log.md`) that the content walks must skip.
1265fn is_non_content_basename(path: &Path) -> bool {
1266    match path.file_name().and_then(|n| n.to_str()) {
1267        Some(name) => NON_CONTENT_BASENAMES.contains(&name),
1268        None => false,
1269    }
1270}
1271
/// Return `name` with a `.md` suffix, appending one only when it is not
/// already there. The check is a case-sensitive suffix match, so `a.MD`
/// becomes `a.MD.md`.
fn ensure_md_extension(name: &str) -> String {
    let mut file_name = String::from(name);
    if !file_name.ends_with(".md") {
        file_name.push_str(".md");
    }
    file_name
}
1280
/// The canonical default folder for a type.
///
/// Recognized types map to a fixed folder (`email → sources/emails`,
/// `expense → records/expenses`, …; the full table is in the body). Any
/// other type falls back to `records/<type>` using the bare type name, with
/// no pluralization guess (e.g. `concept` → `records/concept`).
fn default_type_folder(type_: &str) -> PathBuf {
    const KNOWN_FOLDERS: [(&str, &str); 10] = [
        // sources — documentary
        ("email", "sources/emails"),
        ("transcript", "sources/transcripts"),
        ("pdf-source", "sources/docs"),
        // sources — testimonial (a human told the agent X)
        ("note", "sources/notes"),
        // records — entities
        ("contact", "records/contacts"),
        ("company", "records/companies"),
        // records — events
        ("expense", "records/expenses"),
        ("meeting", "records/meetings"),
        ("decision", "records/decisions"),
        ("invoice", "records/invoices"),
    ];

    match KNOWN_FOLDERS.iter().find(|(name, _)| *name == type_) {
        Some((_, folder)) => PathBuf::from(*folder),
        // Unrecognized: bare type name under `records/`.
        None => PathBuf::from("records").join(type_),
    }
}
1307
1308/// The canonical [`Layer`] a `type_` belongs to, derived from its default
1309/// type-folder (`email` → `Sources`, `contact` → `Records`, a conclusion
1310/// `profile` → `Records`, unrecognized → `Records`). The write path uses this to decide whether
1311/// an agent-supplied folder is in the *right* layer for the type before honouring
1312/// its sub-folder choice.
1313pub fn layer_for_type(type_: &str) -> Layer {
1314    layer_of_folder(&default_type_folder(type_)).unwrap_or(Layer::Records)
1315}
1316
1317/// The [`Layer`] a type-folder path lives in, read from its first component
1318/// (`sources/` → `Sources`, `records/` → `Records`). Used to
1319/// bound [`Store::find_by_type`]'s whole-layer sidecar read to a single layer
1320/// subtree. Returns `None` for a path with no recognized layer prefix; every
1321/// value [`default_type_folder`] produces has one, so in practice this is
1322/// always `Some` on the call path — `None` degrades to a store-wide read.
1323fn layer_of_folder(folder: &Path) -> Option<Layer> {
1324    let first = folder.components().next()?.as_os_str().to_str()?;
1325    Layer::from_dir_name(first)
1326}
1327
1328/// True if a store-relative path is a db.md **content** file: rooted in a real
1329/// layer (`sources/` or `records/` as its FIRST component), with a `.md`
1330/// extension, and not an `index.md` sidecar. This is the SPEC's "content files =
1331/// everything under `sources/` and `records/` only" predicate (SPEC § content
1332/// files), keyed on the *first* component so a non-layer top-level dir is never
1333/// content even if a deeper component happens to be named `records`/`sources`
1334/// (e.g. `EXPECTED/records/x.md`, `archive/sources/y.md`).
1335///
1336/// It mirrors the graph engine's content filter so the surfaces that READ the
1337/// store (`graph backlinks`) and the surface that MUTATES it (`rename`) agree on
1338/// exactly which files are content. `rename` uses it to restrict its
1339/// link-rewrite set: a store-root file, a non-layer dir (`scratch/`,
1340/// `EXPECTED/`, `archive/`), or an `index.md` is NEVER rewritten — `rename` does
1341/// not own those bytes. The broad store scan ([`Store::find_links_to_any`],
1342/// shared with the read-only working-set validate) is left untouched; the filter
1343/// is applied at the point of mutation.
1344pub fn is_content_path(rel: &Path) -> bool {
1345    if layer_of_folder(rel).is_none() {
1346        return false;
1347    }
1348    if rel.extension().and_then(|e| e.to_str()) != Some("md") {
1349        return false;
1350    }
1351    rel.file_name().and_then(|n| n.to_str()) != Some("index.md")
1352}
1353
/// Infer a content file's canonical `type` from its store-relative path — the
/// inverse of [`default_type_folder`] and the single source of truth for
/// path→type inference (the CLI's `fm init` calls this, never re-derives it).
///
/// Requires the canonical `<layer>/<type-folder>/<file>` shape (at least three
/// components). Returns `None` when:
/// - the path has fewer than three components (e.g. a file directly under a
///   layer);
/// - the first component is not `sources` or `records`;
/// - the layer or type-folder component is not valid UTF-8.
///
/// Components are read **positionally**: a non-UTF-8 component is never
/// skipped, so a deeper directory cannot be mistaken for the type-folder
/// (`records/<non-utf8>/contacts/x.md` is `None`, not `contact`).
///
/// Recognized `(layer, folder)` pairs map back to their canonical type. For an
/// unrecognized folder the fallback is the **bare folder name verbatim** (no
/// pluralization/singularization) so it round-trips with `default_type_folder`,
/// whose unrecognized fallback is the bare type name (`task` ⇄ `records/task`).
/// Singularizing here would break that round-trip (`records/tasks` → `task`
/// while `default_type_folder("task")` → `records/task`). A conclusion record's
/// folder (e.g. `records/profiles/`) infers its bare folder name (`profiles`),
/// the same custom-type fallback as any other unrecognized folder.
pub fn infer_type_from_path(rel: &Path) -> Option<String> {
    // Convert each positional component individually. Filtering non-UTF-8
    // components out of the iterator up front would shift every later
    // component one slot to the left and mis-identify the type-folder.
    let mut comps = rel.components();
    let layer = comps.next()?.as_os_str().to_str()?;
    if !matches!(layer, "sources" | "records") {
        return None;
    }
    let folder = comps.next()?.as_os_str().to_str()?;
    // The file itself must be a third component (a real type-folder, not the
    // file sitting directly under the layer). Only its existence matters.
    comps.next()?;

    let mapped = match (layer, folder) {
        ("sources", "emails") => "email",
        ("sources", "transcripts") => "transcript",
        ("sources", "docs") => "pdf-source",
        ("sources", "notes") => "note",
        ("records", "contacts") => "contact",
        ("records", "companies") => "company",
        ("records", "expenses") => "expense",
        ("records", "meetings") => "meeting",
        ("records", "decisions") => "decision",
        ("records", "invoices") => "invoice",
        // Unrecognized folder: the bare name, verbatim. This is the inverse of
        // `default_type_folder`'s unrecognized fallback (`other → records/other`)
        // and the round-trip would break if we pluralized/singularized here.
        (_, other) => other,
    };
    Some(mapped.to_string())
}
1399
/// Names the frontmatter field whose value drives a type's `<YYYY>/<MM>`
/// shard, or `None` when the type has no such field and only the `created`
/// fallback applies.
fn primary_date_field(type_: &str) -> Option<&'static str> {
    let field = match type_ {
        "email" | "expense" | "invoice" | "meeting" => "date",
        "transcript" => "recorded_at",
        "pdf-source" => "received_at",
        "note" => "told_at",
        // Every other type (custom event types included) has no canonical
        // date field name and falls back to `created`.
        _ => return None,
    };
    Some(field)
}
1414
1415/// Parse a YAML value into an RFC3339 [`DateTime`], accepting both an explicit
1416/// string and a YAML-native scalar rendered to string.
1417fn value_to_datetime(value: &serde_norway::Value) -> Option<DateTime<FixedOffset>> {
1418    let s = yaml_scalar_string(value)?;
1419    DateTime::parse_from_rfc3339(s.trim()).ok()
1420}
1421
1422/// Extract `(YYYY, MM)` from a YAML date/timestamp value. Lenient: matches a
1423/// leading `YYYY-MM` so a bare `2026-05-22` date and a full
1424/// `2026-05-22T10:00:00-07:00` timestamp both work.
1425fn value_to_year_month(value: &serde_norway::Value) -> Option<(String, String)> {
1426    let s = yaml_scalar_string(value)?;
1427    year_month_from_str(s.trim())
1428}
1429
/// Extracts `(YYYY, MM)` from the leading `YYYY-M` or `YYYY-MM` of a date
/// string, returning the month zero-padded to two digits.
///
/// Accepting a single-digit month makes `2026-1-15` and its zero-padded twin
/// `2026-01-15` shard to the *same* `2026/01` folder. This matches the lenient
/// `date`-shape validator (`is_iso8601_date_or_datetime`, chrono `%Y-%m-%d`),
/// which accepts an unpadded month; otherwise a value the validator treats as a
/// valid date would be silently mis-filed under the `created`-fallback month.
///
/// Returns `None` unless the string starts with exactly four ASCII digits, a
/// `-`, and then one or two ASCII digits forming a month in `1..=12` that is
/// followed by either another `-` or the end of the string.
fn year_month_from_str(s: &str) -> Option<(String, String)> {
    // Parsed by hand rather than with a regex, to keep a regex compile off the
    // write path.
    let all_ascii_digits = |text: &str| text.bytes().all(|b| b.is_ascii_digit());

    // Year is everything before the first '-'; there must be a '-'.
    let (year, tail) = s.split_once('-')?;
    // Month is everything up to the next '-' (or the end of the string).
    // Whatever follows that second '-' (a day, a time) is never inspected.
    let month_digits = tail.split('-').next()?;

    if year.len() != 4 || !all_ascii_digits(year) {
        return None;
    }
    if !(1..=2).contains(&month_digits.len()) || !all_ascii_digits(month_digits) {
        return None;
    }

    let month: u8 = month_digits.parse().ok()?;
    if month == 0 || month > 12 {
        return None;
    }

    Some((year.to_owned(), format!("{month:02}")))
}
1467
1468/// Render a YAML scalar as a string: a real `String` verbatim, otherwise the
1469/// value's compact YAML serialization (covers timestamps that the YAML engine
1470/// may surface as a non-string scalar).
1471fn yaml_scalar_string(value: &serde_norway::Value) -> Option<String> {
1472    if let Some(s) = value.as_str() {
1473        return Some(s.to_string());
1474    }
1475    match value {
1476        serde_norway::Value::Null => None,
1477        serde_norway::Value::Mapping(_) | serde_norway::Value::Sequence(_) => None,
1478        other => serde_norway::to_string(other)
1479            .ok()
1480            .map(|s| s.trim().to_string()),
1481    }
1482}
1483
1484/// The YAML frontmatter block of a file: the text between a leading `---` fence
1485/// and the next `---` fence, exclusive. `None` if the file does not open with a
1486/// `---` fence on its first line.
1487fn frontmatter_block(text: &str) -> Option<&str> {
1488    // Tolerate a UTF-8 BOM and CRLF, but the fence must be the very first line.
1489    let body = text.strip_prefix('\u{feff}').unwrap_or(text);
1490    let mut rest = body;
1491    // First line must be exactly `---`, tolerating trailing whitespace (CR, but
1492    // also spaces/tabs) — matching the canonical parser (`parser.rs` /
1493    // `index.rs`'s `extract_frontmatter_block`). A strict `\r`-only trim missed a
1494    // `--- ` fence, so `read_updated` returned None and date-sharding silently
1495    // fell back, disagreeing with the sidecar the rest of the toolkit builds.
1496    let (first, after_first) = split_first_line(rest);
1497    if first.trim_end() != "---" {
1498        return None;
1499    }
1500    rest = after_first;
1501    let block_start = rest;
1502    let mut scanned = 0usize;
1503    loop {
1504        let (line, after) = split_first_line(rest);
1505        if line.trim_end() == "---" {
1506            return Some(&block_start[..scanned]);
1507        }
1508        if after.is_empty() && line.is_empty() {
1509            // Reached end of input without a closing fence.
1510            return None;
1511        }
1512        scanned += line.len() + 1; // +1 for the consumed '\n'
1513        if after.is_empty() {
1514            return None;
1515        }
1516        rest = after;
1517    }
1518}
1519
1520/// Split a file's text into `(frontmatter, body)` at the leading `---`…`---`
1521/// fence — raw (no YAML parse), so a file with malformed frontmatter is still
1522/// split and fully scanned. `frontmatter` is the text between the fences
1523/// (exclusive); `body` is everything after the closing fence's line. Returns
1524/// `None` when the text does not open with a `---` fence or has no closing
1525/// fence — the caller then treats the whole text as body. Mirrors
1526/// [`frontmatter_block`]'s boundary detection (BOM- and CRLF-tolerant).
1527fn split_frontmatter_raw(text: &str) -> Option<(&str, &str)> {
1528    let stripped = text.strip_prefix('\u{feff}').unwrap_or(text);
1529    let (first, after_first) = split_first_line(stripped);
1530    if first.trim_end() != "---" {
1531        return None;
1532    }
1533    let block_start = after_first;
1534    let mut scanned = 0usize;
1535    let mut rest = after_first;
1536    loop {
1537        let (line, after) = split_first_line(rest);
1538        if line.trim_end() == "---" {
1539            // `after` is the body: everything past the closing fence line.
1540            return Some((&block_start[..scanned], after));
1541        }
1542        if after.is_empty() && line.is_empty() {
1543            return None; // reached EOF with no closing fence
1544        }
1545        scanned += line.len() + 1; // +1 for the consumed '\n'
1546        if after.is_empty() {
1547            return None; // closing fence never found
1548        }
1549        rest = after;
1550    }
1551}
1552
/// Cuts `s` at its first `\n`, returning `(line, remainder)`.
///
/// `line` excludes the `\n` (a preceding `\r` is kept); `remainder` is what
/// follows it. When `s` contains no newline the whole string is the line and
/// the remainder is empty.
fn split_first_line(s: &str) -> (&str, &str) {
    s.split_once('\n').unwrap_or((s, ""))
}
1562
1563/// True if an [`IndexRecord`] has a field `key` equal to `value`, checking the
1564/// typed columns first and then the flattened `fields` map.
1565fn record_matches_field(record: &IndexRecord, key: &str, value: &str) -> bool {
1566    match key {
1567        "type" => record.type_ == value,
1568        "summary" => record.summary == value,
1569        "path" => record.path.to_string_lossy() == value,
1570        "created" => timestamp_matches(record.created, value),
1571        "updated" => timestamp_matches(record.updated, value),
1572        "tags" => record.tags.iter().any(|t| t == value),
1573        "links" => record.links.iter().any(|l| l == value),
1574        other => record
1575            .fields
1576            .get(other)
1577            .map(|v| json_value_matches(v, value))
1578            .unwrap_or(false),
1579    }
1580}
1581
1582/// Compare a record's `created`/`updated` instant against a query `value`.
1583///
1584/// db.md files write timestamps in several equivalent RFC3339 spellings — most
1585/// commonly the `Z` UTC designator (`2026-05-01T00:00:00Z`) but also an explicit
1586/// offset (`...+00:00`, `...-07:00`). A naive `record.created.to_rfc3339() ==
1587/// value` reformats only one side: chrono renders a UTC instant as `+00:00`, so
1588/// the `Z` form an agent reads straight out of the file would never match. We
1589/// instead parse `value` as RFC3339 and compare instants, where `Z` and `+00:00`
1590/// (and any same-instant offset) are equal. A `value` that is not valid RFC3339
1591/// can never equal a real timestamp, so it falls through to `false`.
1592fn timestamp_matches(stored: Option<DateTime<FixedOffset>>, value: &str) -> bool {
1593    match (stored, DateTime::parse_from_rfc3339(value)) {
1594        (Some(stored), Ok(queried)) => stored == queried,
1595        _ => false,
1596    }
1597}
1598
1599/// Match a JSON number against a query string.
1600///
1601/// A FLOAT-valued field is compared NUMERICALLY, not textually: the sidecar
1602/// stores a YAML float through serde_json's canonical f64 rendering, which
1603/// discards the file's source spelling (`1234.00` -> `1234.0`, `12.50` ->
1604/// `12.5`, `1e3` -> `1000.0`). A raw `to_string()` compare therefore made the
1605/// spelling a human reads in the file fail to match (and disagreed with
1606/// free-text `search`), while requiring a canonical form often absent from the
1607/// file. We parse the query as f64 and compare values. Restricted to the float
1608/// case so a large INTEGER field never loses exactness to f64 rounding (integers
1609/// render canonically and round-trip exactly through the textual compare).
1610/// Mirrors the parse-then-compare pattern [`timestamp_matches`] already uses.
1611fn number_matches(n: &serde_json::Number, value: &str) -> bool {
1612    if n.to_string() == value {
1613        return true;
1614    }
1615    if n.is_f64() {
1616        if let (Some(stored), Ok(q)) = (n.as_f64(), value.parse::<f64>()) {
1617            return stored == q;
1618        }
1619    }
1620    false
1621}
1622
1623/// Compare a JSON field value against a query string. A string matches
1624/// verbatim; scalars match their textual form; an array matches if any element
1625/// matches (so a list-valued frontmatter field is membership-queried).
1626fn json_value_matches(v: &serde_json::Value, value: &str) -> bool {
1627    match v {
1628        serde_json::Value::String(s) => s == value,
1629        serde_json::Value::Bool(b) => b.to_string() == value,
1630        serde_json::Value::Number(n) => number_matches(n, value),
1631        serde_json::Value::Array(items) => items.iter().any(|i| json_value_matches(i, value)),
1632        // A present-but-null field never matches — consistent with the in-memory
1633        // post-filter (`query::json_value_matches`, which the first `where`
1634        // clause is NOT re-checked against, so the two must agree here or a
1635        // `--where field=` query would return different rows than `--type X
1636        // --where field=`).
1637        serde_json::Value::Null => false,
1638        serde_json::Value::Object(_) => false,
1639    }
1640}
1641
1642#[cfg(test)]
1643mod tests {
1644    use super::*;
1645    use std::fs;
1646    use tempfile::{tempdir, TempDir};
1647
1648    // ── Fixtures ────────────────────────────────────────────────────────────
1649
1650    /// Write `contents` to `<root>/<rel>`, creating parent dirs. Returns the
1651    /// store-relative path for convenient assertions.
1652    fn write(root: &Path, rel: &str, contents: &str) -> PathBuf {
1653        let abs = root.join(rel);
1654        fs::create_dir_all(abs.parent().unwrap()).unwrap();
1655        fs::write(&abs, contents).unwrap();
1656        PathBuf::from(rel)
1657    }
1658
1659    /// A minimal content file with the given `updated` timestamp in frontmatter.
1660    fn content_md(updated: &str) -> String {
1661        format!(
1662            "---\ntype: note\ncreated: {updated}\nupdated: {updated}\nsummary: a note\n---\n\nbody\n"
1663        )
1664    }
1665
1666    /// A bare directory with a `DB.md` marker (valid `db-md` frontmatter so the
1667    /// real parser is exercised).
1668    fn empty_store() -> TempDir {
1669        let dir = tempdir().unwrap();
1670        fs::write(
1671            dir.path().join("DB.md"),
1672            "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n",
1673        )
1674        .unwrap();
1675        dir
1676    }
1677
1678    /// Open a store rooted at a TempDir; panics if `open` rejects it.
1679    fn open(dir: &TempDir) -> Store {
1680        Store::open(dir.path()).expect("fixture should be a valid store")
1681    }
1682
1683    fn rels(paths: &[PathBuf]) -> Vec<String> {
1684        paths
1685            .iter()
1686            .map(|p| p.to_string_lossy().replace('\\', "/"))
1687            .collect()
1688    }
1689
1690    // ── Layer ───────────────────────────────────────────────────────────────
1691
1692    #[test]
1693    fn layer_dir_name_and_parse_are_inverse() {
1694        for layer in Layer::all() {
1695            assert_eq!(Layer::from_dir_name(layer.dir_name()), Some(layer));
1696        }
1697        assert_eq!(Layer::Sources.dir_name(), "sources");
1698        assert_eq!(Layer::Records.dir_name(), "records");
1699        // `wiki` is no longer a layer (the wiki/ layer was removed); it parses to None.
1700        assert_eq!(Layer::from_dir_name("wiki"), None);
1701        assert_eq!(Layer::from_dir_name("log"), None);
1702        assert_eq!(Layer::from_dir_name("Sources"), None); // case-sensitive
1703    }
1704
1705    #[test]
1706    fn layer_order_is_canonical() {
1707        // stats keys a BTreeMap on Layer; the sort order must be sources<records.
1708        let mut v = [Layer::Records, Layer::Sources];
1709        v.sort();
1710        assert_eq!(v, [Layer::Sources, Layer::Records]);
1711    }
1712
1713    #[test]
1714    fn is_content_path_is_layer_rooted_and_excludes_non_layer_files() {
1715        // Real content: a `.md` file rooted in a layer's FIRST component.
1716        assert!(is_content_path(Path::new("records/contacts/alice.md")));
1717        assert!(is_content_path(Path::new("sources/emails/2026/05/x.md")));
1718        // Store-root meta files and a bare top-level note are NOT content.
1719        assert!(!is_content_path(Path::new("DB.md")));
1720        assert!(!is_content_path(Path::new("log.md")));
1721        assert!(!is_content_path(Path::new("NOTES.md")));
1722        // Non-layer top-level dirs are NEVER content — even if a DEEPER
1723        // component is named `records`/`sources` (the rename data-loss case).
1724        assert!(!is_content_path(Path::new("scratch/draft.md")));
1725        assert!(!is_content_path(Path::new("EXPECTED/snapshot.md")));
1726        assert!(!is_content_path(Path::new("archive/old.md")));
1727        assert!(!is_content_path(Path::new(
1728            "EXPECTED/records/contacts/x.md"
1729        )));
1730        assert!(!is_content_path(Path::new("archive/sources/emails/y.md")));
1731        // An `index.md` sidecar inside a layer is a catalog, not content.
1732        assert!(!is_content_path(Path::new("records/contacts/index.md")));
1733        // A non-`.md` file inside a layer (e.g. the jsonl sidecar) is not content.
1734        assert!(!is_content_path(Path::new("records/contacts/index.jsonl")));
1735    }
1736
1737    // ── is_db_md_store / open ────────────────────────────────────────────────
1738
1739    #[test]
1740    fn is_store_true_only_with_uppercase_marker() {
1741        let dir = tempdir().unwrap();
1742        assert!(
1743            !Store::is_db_md_store(dir.path()),
1744            "no marker → not a store"
1745        );
1746
1747        fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").unwrap();
1748        assert!(Store::is_db_md_store(dir.path()), "uppercase DB.md → store");
1749    }
1750
1751    #[test]
1752    fn is_store_false_for_lowercase_db_md() {
1753        // The case-sensitivity contract: a lowercase db.md is the spec name, not
1754        // a marker — even on a case-insensitive filesystem where Path::exists
1755        // would lie. This test must pass on macOS (case-insensitive) too.
1756        let dir = tempdir().unwrap();
1757        fs::write(dir.path().join("db.md"), "---\ntype: db-md\n---\n").unwrap();
1758        assert!(
1759            !Store::is_db_md_store(dir.path()),
1760            "lowercase db.md must NOT be treated as a store marker"
1761        );
1762        assert!(Store::open(dir.path()).is_err());
1763    }
1764
1765    #[test]
1766    fn is_store_false_when_db_md_is_a_directory() {
1767        let dir = tempdir().unwrap();
1768        fs::create_dir(dir.path().join("DB.md")).unwrap();
1769        assert!(
1770            !Store::is_db_md_store(dir.path()),
1771            "a directory named DB.md is not the file marker"
1772        );
1773    }
1774
1775    #[test]
1776    fn open_rejects_non_store_with_path() {
1777        let dir = tempdir().unwrap();
1778        let err = Store::open(dir.path()).unwrap_err();
1779        assert_eq!(err.path, dir.path());
1780    }
1781
1782    #[test]
1783    fn open_succeeds_and_parses_config() {
1784        let dir = tempdir().unwrap();
1785        // A DB.md whose ## Policies declares a frozen page — proves open()
1786        // actually parsed the config rather than substituting a default.
1787        fs::write(
1788            dir.path().join("DB.md"),
1789            "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n\n\
1790             ## Policies\n\n### Frozen pages\n- records/decisions/q1.md\n",
1791        )
1792        .unwrap();
1793        let store = Store::open(dir.path()).unwrap();
1794        assert_eq!(store.root, dir.path());
1795        assert!(
1796            store
1797                .config
1798                .frozen_pages
1799                .iter()
1800                .any(|p| p == Path::new("records/decisions/q1.md")),
1801            "open() must surface DB.md ## Policies, got {:?}",
1802            store.config.frozen_pages
1803        );
1804    }
1805
1806    // ── walk / walk_layer / walk_type_folder ─────────────────────────────────
1807
1808    #[test]
1809    fn walk_collects_content_across_layers_skipping_meta_and_log() {
1810        let dir = empty_store();
1811        let root = dir.path();
1812        write(
1813            root,
1814            "sources/emails/2026/05/a.md",
1815            &content_md("2026-05-01T00:00:00Z"),
1816        );
1817        write(
1818            root,
1819            "records/contacts/sarah.md",
1820            &content_md("2026-05-02T00:00:00Z"),
1821        );
1822        write(
1823            root,
1824            "records/profiles/sarah.md",
1825            &content_md("2026-05-03T00:00:00Z"),
1826        );
1827        // Things walk() must SKIP:
1828        write(root, "sources/emails/index.md", "---\ntype: index\n---\n"); // catalog
1829        write(root, "index.md", "---\ntype: index\n---\n"); // root catalog
1830        write(root, "log.md", "---\ntype: log\n---\n"); // log
1831        write(root, "log/2026-04.md", "---\ntype: log\n---\n"); // rotated log archive
1832        write(
1833            root,
1834            "sources/.hidden/secret.md",
1835            &content_md("2026-05-09T00:00:00Z"),
1836        ); // hidden dir
1837        write(root, "records/contacts/notes.txt", "not markdown"); // non-md
1838
1839        let store = open(&dir);
1840        let got = rels(&store.walk().unwrap());
1841        assert_eq!(
1842            got,
1843            vec![
1844                "records/contacts/sarah.md".to_string(),
1845                "records/profiles/sarah.md".to_string(),
1846                "sources/emails/2026/05/a.md".to_string(),
1847            ]
1848        );
1849    }
1850
1851    #[test]
1852    fn walk_includes_content_named_log_md_or_db_md_inside_a_layer() {
1853        let dir = empty_store();
1854        let root = dir.path();
1855        // A content file that merely happens to be named log.md / DB.md INSIDE a
1856        // layer is real content — those names are reserved only at the store root.
1857        write(
1858            root,
1859            "records/configs/log.md",
1860            &content_md("2026-05-01T00:00:00Z"),
1861        );
1862        write(
1863            root,
1864            "sources/docs/DB.md",
1865            &content_md("2026-05-02T00:00:00Z"),
1866        );
1867        // The derived catalog twin is still skipped at any depth.
1868        write(root, "records/configs/index.md", "---\ntype: index\n---\n");
1869        let store = open(&dir);
1870        let got = rels(&store.walk().unwrap());
1871        assert!(
1872            got.contains(&"records/configs/log.md".to_string()),
1873            "layer-internal log.md is content: {got:?}"
1874        );
1875        assert!(
1876            got.contains(&"sources/docs/DB.md".to_string()),
1877            "layer-internal DB.md is content: {got:?}"
1878        );
1879        assert!(
1880            !got.iter().any(|p| p.ends_with("index.md")),
1881            "index.md is still skipped: {got:?}"
1882        );
1883    }
1884
1885    #[test]
1886    fn walk_layer_is_scoped() {
1887        let dir = empty_store();
1888        let root = dir.path();
1889        write(
1890            root,
1891            "sources/emails/2026/05/a.md",
1892            &content_md("2026-05-01T00:00:00Z"),
1893        );
1894        write(
1895            root,
1896            "records/contacts/sarah.md",
1897            &content_md("2026-05-02T00:00:00Z"),
1898        );
1899        let store = open(&dir);
1900
1901        assert_eq!(
1902            rels(&store.walk_layer(Layer::Sources).unwrap()),
1903            vec!["sources/emails/2026/05/a.md".to_string()]
1904        );
1905        assert_eq!(
1906            rels(&store.walk_layer(Layer::Records).unwrap()),
1907            vec!["records/contacts/sarah.md".to_string()]
1908        );
1909        // A layer with no directory is empty, not an error: a store with only a
1910        // sources/ tree has no records/ dir, so walking Records is empty.
1911        let only_sources = empty_store();
1912        write(
1913            only_sources.path(),
1914            "sources/emails/2026/05/a.md",
1915            &content_md("2026-05-01T00:00:00Z"),
1916        );
1917        let s2 = open(&only_sources);
1918        assert!(s2.walk_layer(Layer::Records).unwrap().is_empty());
1919    }
1920
1921    #[test]
1922    fn walk_type_folder_recurses_shards_and_accepts_abs_or_rel() {
1923        let dir = empty_store();
1924        let root = dir.path();
1925        write(
1926            root,
1927            "sources/emails/2026/05/a.md",
1928            &content_md("2026-05-01T00:00:00Z"),
1929        );
1930        write(
1931            root,
1932            "sources/emails/2026/06/b.md",
1933            &content_md("2026-06-01T00:00:00Z"),
1934        );
1935        write(root, "sources/emails/index.md", "---\ntype: index\n---\n"); // skipped
1936                                                                           // A different type folder must not leak in.
1937        write(
1938            root,
1939            "sources/docs/2026/05/c.md",
1940            &content_md("2026-05-04T00:00:00Z"),
1941        );
1942        let store = open(&dir);
1943
1944        let expected = vec![
1945            "sources/emails/2026/05/a.md".to_string(),
1946            "sources/emails/2026/06/b.md".to_string(),
1947        ];
1948        // Relative folder arg.
1949        assert_eq!(
1950            rels(&store.walk_type_folder(Path::new("sources/emails")).unwrap()),
1951            expected
1952        );
1953        // Absolute folder arg under the store resolves identically.
1954        assert_eq!(
1955            rels(
1956                &store
1957                    .walk_type_folder(&root.join("sources/emails"))
1958                    .unwrap()
1959            ),
1960            expected
1961        );
1962    }
1963
1964    // ── recent_in_type_folder ────────────────────────────────────────────────
1965
1966    #[test]
1967    fn recent_orders_by_updated_desc_then_path_and_caps() {
1968        let dir = empty_store();
1969        let root = dir.path();
1970        // newest
1971        write(
1972            root,
1973            "records/meetings/2026/05/c.md",
1974            &content_md("2026-05-03T00:00:00Z"),
1975        );
1976        // tie on updated — path asc decides (a before b)
1977        write(
1978            root,
1979            "records/meetings/2026/05/a.md",
1980            &content_md("2026-05-02T00:00:00Z"),
1981        );
1982        write(
1983            root,
1984            "records/meetings/2026/05/b.md",
1985            &content_md("2026-05-02T00:00:00Z"),
1986        );
1987        // oldest
1988        write(
1989            root,
1990            "records/meetings/2026/04/z.md",
1991            &content_md("2026-04-01T00:00:00Z"),
1992        );
1993        let store = open(&dir);
1994
1995        let all = rels(
1996            &store
1997                .recent_in_type_folder(Path::new("records/meetings"), 10)
1998                .unwrap(),
1999        );
2000        assert_eq!(
2001            all,
2002            vec![
2003                "records/meetings/2026/05/c.md".to_string(), // newest
2004                "records/meetings/2026/05/a.md".to_string(), // tie, path asc
2005                "records/meetings/2026/05/b.md".to_string(),
2006                "records/meetings/2026/04/z.md".to_string(), // oldest
2007            ]
2008        );
2009
2010        // Cap takes the n most-recent.
2011        let top2 = rels(
2012            &store
2013                .recent_in_type_folder(Path::new("records/meetings"), 2)
2014                .unwrap(),
2015        );
2016        assert_eq!(
2017            top2,
2018            vec![
2019                "records/meetings/2026/05/c.md".to_string(),
2020                "records/meetings/2026/05/a.md".to_string(),
2021            ]
2022        );
2023    }
2024
2025    #[test]
2026    fn recent_sorts_undated_files_last() {
2027        let dir = empty_store();
2028        let root = dir.path();
2029        write(
2030            root,
2031            "records/contacts/dated.md",
2032            &content_md("2026-05-01T00:00:00Z"),
2033        );
2034        // No `updated` field at all.
2035        write(
2036            root,
2037            "records/contacts/undated.md",
2038            "---\ntype: contact\nsummary: x\n---\nbody\n",
2039        );
2040        let store = open(&dir);
2041        let got = rels(
2042            &store
2043                .recent_in_type_folder(Path::new("records/contacts"), 10)
2044                .unwrap(),
2045        );
2046        assert_eq!(
2047            got,
2048            vec![
2049                "records/contacts/dated.md".to_string(),
2050                "records/contacts/undated.md".to_string(),
2051            ],
2052            "a file with a real `updated` must outrank one with none"
2053        );
2054    }
2055
2056    // ── type_shards ──────────────────────────────────────────────────────────
2057
2058    #[test]
2059    fn type_shards_classification() {
2060        let dir = empty_store();
2061        let store = open(&dir);
2062        for t in [
2063            "email",
2064            "transcript",
2065            "pdf-source",
2066            "expense",
2067            "invoice",
2068            "meeting",
2069            "order",
2070            "ticket",
2071            "transaction",
2072        ] {
2073            assert!(store.type_shards(t), "{t} should shard");
2074        }
2075        for t in [
2076            "contact", "company", "decision", "profile", "index", "log", "db-md", "proposal",
2077        ] {
2078            assert!(!store.type_shards(t), "{t} should stay flat");
2079        }
2080    }
2081
2082    #[test]
2083    fn type_shards_respects_schema_directive_both_directions() {
2084        use crate::parser::{Config, Schema};
2085        let dir = empty_store();
2086        let mut store = open(&dir);
2087        let mut config = Config::default();
2088        // A CUSTOM type (not in the built-in list) opts into date-sharding —
2089        // without the schema override `type_shards` would return false for it.
2090        config.schemas.insert(
2091            "shipment".to_string(),
2092            Schema {
2093                shard: Some(true),
2094                ..Schema::default()
2095            },
2096        );
2097        // A BUILT-IN event type opts OUT (flat) — the override wins over the
2098        // built-in default.
2099        config.schemas.insert(
2100            "expense".to_string(),
2101            Schema {
2102                shard: Some(false),
2103                ..Schema::default()
2104            },
2105        );
2106        // A schema with no `shard:` directive leaves the built-in default intact.
2107        config
2108            .schemas
2109            .insert("meeting".to_string(), Schema::default());
2110        store.config = config;
2111
2112        assert!(
2113            store.type_shards("shipment"),
2114            "custom type with `shard: by-date` must shard"
2115        );
2116        assert!(
2117            !store.type_shards("expense"),
2118            "built-in event type with `shard: flat` must go flat"
2119        );
2120        assert!(
2121            store.type_shards("meeting"),
2122            "schema without a `shard:` directive keeps the built-in default"
2123        );
2124        assert!(
2125            !store.type_shards("contact"),
2126            "unconfigured entity type stays flat"
2127        );
2128    }
2129
2130    // ── year_month_from_str ──────────────────────────────────────────────────
2131
2132    #[test]
2133    fn year_month_from_str_accepts_unpadded_month() {
2134        // A single-digit month shards to the same zero-padded folder as its twin,
2135        // matching the lenient `date`-shape validator (chrono `%Y-%m-%d`).
2136        let ym = year_month_from_str;
2137        assert_eq!(
2138            ym("2026-1-15"),
2139            Some(("2026".to_string(), "01".to_string())),
2140        );
2141        assert_eq!(
2142            ym("2026-01-15"),
2143            Some(("2026".to_string(), "01".to_string())),
2144        );
2145        assert_eq!(
2146            ym("2026-12-5"),
2147            Some(("2026".to_string(), "12".to_string())),
2148        );
2149        assert_eq!(ym("2026-1"), Some(("2026".to_string(), "01".to_string())));
2150        // Full timestamps still parse off the leading date.
2151        assert_eq!(
2152            ym("2026-3-22T10:00:00-07:00"),
2153            Some(("2026".to_string(), "03".to_string())),
2154        );
2155    }
2156
2157    #[test]
2158    fn year_month_from_str_rejects_non_dates() {
2159        // Genuinely non-date input still returns None (behavior unchanged).
2160        assert_eq!(year_month_from_str(""), None);
2161        assert_eq!(year_month_from_str("not-a-date"), None);
2162        assert_eq!(year_month_from_str("2026"), None); // no month part
2163        assert_eq!(year_month_from_str("26-1-15"), None); // year not 4 digits
2164        assert_eq!(year_month_from_str("2026-13-01"), None); // month out of range
2165        assert_eq!(year_month_from_str("2026-0-01"), None); // month zero
2166        assert_eq!(year_month_from_str("2026-001-01"), None); // month over 2 digits
2167        assert_eq!(year_month_from_str("2026-x-01"), None); // non-numeric month
2168        assert_eq!(year_month_from_str("20a6-1-15"), None); // non-numeric year
2169    }
2170
2171    #[test]
2172    fn shard_path_accepts_unpadded_month_same_as_padded() {
2173        // End-to-end: an unpadded `date` shards to its real month, identically to
2174        // its zero-padded twin — not to the `created`-fallback month.
2175        let dir = empty_store();
2176        let store = open(&dir);
2177
2178        let padded = store
2179            .shard_path_for("expense", &fm_with_extra("date", "2026-01-15"), "padded")
2180            .unwrap();
2181        assert_eq!(padded, PathBuf::from("records/expenses/2026/01/padded.md"));
2182
2183        let single = store
2184            .shard_path_for("expense", &fm_with_extra("date", "2026-1-15"), "single")
2185            .unwrap();
2186        assert_eq!(single, PathBuf::from("records/expenses/2026/01/single.md"));
2187    }
2188
2189    // ── shard_path_for ───────────────────────────────────────────────────────
2190
2191    fn fm_with_extra(key: &str, value: &str) -> Frontmatter {
2192        let mut fm = Frontmatter::default();
2193        fm.extra.insert(
2194            key.to_string(),
2195            serde_norway::Value::String(value.to_string()),
2196        );
2197        fm
2198    }
2199
2200    fn fm_with_created(rfc3339: &str) -> Frontmatter {
2201        Frontmatter {
2202            created: Some(DateTime::parse_from_rfc3339(rfc3339).unwrap()),
2203            ..Default::default()
2204        }
2205    }
2206
2207    #[test]
2208    fn shard_path_uses_primary_date_field_per_type() {
2209        let dir = empty_store();
2210        let store = open(&dir);
2211
2212        // expense.date → records/expenses/<YYYY>/<MM>/
2213        let p = store
2214            .shard_path_for("expense", &fm_with_extra("date", "2026-05-22"), "lunch")
2215            .unwrap();
2216        assert_eq!(p, PathBuf::from("records/expenses/2026/05/lunch.md"));
2217
2218        // email.date → sources/emails/<YYYY>/<MM>/
2219        let p = store
2220            .shard_path_for(
2221                "email",
2222                &fm_with_extra("date", "2026-11-02T09:00:00-07:00"),
2223                "e1",
2224            )
2225            .unwrap();
2226        assert_eq!(p, PathBuf::from("sources/emails/2026/11/e1.md"));
2227
2228        // transcript.recorded_at → sources/transcripts/<YYYY>/<MM>/
2229        let p = store
2230            .shard_path_for(
2231                "transcript",
2232                &fm_with_extra("recorded_at", "2025-01-15T12:00:00Z"),
2233                "t1",
2234            )
2235            .unwrap();
2236        assert_eq!(p, PathBuf::from("sources/transcripts/2025/01/t1.md"));
2237    }
2238
2239    #[test]
2240    fn shard_path_falls_back_to_created() {
2241        let dir = empty_store();
2242        let store = open(&dir);
2243        // meeting with no `date` field but a `created` timestamp.
2244        let p = store
2245            .shard_path_for(
2246                "meeting",
2247                &fm_with_created("2024-07-09T08:30:00-04:00"),
2248                "sync",
2249            )
2250            .unwrap();
2251        assert_eq!(p, PathBuf::from("records/meetings/2024/07/sync.md"));
2252    }
2253
2254    #[test]
2255    fn shard_path_primary_field_wins_over_created() {
2256        let dir = empty_store();
2257        let store = open(&dir);
2258        let mut fm = fm_with_created("2020-01-01T00:00:00Z");
2259        fm.extra.insert(
2260            "date".into(),
2261            serde_norway::Value::String("2026-05-22".into()),
2262        );
2263        let p = store.shard_path_for("expense", &fm, "x").unwrap();
2264        // The primary `date` (2026/05), not `created` (2020/01), drives the shard.
2265        assert_eq!(p, PathBuf::from("records/expenses/2026/05/x.md"));
2266    }
2267
2268    #[test]
2269    fn shard_path_flat_types_have_no_shard_segment() {
2270        let dir = empty_store();
2271        let store = open(&dir);
2272        // A contact has a `created` date, but contacts stay flat.
2273        let p = store
2274            .shard_path_for(
2275                "contact",
2276                &fm_with_created("2026-05-22T00:00:00Z"),
2277                "sarah-chen",
2278            )
2279            .unwrap();
2280        assert_eq!(p, PathBuf::from("records/contacts/sarah-chen.md"));
2281
2282        // A conclusion `profile` is a custom (non-built-in) type: it is flat (no
2283        // date shard) and lands under the records-layer fallback folder
2284        // `records/<type>` — `records/profile/<name>.md`, a conforming 3-component
2285        // `<layer>/<type-folder>/<file>` path. A 2-component path would be
2286        // invisible to the index/validate type-folder model.
2287        let p = store
2288            .shard_path_for("profile", &Frontmatter::default(), "renewal-theme")
2289            .unwrap();
2290        assert_eq!(p, PathBuf::from("records/profile/renewal-theme.md"));
2291    }
2292
2293    /// Regression: a type written through the toolkit's own path computation
2294    /// must land at a path the index + validate type-folder model accepts. A
2295    /// 2-component `<layer>/<file>` path is one `type_folder_of` (in both `index`
2296    /// and `validate`) treats as "no type-folder" — it would either crash
2297    /// `Index::on_write` (it tried to create `index.md` inside a file) or be
2298    /// silently dropped from every catalog by `Index::rebuild_all`. A custom
2299    /// (non-built-in) type like a conclusion `profile` falls back to
2300    /// `records/<type>` — still a conforming 3-component
2301    /// `<layer>/<type-folder>/<file>` path.
2302    #[test]
2303    fn shard_path_custom_type_is_indexable_three_component_path() {
2304        let dir = empty_store();
2305        let store = open(&dir);
2306        let p = store
2307            .shard_path_for("profile", &Frontmatter::default(), "renewal-theme")
2308            .unwrap();
2309        // First two components are a layer + a non-empty type-folder segment;
2310        // the file is the third. This is exactly the shape `type_folder_of`
2311        // (`comps.len() >= 3`, `comps[0]` a known layer) requires.
2312        let comps: Vec<&str> = p.iter().filter_map(|c| c.to_str()).collect();
2313        assert_eq!(
2314            comps.len(),
2315            3,
2316            "custom-type path must be <layer>/<type-folder>/<file>, got {p:?}"
2317        );
2318        assert_eq!(
2319            comps[0], "records",
2320            "first component must be the records layer (a custom type is \
2321             filed under the records fallback)"
2322        );
2323        assert!(
2324            !comps[1].is_empty() && comps[1] != "renewal-theme.md",
2325            "second component must be a real type-folder, not the file: {p:?}"
2326        );
2327        assert!(
2328            comps[2].ends_with(".md"),
2329            "third component must be the .md file: {p:?}"
2330        );
2331    }
2332
2333    #[test]
2334    fn shard_path_preserves_and_adds_md_extension() {
2335        let dir = empty_store();
2336        let store = open(&dir);
2337        let with = store
2338            .shard_path_for("contact", &Frontmatter::default(), "sarah.md")
2339            .unwrap();
2340        let without = store
2341            .shard_path_for("contact", &Frontmatter::default(), "sarah")
2342            .unwrap();
2343        assert_eq!(with, PathBuf::from("records/contacts/sarah.md"));
2344        assert_eq!(without, PathBuf::from("records/contacts/sarah.md"));
2345    }
2346
2347    #[test]
2348    fn shard_path_errors_when_sharding_type_has_no_date() {
2349        let dir = empty_store();
2350        let store = open(&dir);
2351        // expense shards, but no `date` and no `created` → NoShardDate.
2352        let err = store
2353            .shard_path_for("expense", &Frontmatter::default(), "mystery")
2354            .unwrap_err();
2355        match err {
2356            StoreError::NoShardDate { file } => {
2357                assert_eq!(file, PathBuf::from("records/expenses/mystery.md"));
2358            }
2359            other => panic!("expected NoShardDate, got {other:?}"),
2360        }
2361    }
2362
2363    // ── find_links_to ────────────────────────────────────────────────────────
2364
2365    #[test]
2366    fn find_links_to_matches_all_accepted_spellings() {
2367        let dir = empty_store();
2368        let root = dir.path();
2369        let target = "records/contacts/sarah-chen";
2370
2371        // Plain link.
2372        write(
2373            root,
2374            "records/profiles/sarah.md",
2375            &format!(
2376                "---\ntype: profile\nmeta-type: conclusion\nsummary: s\n---\nSee [[{target}]].\n"
2377            ),
2378        );
2379        // Link with display text.
2380        write(
2381            root,
2382            "records/meetings/2026/05/m.md",
2383            &format!("---\ntype: meeting\nsummary: s\n---\nWith [[{target}|Sarah]].\n"),
2384        );
2385        // Link with .md extension (accepted, warned by validate).
2386        write(
2387            root,
2388            "records/concepts/t.md",
2389            &format!(
2390                "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n[[{target}.md]]\n"
2391            ),
2392        );
2393        // A catalog/index file also contains the link literally — included.
2394        write(
2395            root,
2396            "records/contacts/index.md",
2397            &format!("---\ntype: index\n---\n- [[{target}]] — Sarah\n"),
2398        );
2399        // No link to the target.
2400        write(
2401            root,
2402            "records/profiles/elena.md",
2403            "---\ntype: profile\nmeta-type: conclusion\nsummary: s\n---\nNo links here.\n",
2404        );
2405        // Short-form link must NOT match the full-path target.
2406        write(
2407            root,
2408            "records/profiles/bob.md",
2409            "---\ntype: profile\nmeta-type: conclusion\nsummary: s\n---\n[[sarah-chen]]\n",
2410        );
2411        // A longer path that merely starts with the target must NOT match
2412        // (boundary correctness): target `sarah-chen` vs `sarah-chen-jr`.
2413        write(
2414            root,
2415            "records/profiles/jr.md",
2416            &format!(
2417                "---\ntype: profile\nmeta-type: conclusion\nsummary: s\n---\n[[{target}-jr]]\n"
2418            ),
2419        );
2420
2421        let store = open(&dir);
2422        let got = rels(&store.find_links_to(Path::new(target)).unwrap());
2423        assert_eq!(
2424            got,
2425            vec![
2426                "records/concepts/t.md".to_string(),
2427                "records/contacts/index.md".to_string(),
2428                "records/meetings/2026/05/m.md".to_string(),
2429                "records/profiles/sarah.md".to_string(),
2430            ]
2431        );
2432    }
2433
2434    #[test]
2435    fn find_links_to_distinguishes_sibling_paths() {
2436        // Two contacts whose paths share a prefix; a link to one must not be
2437        // reported as a link to the other.
2438        let dir = empty_store();
2439        let root = dir.path();
2440        write(
2441            root,
2442            "records/concepts/a.md",
2443            "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n[[records/contacts/sarah]]\n",
2444        );
2445        write(
2446            root,
2447            "records/concepts/b.md",
2448            "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
2449        );
2450        let store = open(&dir);
2451
2452        assert_eq!(
2453            rels(
2454                &store
2455                    .find_links_to(Path::new("records/contacts/sarah"))
2456                    .unwrap()
2457            ),
2458            vec!["records/concepts/a.md".to_string()]
2459        );
2460        assert_eq!(
2461            rels(
2462                &store
2463                    .find_links_to(Path::new("records/contacts/sarah-chen"))
2464                    .unwrap()
2465            ),
2466            vec!["records/concepts/b.md".to_string()]
2467        );
2468    }
2469
2470    #[test]
2471    fn regression_find_links_to_tolerates_invalid_utf8_on_a_matched_line() {
2472        // Regression: a `.md` file can carry a stray non-UTF-8 byte on the SAME
2473        // line as a `[[target]]` link (a verbatim-ingested `sources/` artifact,
2474        // e.g. a mis-decoded Latin-1 import). The scan must still report the
2475        // link — `find_links_to` / `find_links_to_any` (and `graph backlinks` +
2476        // the working-set validate incoming-linker pass) must not error out and
2477        // drop the legitimate UTF-8 linkers. The content scan reads the file
2478        // with `String::from_utf8_lossy`, so the invalid byte becomes a
2479        // replacement char and the ASCII `[[target]]` link is still extracted.
2480        let dir = empty_store();
2481        let root = dir.path();
2482        let target = "records/contacts/sarah-chen";
2483
2484        // A clean, fully-UTF-8 linker that MUST be returned regardless.
2485        write(
2486            root,
2487            "records/profiles/clean.md",
2488            &format!(
2489                "---\ntype: profile\nmeta-type: conclusion\nsummary: s\n---\nSee [[{target}]].\n"
2490            ),
2491        );
2492
2493        // A linker whose link line ALSO carries a stray 0xFF byte (a mis-decoded
2494        // Latin-1 import). Write raw bytes so the invalid byte survives — a
2495        // `&str` fixture could not express it. The byte-level regex still
2496        // matches `[[target]]` on this line; pre-fix the UTF8 sink aborted here.
2497        let mut bytes: Vec<u8> =
2498            b"---\ntype: email\nsummary: s\n---\nSee [[records/contacts/sarah-chen]] \xFF here\n"
2499                .to_vec();
2500        let dirty_abs = root.join("sources/emails/2026/05/raw.md");
2501        fs::create_dir_all(dirty_abs.parent().unwrap()).unwrap();
2502        fs::write(&dirty_abs, &bytes).unwrap();
2503        // Defensive: confirm the fixture really is invalid UTF-8 (so the test
2504        // exercises the bug, not a coincidentally-valid file).
2505        assert!(
2506            std::str::from_utf8(&bytes).is_err(),
2507            "fixture must contain invalid UTF-8 to exercise the regression"
2508        );
2509        bytes.clear();
2510
2511        let store = open(&dir);
2512        let got = rels(
2513            &store
2514                .find_links_to(Path::new(target))
2515                .expect("a stray non-UTF-8 byte must not abort the backlink scan"),
2516        );
2517        assert_eq!(
2518            got,
2519            vec![
2520                "records/profiles/clean.md".to_string(),
2521                "sources/emails/2026/05/raw.md".to_string(),
2522            ],
2523            "both the clean linker and the one with an invalid byte on the link \
2524             line are reported; the scan degrades, it does not fail"
2525        );
2526    }
2527
2528    // ── find_links_to_any (batch — the O(changed × store) fix) ─────────────────
2529
2530    /// The working-set validate's incoming-linker discovery runs through
2531    /// `find_links_to_any` over the WHOLE changed set in one pass. This pins the
2532    /// batch contract that makes that single-pass behavior correct: the result is
2533    /// the union of incoming linkers across every target, with per-target
2534    /// boundary correctness preserved (no alternation arm bleeds into a
2535    /// prefix-sharing sibling). If a regression reverts the batch finder to a
2536    /// per-object loop, the union below would still hold — but the boundary +
2537    /// union-equivalence assertions are what guard the *correctness* of folding N
2538    /// scans into one regex.
2539    #[test]
2540    fn find_links_to_any_returns_the_union_with_boundary_correctness() {
2541        let dir = empty_store();
2542        let root = dir.path();
2543
2544        // Two distinct targets, each with its own linker.
2545        write(
2546            root,
2547            "records/concepts/links-sarah.md",
2548            "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
2549        );
2550        write(
2551            root,
2552            "records/concepts/links-acme.md",
2553            "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\nDeal with [[records/companies/acme|Acme]].\n",
2554        );
2555        // One file links to BOTH targets — must appear exactly once (deduped),
2556        // proving the per-file early-exit folds multiple-target hits into a
2557        // single result row rather than one row per matched target.
2558        write(
2559            root,
2560            "records/meetings/2026/05/m.md",
2561            "---\ntype: meeting\nsummary: s\n---\n[[records/contacts/sarah-chen]] re \
2562             [[records/companies/acme]]\n",
2563        );
2564        // A prefix-sharing sibling of a target: a link to `sarah-chen-jr` must NOT
2565        // be reported as a link to `sarah-chen` even though the alternation now
2566        // carries `sarah-chen` as one arm.
2567        write(
2568            root,
2569            "records/concepts/links-jr.md",
2570            "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n[[records/contacts/sarah-chen-jr]]\n",
2571        );
2572        // A file that links to neither requested target.
2573        write(
2574            root,
2575            "records/concepts/unrelated.md",
2576            "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n[[records/concepts/spend]]\n",
2577        );
2578
2579        let store = open(&dir);
2580        let targets = vec![
2581            PathBuf::from("records/contacts/sarah-chen"),
2582            PathBuf::from("records/companies/acme"),
2583        ];
2584
2585        let got = rels(&store.find_links_to_any(&targets).unwrap());
2586        assert_eq!(
2587            got,
2588            vec![
2589                "records/concepts/links-acme.md".to_string(),
2590                "records/concepts/links-sarah.md".to_string(),
2591                "records/meetings/2026/05/m.md".to_string(),
2592            ],
2593            "batch finder must return the deduped union of linkers across all \
2594             targets, excluding the prefix-sibling and the unrelated file"
2595        );
2596
2597        // Equivalence: the batch result must equal the union of the per-target
2598        // single finder. This is the property the working-set path relies on
2599        // when it folds one-scan-per-object into one scan for the whole set.
2600        let mut union: std::collections::BTreeSet<PathBuf> = std::collections::BTreeSet::new();
2601        for t in &targets {
2602            for linker in store.find_links_to(t).unwrap() {
2603                union.insert(linker);
2604            }
2605        }
2606        assert_eq!(
2607            rels(&union.into_iter().collect::<Vec<_>>()),
2608            got,
2609            "find_links_to_any must equal the union of per-target find_links_to"
2610        );
2611    }
2612
2613    /// An empty target set must scan nothing and find nothing — and crucially
2614    /// must NOT compile to a match-everything empty regex (which would report
2615    /// every `.md` as a linker). This is the empty-working-set fast path the
2616    /// `validate` loop hits when nothing changed.
2617    #[test]
2618    fn find_links_to_any_empty_targets_matches_nothing() {
2619        let dir = empty_store();
2620        let root = dir.path();
2621        write(
2622            root,
2623            "records/concepts/a.md",
2624            "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
2625        );
2626        let store = open(&dir);
2627
2628        assert!(
2629            store.find_links_to_any(&[]).unwrap().is_empty(),
2630            "no targets ⇒ no linkers (an empty pattern must not match every file)"
2631        );
2632        // A set of only empty/non-link targets is likewise a no-op, not a
2633        // match-everything.
2634        assert!(
2635            store
2636                .find_links_to_any(&[PathBuf::from(""), PathBuf::from("./")])
2637                .unwrap()
2638                .is_empty(),
2639            "targets that render to empty link text contribute no alternation arm"
2640        );
2641    }
2642
2643    // ── read_type_index ──────────────────────────────────────────────────────
2644
2645    #[test]
2646    fn read_type_index_parses_records_and_flattens_fields() {
2647        let dir = empty_store();
2648        let root = dir.path();
2649        let jsonl = "\
2650{\"path\":\"records/expenses/2026/05/a.md\",\"type\":\"expense\",\"summary\":\"lunch\",\"tags\":[\"meals\"],\"links\":[\"records/companies/acme\"],\"created\":\"2026-05-01T00:00:00Z\",\"updated\":\"2026-05-01T00:00:00Z\",\"vendor\":\"acme\",\"amount\":42}
2651{\"path\":\"records/expenses/2026/05/b.md\",\"type\":\"expense\",\"summary\":\"taxi\",\"created\":null,\"updated\":null,\"vendor\":\"yellow\"}
2652";
2653        let p = write(root, "records/expenses/index.jsonl", jsonl);
2654        let store = open(&dir);
2655        let recs = store.read_type_index(&store.abs_path(&p)).unwrap();
2656
2657        assert_eq!(recs.len(), 2);
2658        // Sorted by path asc.
2659        assert_eq!(recs[0].path, PathBuf::from("records/expenses/2026/05/a.md"));
2660        assert_eq!(recs[0].type_, "expense");
2661        assert_eq!(recs[0].summary, "lunch");
2662        assert_eq!(recs[0].tags, vec!["meals".to_string()]);
2663        assert_eq!(recs[0].links, vec!["records/companies/acme".to_string()]);
2664        assert!(recs[0].created.is_some());
2665        // Extra (non-typed) frontmatter flattens into `fields`.
2666        assert_eq!(
2667            recs[0].fields.get("vendor"),
2668            Some(&serde_json::json!("acme"))
2669        );
2670        assert_eq!(recs[0].fields.get("amount"), Some(&serde_json::json!(42)));
2671        // Defaults: missing tags/links → empty.
2672        assert!(recs[1].tags.is_empty());
2673        assert!(recs[1].links.is_empty());
2674    }
2675
2676    #[test]
2677    fn read_type_index_last_write_wins_and_skips_blanks() {
2678        let dir = empty_store();
2679        let root = dir.path();
2680        // Same path twice; the second line supersedes the first. A blank line
2681        // in between must be ignored, not error.
2682        let jsonl = "\
2683{\"path\":\"records/contacts/sarah.md\",\"type\":\"contact\",\"summary\":\"old\",\"created\":null,\"updated\":null}
2684
2685{\"path\":\"records/contacts/sarah.md\",\"type\":\"contact\",\"summary\":\"new\",\"created\":null,\"updated\":null}
2686";
2687        let p = write(root, "records/contacts/index.jsonl", jsonl);
2688        let store = open(&dir);
2689        let recs = store.read_type_index(&store.abs_path(&p)).unwrap();
2690        assert_eq!(recs.len(), 1, "duplicate path collapses to one record");
2691        assert_eq!(recs[0].summary, "new", "later line must win");
2692    }
2693
2694    #[test]
2695    fn read_type_index_errors_on_malformed_line() {
2696        let dir = empty_store();
2697        let root = dir.path();
2698        let p = write(root, "records/contacts/index.jsonl", "{not valid json}\n");
2699        let store = open(&dir);
2700        let err = store.read_type_index(&store.abs_path(&p)).unwrap_err();
2701        assert!(matches!(err, StoreError::BadTypeIndex { .. }));
2702    }
2703
2704    // ── find_by_type / find_by_where ─────────────────────────────────────────
2705
2706    fn jsonl_line(path: &str, type_: &str, summary: &str, extra: &str) -> String {
2707        format!(
2708            "{{\"path\":\"{path}\",\"type\":\"{type_}\",\"summary\":\"{summary}\",\"created\":null,\"updated\":null{extra}}}\n"
2709        )
2710    }
2711
2712    #[test]
2713    fn find_by_type_reads_canonical_folder_sidecar() {
2714        let dir = empty_store();
2715        let root = dir.path();
2716        // Canonical folder for `contact` is records/contacts.
2717        write(
2718            root,
2719            "records/contacts/index.jsonl",
2720            &(jsonl_line("records/contacts/sarah.md", "contact", "Sarah", "")
2721                + &jsonl_line("records/contacts/elena.md", "contact", "Elena", "")),
2722        );
2723        // A different type's sidecar must not leak into a contact query.
2724        write(
2725            root,
2726            "records/companies/index.jsonl",
2727            &jsonl_line("records/companies/acme.md", "company", "Acme", ""),
2728        );
2729        let store = open(&dir);
2730        let recs = store.find_by_type("contact").unwrap();
2731        let names: Vec<_> = recs.iter().map(|r| r.summary.clone()).collect();
2732        assert_eq!(names, vec!["Elena".to_string(), "Sarah".to_string()]); // path-sorted
2733        assert!(recs.iter().all(|r| r.type_ == "contact"));
2734    }
2735
2736    #[test]
2737    fn regression_find_by_type_includes_non_canonical_folder_when_canonical_exists() {
2738        // Regression for the silent-incompleteness bug: once the canonical
2739        // type-folder sidecar exists, `find_by_type` used to read ONLY that
2740        // sidecar and drop same-type records filed in a non-canonical folder in
2741        // the SAME layer — so the result flipped to incomplete the moment a
2742        // canonical record was added. The write path actively enables such a
2743        // layout (`records/clients/` for a `contact`, any `records/<folder>/`
2744        // for a conclusion `profile`), so this is a reachable, dedup-breaking
2745        // omission.
2746        let dir = empty_store();
2747        let root = dir.path();
2748
2749        // CANONICAL folder sidecar exists (`records/contacts/` for `contact`),
2750        // which is exactly the condition that triggered the bug.
2751        write(
2752            root,
2753            "records/contacts/index.jsonl",
2754            &jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
2755        );
2756        // A `contact` filed in a NON-canonical folder within the same (Records)
2757        // layer. Pre-fix this was silently dropped because the canonical
2758        // sidecar existed; it must now come back.
2759        write(
2760            root,
2761            "records/clients/index.jsonl",
2762            &jsonl_line("records/clients/elena.md", "contact", "Elena", ""),
2763        );
2764        // A different type in the same layer must NOT leak in (proves the read
2765        // is type-filtered, not just a blind whole-layer dump).
2766        write(
2767            root,
2768            "records/companies/index.jsonl",
2769            &jsonl_line("records/companies/acme.md", "company", "Acme", ""),
2770        );
2771
2772        let store = open(&dir);
2773        let got: std::collections::BTreeSet<String> = store
2774            .find_by_type("contact")
2775            .unwrap()
2776            .into_iter()
2777            .map(|r| r.path.to_string_lossy().into_owned())
2778            .collect();
2779        assert_eq!(
2780            got,
2781            ["records/clients/elena.md", "records/contacts/sarah.md"]
2782                .into_iter()
2783                .map(String::from)
2784                .collect::<std::collections::BTreeSet<_>>(),
2785            "both the canonical-folder and the non-canonical-folder contact must \
2786             be returned; the company record must be excluded"
2787        );
2788    }
2789
2790    #[test]
2791    fn regression_find_by_type_profile_spans_multiple_topic_folders() {
2792        // Regression for the scoped-backlinks variant of the same bug
2793        // (`graph backlinks --type <conclusion-type>`): a conclusion type like
2794        // `profile` has the canonical fallback folder `records/profile`, but the
2795        // agent may file profiles under ANY records topic folder
2796        // (`records/people/`, `records/clients/`, …). With a
2797        // `records/profile/index.jsonl` present, the old code read only that
2798        // folder and dropped profiles in the other topic folders —
2799        // under-reporting dependents in a blast-radius check. The
2800        // whole-`records/`-layer read must surface all of them.
2801        let dir = empty_store();
2802        let root = dir.path();
2803        write(
2804            root,
2805            "records/profile/index.jsonl",
2806            &jsonl_line("records/profile/billing.md", "profile", "Billing", ""),
2807        );
2808        write(
2809            root,
2810            "records/people/index.jsonl",
2811            &jsonl_line("records/people/sarah-chen.md", "profile", "Sarah Chen", ""),
2812        );
2813        write(
2814            root,
2815            "records/clients/index.jsonl",
2816            &jsonl_line("records/clients/atlas.md", "profile", "Atlas", ""),
2817        );
2818
2819        let store = open(&dir);
2820        let got: std::collections::BTreeSet<String> = store
2821            .find_by_type("profile")
2822            .unwrap()
2823            .into_iter()
2824            .map(|r| r.path.to_string_lossy().into_owned())
2825            .collect();
2826        assert_eq!(
2827            got,
2828            [
2829                "records/clients/atlas.md",
2830                "records/people/sarah-chen.md",
2831                "records/profile/billing.md",
2832            ]
2833            .into_iter()
2834            .map(String::from)
2835            .collect::<std::collections::BTreeSet<_>>(),
2836            "a profile query must return records from every topic folder, not \
2837             just the canonical records/profile/"
2838        );
2839    }
2840
2841    #[test]
2842    fn find_by_type_canonical_absent_falls_back_within_the_layer_only() {
2843        let dir = empty_store();
2844        let root = dir.path();
2845        // A custom `proposal` record filed under a non-canonical folder NAME
2846        // (the natural plural `records/proposals/`) inside the records layer.
2847        // `default_type_folder("proposal")` = `records/proposal` (bare type, no
2848        // pluralization guess), so the canonical sidecar does not exist and
2849        // `find_by_type` falls back. The fallback is bounded to the type's
2850        // layer (records), so this record — same layer, non-canonical folder —
2851        // is still found: completeness within the layer holds.
2852        write(
2853            root,
2854            "records/proposals/index.jsonl",
2855            &jsonl_line("records/proposals/p1.md", "proposal", "Q3 proposal", ""),
2856        );
2857        // A DECOY of the SAME type sitting in a DIFFERENT layer (sources/). The
2858        // old whole-store fallback read every sidecar in the store and would
2859        // have leaked this into the result; the layer-bounded fallback must not.
2860        // It also pins that the fallback is O(entities-in-layer), never O(store).
2861        write(
2862            root,
2863            "sources/proposals/index.jsonl",
2864            &jsonl_line(
2865                "sources/proposals/leak.md",
2866                "proposal",
2867                "cross-layer decoy",
2868                "",
2869            ),
2870        );
2871        let store = open(&dir);
2872        let recs = store.find_by_type("proposal").unwrap();
2873        assert_eq!(
2874            recs.len(),
2875            1,
2876            "only the records-layer proposal, not the sources decoy"
2877        );
2878        assert_eq!(recs[0].summary, "Q3 proposal");
2879        assert_eq!(recs[0].path, PathBuf::from("records/proposals/p1.md"));
2880    }
2881
2882    #[test]
2883    fn find_by_type_canonical_absent_does_not_read_other_layers() {
2884        let dir = empty_store();
2885        let root = dir.path();
2886        // `email`'s canonical folder is `sources/emails` (layer Sources). No
2887        // sidecar there yet, so `find_by_type("email")` falls back — but only
2888        // within the Sources layer. A populated sidecar in the Records layer
2889        // must never be touched: the fallback is layer-bounded, not store-wide.
2890        // Under the old `read_all_type_indexes_in(None)` fallback this records
2891        // sidecar would have been read and filtered (wasted O(store) I/O); now
2892        // it is outside the walk root entirely.
2893        write(
2894            root,
2895            "records/contacts/index.jsonl",
2896            &jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
2897        );
2898        let store = open(&dir);
2899        // No email anywhere ⇒ empty, and the records layer was not in scope.
2900        assert!(store.find_by_type("email").unwrap().is_empty());
2901    }
2902
2903    #[test]
2904    fn find_by_where_matches_typed_columns_and_flat_fields() {
2905        let dir = empty_store();
2906        let root = dir.path();
2907        write(
2908            root,
2909            "records/expenses/index.jsonl",
2910            &(jsonl_line(
2911                "records/expenses/a.md",
2912                "expense",
2913                "lunch",
2914                ",\"vendor\":\"acme\",\"tags\":[\"meals\"]",
2915            ) + &jsonl_line(
2916                "records/expenses/b.md",
2917                "expense",
2918                "taxi",
2919                ",\"vendor\":\"yellow\"",
2920            )),
2921        );
2922        write(
2923            root,
2924            "records/contacts/index.jsonl",
2925            &jsonl_line(
2926                "records/contacts/sarah.md",
2927                "contact",
2928                "Sarah",
2929                ",\"tags\":[\"customer\"]",
2930            ),
2931        );
2932        let store = open(&dir);
2933
2934        // Flat field in `fields`.
2935        let by_vendor = store.find_by_where("vendor", "acme").unwrap();
2936        assert_eq!(by_vendor.len(), 1);
2937        assert_eq!(by_vendor[0].path, PathBuf::from("records/expenses/a.md"));
2938
2939        // Typed column: type (spans both expense records).
2940        assert_eq!(store.find_by_where("type", "expense").unwrap().len(), 2);
2941
2942        // Typed list column: tags membership.
2943        let customers = store.find_by_where("tags", "customer").unwrap();
2944        assert_eq!(customers.len(), 1);
2945        assert_eq!(
2946            customers[0].path,
2947            PathBuf::from("records/contacts/sarah.md")
2948        );
2949
2950        // No match → empty.
2951        assert!(store.find_by_where("vendor", "nobody").unwrap().is_empty());
2952    }
2953
2954    #[test]
2955    fn find_by_where_matches_timestamps_across_rfc3339_spellings() {
2956        let dir = empty_store();
2957        let root = dir.path();
2958        // db.md files most commonly carry the `Z` UTC spelling. The index.jsonl
2959        // serialized from such a file preserves it verbatim.
2960        write(
2961            root,
2962            "records/meetings/index.jsonl",
2963            "{\"path\":\"records/meetings/kickoff.md\",\"type\":\"meeting\",\
2964\"summary\":\"kickoff\",\"created\":\"2026-05-01T00:00:00Z\",\
2965\"updated\":\"2026-05-02T09:30:00-07:00\"}\n",
2966        );
2967        let store = open(&dir);
2968
2969        // The exact value an agent reads out of the file (`Z` form) must match.
2970        let by_z = store
2971            .find_by_where("created", "2026-05-01T00:00:00Z")
2972            .unwrap();
2973        assert_eq!(by_z.len(), 1);
2974        assert_eq!(by_z[0].path, PathBuf::from("records/meetings/kickoff.md"));
2975
2976        // The equivalent explicit-offset spelling of the same instant matches too.
2977        assert_eq!(
2978            store
2979                .find_by_where("created", "2026-05-01T00:00:00+00:00")
2980                .unwrap()
2981                .len(),
2982            1
2983        );
2984
2985        // A non-UTC stored value matches both its own offset spelling and the
2986        // same instant expressed as `Z` (instant comparison, not string compare).
2987        assert_eq!(
2988            store
2989                .find_by_where("updated", "2026-05-02T09:30:00-07:00")
2990                .unwrap()
2991                .len(),
2992            1
2993        );
2994        assert_eq!(
2995            store
2996                .find_by_where("updated", "2026-05-02T16:30:00Z")
2997                .unwrap()
2998                .len(),
2999            1
3000        );
3001
3002        // A different instant does not match.
3003        assert!(store
3004            .find_by_where("created", "2026-05-01T00:00:01Z")
3005            .unwrap()
3006            .is_empty());
3007        // A non-RFC3339 query value never matches a real timestamp.
3008        assert!(store
3009            .find_by_where("created", "2026-05-01")
3010            .unwrap()
3011            .is_empty());
3012    }
3013
3014    #[test]
3015    fn find_by_where_matches_floats_across_serialized_spellings() {
3016        // Adversarial review #5: a float field is stored in index.jsonl via
3017        // serde_json's canonical f64 render, which DISCARDS the file's source
3018        // spelling (`1234.00` -> `1234.0`, `1e3` -> `1000.0`). A textual compare
3019        // made the spelling a human reads in the file miss (and disagree with
3020        // free-text `search`); numeric compare fixes it. `query`
3021        // is the SPEC pre-write dedup primitive, so a miss here silently writes a
3022        // duplicate record.
3023        let dir = empty_store();
3024        let root = dir.path();
3025        write(
3026            root,
3027            "records/invoices/index.jsonl",
3028            "{\"path\":\"records/invoices/inv.md\",\"type\":\"invoice\",\
3029\"summary\":\"inv\",\"amount\":1234.0,\"score\":1000.0,\"count\":42}\n",
3030        );
3031        let store = open(&dir);
3032
3033        // Every spelling of the same numeric value matches the canonical-f64 store.
3034        for spelling in ["1234.00", "1234.0", "1234"] {
3035            assert_eq!(
3036                store.find_by_where("amount", spelling).unwrap().len(),
3037                1,
3038                "amount spelling `{spelling}` must match the stored 1234.0"
3039            );
3040        }
3041        for spelling in ["1e3", "1000", "1000.0"] {
3042            assert_eq!(
3043                store.find_by_where("score", spelling).unwrap().len(),
3044                1,
3045                "score spelling `{spelling}` must match the stored 1000.0"
3046            );
3047        }
3048        // A genuinely different value does not match.
3049        assert!(store.find_by_where("amount", "1234.5").unwrap().is_empty());
3050        // Integer fields keep exact textual matching (unaffected by the fix).
3051        assert_eq!(store.find_by_where("count", "42").unwrap().len(), 1);
3052    }
3053
3054    #[test]
3055    fn number_matches_is_numeric_for_floats_but_exact_for_integers() {
3056        use serde_json::Number;
3057        // Float-valued field: any equal spelling matches (the bug fix).
3058        let f: Number = serde_json::from_str("1234.0").unwrap();
3059        assert!(number_matches(&f, "1234.00"));
3060        assert!(number_matches(&f, "1234"));
3061        assert!(number_matches(&f, "1234.0"));
3062        assert!(!number_matches(&f, "1234.5"));
3063        // Integer-valued field: EXACT textual compare, never f64-rounded — two
3064        // adjacent large integers that round to the same f64 must NOT collide
3065        // (the safety property that motivates restricting numeric compare to
3066        // floats).
3067        let big: Number = serde_json::from_str("18446744073709551615").unwrap(); // u64::MAX
3068        assert!(number_matches(&big, "18446744073709551615"));
3069        assert!(!number_matches(&big, "18446744073709551614"));
3070    }
3071
3072    #[test]
3073    fn find_by_where_in_layer_reads_only_that_layers_sidecars() {
3074        // The O(entities-in-layer) contract: a layer-scoped where read must walk
3075        // ONLY the named layer's subtree. Proven structurally — a *malformed*
3076        // sidecar in another layer would make `read_type_index` error if it were
3077        // read, so a scoped read that succeeds (and excludes that record) is
3078        // proof the other layer's I/O never happened.
3079        let dir = empty_store();
3080        let root = dir.path();
3081        write(
3082            root,
3083            "records/companies/index.jsonl",
3084            &jsonl_line(
3085                "records/companies/acme.md",
3086                "company",
3087                "Acme",
3088                ",\"domain\":\"acme.com\"",
3089            ),
3090        );
3091        // Same field/value in the sources layer — but the sidecar is corrupt.
3092        write(
3093            root,
3094            "sources/emails/index.jsonl",
3095            "{ this is not valid json and would error if read }\n",
3096        );
3097        let store = open(&dir);
3098
3099        // Scoped to records: the corrupt sources sidecar is out of scope, so the
3100        // read succeeds and returns only the records-layer match.
3101        let in_records = store
3102            .find_by_where_in("domain", "acme.com", Some(Layer::Records))
3103            .expect("a records-scoped read must not touch the sources sidecar");
3104        assert_eq!(
3105            rels(
3106                &in_records
3107                    .iter()
3108                    .map(|r| r.path.clone())
3109                    .collect::<Vec<_>>()
3110            ),
3111            vec!["records/companies/acme.md".to_string()]
3112        );
3113
3114        // The store-wide read DOES reach the corrupt sidecar and surfaces it as
3115        // a parse error — confirming the corrupt file is genuinely in the tree
3116        // and that only the layer scope spares it.
3117        let store_wide = store.find_by_where("domain", "acme.com");
3118        assert!(
3119            matches!(store_wide, Err(StoreError::BadTypeIndex { .. })),
3120            "unscoped read walks every layer and hits the corrupt sidecar"
3121        );
3122
3123        // Scoping to the layer that holds only the corrupt sidecar still errors
3124        // (the scope includes it), proving the scope is a real subtree bound and
3125        // not a silent "skip anything that fails".
3126        let in_sources = store.find_by_where_in("domain", "acme.com", Some(Layer::Sources));
3127        assert!(matches!(in_sources, Err(StoreError::BadTypeIndex { .. })));
3128    }
3129
3130    #[test]
3131    fn find_by_where_in_missing_layer_is_empty_not_an_error() {
3132        // A layer-scoped read over a layer folder that does not exist yet must
3133        // return empty (mirrors `walk_layer`'s missing-dir guard), never a walk
3134        // error from `ignore` over a nonexistent path.
3135        let dir = empty_store();
3136        let root = dir.path();
3137        write(
3138            root,
3139            "records/contacts/index.jsonl",
3140            &jsonl_line(
3141                "records/contacts/sarah.md",
3142                "contact",
3143                "Sarah",
3144                ",\"city\":\"denver\"",
3145            ),
3146        );
3147        let store = open(&dir);
3148
3149        // `sources/` was never created.
3150        let in_sources = store
3151            .find_by_where_in("city", "denver", Some(Layer::Sources))
3152            .expect("missing layer subtree is empty, not an error");
3153        assert!(in_sources.is_empty());
3154
3155        // Same query scoped to the layer that has the record still finds it.
3156        let in_records = store
3157            .find_by_where_in("city", "denver", Some(Layer::Records))
3158            .unwrap();
3159        assert_eq!(in_records.len(), 1);
3160    }
3161
3162    // ── abs_path / rel_path ──────────────────────────────────────────────────
3163
3164    #[test]
3165    fn abs_and_rel_path_roundtrip() {
3166        let dir = empty_store();
3167        let store = open(&dir);
3168        let rel = Path::new("records/contacts/sarah.md");
3169        let abs = store.abs_path(rel);
3170        assert_eq!(abs, dir.path().join(rel));
3171        assert_eq!(store.rel_path(&abs).as_deref(), Some(rel));
3172
3173        // An absolute path is passed through unchanged by abs_path.
3174        assert_eq!(store.abs_path(&abs), abs);
3175
3176        // A path outside the store has no store-relative form.
3177        assert_eq!(store.rel_path(Path::new("/somewhere/else.md")), None);
3178    }
3179
3180    // ── infer_type_from_path (inverse of default_type_folder) ────────────────
3181
3182    #[test]
3183    fn infer_type_maps_every_recognized_folder_back_to_its_type() {
3184        let cases = [
3185            ("sources/emails/x.md", "email"),
3186            ("sources/transcripts/x.md", "transcript"),
3187            ("sources/docs/x.md", "pdf-source"),
3188            ("sources/notes/x.md", "note"),
3189            ("records/contacts/x.md", "contact"),
3190            ("records/companies/x.md", "company"),
3191            ("records/expenses/x.md", "expense"),
3192            ("records/meetings/x.md", "meeting"),
3193            ("records/decisions/x.md", "decision"),
3194            ("records/invoices/x.md", "invoice"),
3195        ];
3196        for (path, expected) in cases {
3197            assert_eq!(
3198                infer_type_from_path(Path::new(path)).as_deref(),
3199                Some(expected),
3200                "path {path} should infer type {expected}"
3201            );
3202        }
3203    }
3204
3205    #[test]
3206    fn infer_type_round_trips_with_default_type_folder() {
3207        // The canonical invariant: inference is the inverse of the forward map.
3208        // Every recognized type, routed through `default_type_folder` and then
3209        // back through `infer_type_from_path`, must return the original type.
3210        let recognized = [
3211            "email",
3212            "transcript",
3213            "pdf-source",
3214            "contact",
3215            "company",
3216            "expense",
3217            "meeting",
3218            "decision",
3219            "invoice",
3220        ];
3221        for type_ in recognized {
3222            let folder = default_type_folder(type_);
3223            let file = folder.join("x.md");
3224            assert_eq!(
3225                infer_type_from_path(&file).as_deref(),
3226                Some(type_),
3227                "recognized type {type_} (folder {folder:?}) must round-trip"
3228            );
3229        }
3230    }
3231
3232    #[test]
3233    fn infer_type_round_trips_custom_types_verbatim_no_singularization() {
3234        // Regression guard for the CLI/core divergence: `default_type_folder`'s
3235        // unrecognized fallback is the BARE type name (`task → records/task`,
3236        // `tasks → records/tasks`). Inference must NOT singularize, or a custom
3237        // type would not round-trip (e.g. `records/tasks` → `task` would clash
3238        // with `default_type_folder("task") → records/task`).
3239        for custom in ["task", "tasks", "playbook", "process", "okrs", "ticket"] {
3240            let folder = default_type_folder(custom);
3241            assert_eq!(folder, PathBuf::from("records").join(custom));
3242            let file = folder.join("x.md");
3243            assert_eq!(
3244                infer_type_from_path(&file).as_deref(),
3245                Some(custom),
3246                "custom type {custom} must round-trip verbatim (no singularization)"
3247            );
3248        }
3249
3250        // The specific case named in the finding: a plural custom folder keeps
3251        // its trailing `s`; it is NOT singularized to `task`.
3252        assert_eq!(
3253            infer_type_from_path(Path::new("records/tasks/x.md")).as_deref(),
3254            Some("tasks"),
3255            "records/tasks must infer `tasks`, not `task`"
3256        );
3257    }
3258
3259    #[test]
3260    fn infer_type_requires_three_component_layer_folder_file_shape() {
3261        // Fewer than 3 components: a file directly under a layer has no
3262        // type-folder, so inference yields None (matches the old CLI contract).
3263        assert_eq!(infer_type_from_path(Path::new("records/x.md")), None);
3264        assert_eq!(infer_type_from_path(Path::new("sources/x.md")), None);
3265        assert_eq!(infer_type_from_path(Path::new("x.md")), None);
3266        // Unknown leading layer is never inferred.
3267        assert_eq!(infer_type_from_path(Path::new("foo/bar/x.md")), None);
3268        // Deeper paths still infer from the first type-folder segment (e.g. a
3269        // sharded record under records/expenses/2026/05/x.md).
3270        assert_eq!(
3271            infer_type_from_path(Path::new("records/expenses/2026/05/x.md")).as_deref(),
3272            Some("expense"),
3273        );
3274    }
3275
3276    // ── ensure_path_within_store (containment) ───────────────────────────────
3277
3278    #[test]
3279    fn ensure_path_within_store_accepts_in_store_and_rejects_escape() {
3280        let dir = tempdir().unwrap();
3281        let root = dir.path();
3282        fs::create_dir_all(root.join("records/contacts")).unwrap();
3283        fs::write(root.join("records/contacts/sarah.md"), "x").unwrap();
3284
3285        // An existing in-store file resolves and is accepted.
3286        let inside = root.join("records/contacts/sarah.md");
3287        let got = ensure_path_within_store(root, &inside).expect("in-store path accepted");
3288        // Canonical, but still under the (canonical) root.
3289        assert!(got.starts_with(root.canonicalize().unwrap()));
3290
3291        // A not-yet-existing in-store leaf is accepted (rename destination).
3292        let new_leaf = root.join("records/contacts/sarah-chen.md");
3293        assert!(
3294            ensure_path_within_store(root, &new_leaf).is_ok(),
3295            "a non-existent in-store leaf must be accepted"
3296        );
3297
3298        // A `..`-escaping path is rejected even though its prefix exists.
3299        let escape = root.join("records/contacts/../../outside/secret.md");
3300        assert!(
3301            ensure_path_within_store(root, &escape).is_err(),
3302            "a `..`-escaping path must be rejected"
3303        );
3304    }
3305
3306    #[test]
3307    fn ensure_path_within_store_rejects_symlink_escape() {
3308        let dir = tempdir().unwrap();
3309        let root = dir.path().join("store");
3310        fs::create_dir_all(&root).unwrap();
3311        let outside_dir = dir.path().join("outside");
3312        fs::create_dir_all(&outside_dir).unwrap();
3313        let secret = outside_dir.join("secret.md");
3314        fs::write(&secret, "TOPSECRET").unwrap();
3315
3316        // A symlink inside the store that points OUTSIDE it must be rejected:
3317        // resolving the symlink lands outside the canonical root.
3318        #[cfg(unix)]
3319        {
3320            use std::os::unix::fs::symlink;
3321            let link = root.join("escape.md");
3322            symlink(&secret, &link).unwrap();
3323            assert!(
3324                ensure_path_within_store(&root, &link).is_err(),
3325                "a symlink resolving outside the store must be rejected"
3326            );
3327        }
3328    }
3329
3330    /// The amortized gate accepts and rejects exactly what the single-shot
3331    /// gate does — same resolved paths, same failures — across every candidate
3332    /// class: existing file (fast path), second file in the same folder
3333    /// (memoized parent), missing leaf (slow-path peel), `..` tail, symlink
3334    /// leaf escaping the store, and a symlinked PARENT dir escaping the store.
3335    #[test]
3336    fn store_containment_matches_single_shot_gate() {
3337        let dir = tempdir().unwrap();
3338        let root = dir.path().join("store");
3339        fs::create_dir_all(root.join("records/contacts")).unwrap();
3340        fs::write(root.join("records/contacts/sarah.md"), "x").unwrap();
3341        fs::write(root.join("records/contacts/jules.md"), "y").unwrap();
3342        let outside_dir = dir.path().join("outside");
3343        fs::create_dir_all(&outside_dir).unwrap();
3344        fs::write(outside_dir.join("secret.md"), "TOPSECRET").unwrap();
3345
3346        let mut gate = StoreContainment::new(&root).expect("root canonicalizes");
3347        let same = |cand: &Path, label: &str, gate: &mut StoreContainment| {
3348            let single = ensure_path_within_store(&root, cand);
3349            let amortized = gate.resolve(cand);
3350            match (single, amortized) {
3351                (Ok(a), Ok(b)) => assert_eq!(a, b, "{label}: resolved paths differ"),
3352                (Err(_), Err(_)) => {}
3353                (s, a) => panic!("{label}: verdicts differ — single-shot {s:?} vs amortized {a:?}"),
3354            }
3355        };
3356
3357        same(
3358            &root.join("records/contacts/sarah.md"),
3359            "existing file",
3360            &mut gate,
3361        );
3362        same(
3363            &root.join("records/contacts/jules.md"),
3364            "memoized parent",
3365            &mut gate,
3366        );
3367        same(
3368            &root.join("records/contacts/new-leaf.md"),
3369            "missing leaf",
3370            &mut gate,
3371        );
3372        same(
3373            &root.join("records/contacts/../../outside/secret.md"),
3374            "`..` tail",
3375            &mut gate,
3376        );
3377
3378        #[cfg(unix)]
3379        {
3380            use std::os::unix::fs::symlink;
3381            // Symlink LEAF out of the store: slow path, rejected by both.
3382            let link = root.join("records/contacts/escape.md");
3383            symlink(outside_dir.join("secret.md"), &link).unwrap();
3384            same(&link, "symlink leaf escape", &mut gate);
3385            assert!(
3386                gate.resolve(&link).is_err(),
3387                "symlink leaf must be rejected"
3388            );
3389
3390            // Symlinked PARENT dir out of the store: the fast path's parent
3391            // canonicalize resolves it outside the root — rejected by both.
3392            let linked_dir = root.join("records/linked");
3393            symlink(&outside_dir, &linked_dir).unwrap();
3394            let through = linked_dir.join("secret.md");
3395            same(&through, "symlinked parent escape", &mut gate);
3396            assert!(
3397                gate.resolve(&through).is_err(),
3398                "a candidate under a symlinked-out parent must be rejected"
3399            );
3400        }
3401    }
3402
3403    // ── shared link-edge notion (fence / whitespace / case) ──────────────────
3404
3405    #[test]
3406    fn extract_edge_targets_trims_inner_whitespace() {
3407        // Padded `[[ x ]]` is the same edge as `[[x]]`.
3408        assert_eq!(
3409            extract_edge_targets("See [[ records/contacts/sarah ]] today."),
3410            vec!["records/contacts/sarah".to_string()]
3411        );
3412    }
3413
3414    #[test]
3415    fn extract_edge_targets_skips_fenced_code_blocks() {
3416        // A `[[...]]` inside a ``` fence is a doc example, NOT an edge — matching
3417        // validate's body extractor.
3418        let body = "\
3419Real [[records/contacts/sarah]] link.
3420
3421```markdown
3422[[records/contacts/ghost-example]] is how you link.
3423```
3424
3425After fence [[records/companies/acme]].
3426";
3427        let got = extract_edge_targets(body);
3428        assert_eq!(
3429            got,
3430            vec![
3431                "records/contacts/sarah".to_string(),
3432                "records/companies/acme".to_string(),
3433            ],
3434            "fenced example link must not be an edge"
3435        );
3436    }
3437
#[test]
fn extract_edge_targets_frontmatter_fence_does_not_swallow_body_links() {
    // Regression guard. `search_by_link`, `forwardlinks`, and
    // `dbmd graph backlinks` pass the ENTIRE file (frontmatter + body) to
    // `extract_edge_targets`. A run of backticks that merely appears inside a
    // frontmatter value must not be treated as opening a markdown fence, or
    // every real wiki-link in the body would be swallowed. Two things must
    // still hold: links in frontmatter count as edges, and a link that truly
    // sits inside a BODY fence does not.
    let document = concat!(
        "---\n",
        "type: note\n",
        "summary: \"a note\"\n",
        "ref: \"[[records/contacts/sarah]]\"\n",
        "snippet: \"```\"\n",
        "---\n",
        "\n",
        "Body mentions [[records/companies/acme]].\n",
        "\n",
        "```\n",
        "[[records/contacts/ghost-example]] inside a body fence.\n",
        "```\n",
        "\n",
        "After fence [[records/contacts/dave]].\n",
    );

    // Expected edges, in document order.
    let expected: Vec<String> = [
        "records/contacts/sarah", // frontmatter edge
        "records/companies/acme", // body edge AFTER the frontmatter ```
        "records/contacts/dave",  // body edge after a real body fence
    ]
    .iter()
    .map(|target| target.to_string())
    .collect();

    assert_eq!(
        extract_edge_targets(document),
        expected,
        "a code fence inside frontmatter must not suppress body wiki-links, \
         and a real body-fenced link must still be ignored"
    );
}
3473
#[test]
fn extract_edge_targets_handles_nested_indented_and_long_run_fences() {
    // Regression for the naive `starts_with("```")/("~~~")` toggle. Fences
    // nested in other fences, over-indented (>3 space) markers, and a long-run
    // fence wrapping a shorter one must all be handled so that links inside an
    // opaque block are never extracted (validate treats the whole block as
    // opaque). The (char, run-length) tracker keys on the OPENING fence and
    // only closes on the same char with a run at least as long as the opener.

    // (a) A four-backtick fence (run 4) wrapping a three-backtick example
    // (run 3). The inner ``` cannot close the outer fence, so both `[[...]]`
    // occurrences inside remain fenced.
    let nested = concat!(
        "Doc:\n",
        "\n",
        "````\n",
        "```\n",
        "[[records/contacts/bob]]\n",
        "```\n",
        "still fenced [[records/contacts/bob]]\n",
        "````\n",
        "\n",
        "Real [[records/companies/acme]].\n",
    );

    // (b) A `~~~` block that contains a ``` line — the usual way to document
    // a backtick fence. The inner backtick line must not flip the state.
    let tilde_wraps_backtick = concat!(
        "~~~\n",
        "```\n",
        "[[records/contacts/ghost]]\n",
        "```\n",
        "~~~\n",
        "\n",
        "After [[records/companies/acme]].\n",
    );

    // (c) A ``` indented by 4 spaces is NOT a fence, so the link on the
    // following line is live.
    let over_indented = "    ```\nLive [[records/contacts/sarah]].\n";

    // Each case: (input, the single edge expected, failure message).
    let cases = [
        (
            nested,
            "records/companies/acme",
            "a nested ``` inside a ````-run fence must not leak the fenced links",
        ),
        (
            tilde_wraps_backtick,
            "records/companies/acme",
            "a ``` line inside a ~~~ block must not invert the fence state",
        ),
        (
            over_indented,
            "records/contacts/sarah",
            "a >3-space-indented ``` is not a fence opener",
        ),
    ];
    for (input, sole_edge, why) in cases {
        assert_eq!(
            extract_edge_targets(input),
            vec![sole_edge.to_string()],
            "{why}"
        );
    }
}
3530
#[test]
fn canonical_link_target_strips_md_dotslash_and_trims() {
    // Each pair is (raw link text, expected canonical target): surrounding
    // whitespace plus a `.md` suffix, a leading `./`, and a leading `/`.
    let cases = [
        ("  records/x.md  ", "records/x"),
        ("./records/y", "records/y"),
        ("/records/z", "records/z"),
    ];
    for (raw, canonical) in cases {
        assert_eq!(canonical_link_target(raw), canonical);
    }
}
3537
#[test]
fn link_edge_key_folds_case_only_on_case_insensitive_fs() {
    // Two spellings of one target that differ only in letter case.
    let mixed_case = link_edge_key("records/contacts/Sarah-Chen");
    let lower_case = link_edge_key("records/contacts/sarah-chen");

    // Which outcome is correct depends on the filesystem the test runs on.
    if !fs_is_case_insensitive() {
        assert_ne!(
            mixed_case, lower_case,
            "case-sensitive FS must keep the key case-exact"
        );
        return;
    }
    assert_eq!(
        mixed_case, lower_case,
        "case-insensitive FS must fold the key"
    );
}
3548
#[test]
fn link_edge_key_unifies_nfc_and_nfd_normalization_forms() {
    // REGRESSION (Unicode encoding / silent graph break): on macOS/APFS the
    // filesystem folds NFC/NFD, so a file written in one normalization form
    // and a link written in the other name the SAME file even though their
    // raw bytes differ. The edge comparison key has to collapse both forms to
    // one key on every platform; otherwise the graph
    // (backlinks/forwardlinks/orphans) treats them as two targets and
    // silently drops the edge.
    let composed = "records/contacts/jos\u{00e9}"; // é = U+00E9 (NFC)
    let decomposed = "records/contacts/jose\u{0301}"; // e + U+0301 (NFD)

    // Sanity check on the fixture itself: the two spellings really are
    // byte-different, otherwise the assertion below would be vacuous.
    assert_ne!(
        composed, decomposed,
        "test inputs must be byte-distinct NFC vs NFD"
    );

    let composed_key = link_edge_key(composed);
    let decomposed_key = link_edge_key(decomposed);
    assert_eq!(
        composed_key, decomposed_key,
        "NFC and NFD spellings of the same name must produce one edge key"
    );
}
3568
3569    // ── walk follows symlinked content ───────────────────────────────────────
3570
#[cfg(unix)]
#[test]
fn walk_includes_symlinked_content_file_and_symlinked_folder() {
    use std::os::unix::fs::symlink;

    let dir = empty_store();
    let root = dir.path();

    // Control: an ordinary content file.
    let control_body = content_md("2026-05-01T00:00:00Z");
    write(root, "records/contacts/sarah.md", &control_body);

    // A `.md` content file that is a symlink sitting inside a real folder.
    let elena_target = root.join("external-elena.md");
    fs::write(&elena_target, content_md("2026-05-02T00:00:00Z")).unwrap();
    symlink(&elena_target, root.join("records/contacts/elena.md")).unwrap();

    // A type folder that is itself a symlink to a directory holding one file.
    let companies_target = root.join("external-companies");
    fs::create_dir_all(&companies_target).unwrap();
    let acme_file = companies_target.join("acme.md");
    fs::write(acme_file, content_md("2026-05-03T00:00:00Z")).unwrap();
    symlink(&companies_target, root.join("records/companies")).unwrap();

    let store = open(&dir);
    let got = rels(&store.walk().unwrap());
    let walked = |rel: &str| got.contains(&rel.to_string());

    assert!(
        walked("records/contacts/elena.md"),
        "a symlinked content file must be walked: {got:?}"
    );
    assert!(
        walked("records/companies/acme.md"),
        "a file inside a symlinked type folder must be walked: {got:?}"
    );
}
3608
3609    // ── find_links_to: padded / fenced / case ────────────────────────────────
3610
#[test]
fn find_links_to_matches_whitespace_padded_link() {
    // One file whose only link to the target is padded: `[[ target ]]`.
    let dir = empty_store();
    write(
        dir.path(),
        "records/profiles/a.md",
        "---\ntype: profile\nmeta-type: conclusion\nsummary: s\n---\nSee [[ records/contacts/sarah ]] today.\n",
    );

    let store = open(&dir);
    let target = Path::new("records/contacts/sarah");
    let linkers = store.find_links_to(target).unwrap();

    assert_eq!(
        rels(&linkers),
        vec!["records/profiles/a.md".to_string()],
        "a padded `[[ x ]]` link must be found as a backward edge, matching forwardlinks"
    );
}
3632
#[test]
fn find_links_to_ignores_fenced_example_link() {
    // One file whose only mention of the target sits inside a ```markdown
    // fence in the body.
    let dir = empty_store();
    write(
        dir.path(),
        "records/concepts/howto.md",
        "---\ntype: concept\nmeta-type: conclusion\nsummary: s\n---\n```markdown\n[[records/contacts/sarah]]\n```\n",
    );

    let store = open(&dir);
    let target = Path::new("records/contacts/sarah");
    let linkers = store.find_links_to(target).unwrap();

    assert!(
        linkers.is_empty(),
        "a `[[...]]` only inside a fenced code block is not a backward edge: {got:?}",
        got = linkers
    );
}
3651
#[cfg(unix)]
#[test]
fn find_links_to_matches_case_variant_on_case_insensitive_fs() {
    // On a case-sensitive filesystem a link that differs only in case names a
    // genuinely different target, so there is nothing to check there.
    if !fs_is_case_insensitive() {
        return;
    }

    // The link is written `Sarah-Chen`; the lookup below asks for `sarah-chen`.
    let dir = empty_store();
    write(
        dir.path(),
        "records/profiles/bio.md",
        "---\ntype: profile\nmeta-type: conclusion\nsummary: s\n---\nSee [[records/contacts/Sarah-Chen]].\n",
    );

    let store = open(&dir);
    let target = Path::new("records/contacts/sarah-chen");
    let linkers = store.find_links_to(target).unwrap();

    assert_eq!(
        rels(&linkers),
        vec!["records/profiles/bio.md".to_string()],
        "a case-variant link must be found on a case-insensitive filesystem"
    );
}
3679}