Skip to main content

dbmd_core/
store.rs

1//! `store` — walk, locate, and shard a db.md store.
2//!
3//! A db.md store is one directory marked by an uppercase `DB.md` at its root.
//! [`Store::open_strict`] (normal commands) and the lenient [`Store::open`]
//! (validation) are the only gates store-walking subcommands go through; a
//! missing `DB.md` is the [`NotAStore`] error (`NOT_A_STORE`) from either. The
//! toolkit never guesses a store root.
7//!
8//! Scale discipline lives here: [`Store::walk`] and the layer/type-folder
9//! walks are **SWEEP** primitives used only by `validate --all`,
10//! `index rebuild`, and `stats`. The interactive loop instead uses
11//! [`Store::find_links_to`] / [`Store::find_links_to_any`] (embedded ripgrep,
12//! presence-only) and the `index.jsonl` sidecar readers
13//! ([`Store::find_by_type`] / [`Store::find_by_where`] /
14//! [`Store::read_type_index`]) — never a whole-store parse. The batch
15//! [`Store::find_links_to_any`] is what keeps the working-set validate's
16//! incoming-linker discovery a single store scan rather than one scan per
17//! changed object.
18
19use std::collections::BTreeMap;
20use std::path::{Path, PathBuf};
21
22use chrono::{DateTime, Datelike, FixedOffset};
23use grep::regex::RegexMatcher;
24use grep::searcher::sinks::UTF8;
25use grep::searcher::Searcher;
26use ignore::WalkBuilder;
27
28use crate::index::IndexRecord;
29use crate::parser::{parse_db_md, Config, Frontmatter};
30
/// Basenames the content walks treat as catalogs rather than records.
///
/// `index.md` is the only entry. The content walks traverse the layer dirs
/// (`sources/` / `records/` / `wiki/`), and `index.md` is the only meta file
/// that appears inside them. The root `DB.md` / `log.md` (and the `log/`
/// archive) sit at the store root, outside every layer, so those walks never
/// reach them. A file inside a layer that merely happens to be named `DB.md`
/// or `log.md` (e.g. `records/docs/DB.md`) is real content: the SPEC does NOT
/// reserve those names at type-folder depth, which is why they are not listed.
const NON_CONTENT_BASENAMES: [&str; 1] = ["index.md"];
43
/// File name of the machine-readable sidecar index (`index.jsonl`) that the
/// structured reads in this module load instead of parsing content files.
const TYPE_INDEX_FILE: &str = "index.jsonl";
46
47/// Returned when a path is opened as a store but has no `DB.md` at its root.
48/// Surfaced as the structured code `NOT_A_STORE` with a non-zero exit.
49#[derive(Debug, thiserror::Error)]
50#[error("not a db.md store: {path} has no DB.md")]
51pub struct NotAStore {
52    /// The path that was inspected.
53    pub path: PathBuf,
54}
55
56/// Errors from store-level operations (walk, locate, shard, sidecar read).
57#[derive(Debug, thiserror::Error)]
58pub enum StoreError {
59    /// A sidecar `index.jsonl` could not be read or parsed.
60    #[error("failed to read type index {path}: {message}")]
61    BadTypeIndex {
62        /// The sidecar file.
63        path: PathBuf,
64        /// What went wrong.
65        message: String,
66    },
67
68    /// A required date field for sharding was absent or unparseable, and there
69    /// was no usable fallback.
70    #[error("cannot compute shard path for {file}: no usable date field")]
71    NoShardDate {
72        /// The file being placed.
73        file: PathBuf,
74    },
75
76    /// An embedded-ripgrep scan failed to start or run.
77    #[error("search failed under {root}: {message}")]
78    Search {
79        /// The root the scan ran under.
80        root: PathBuf,
81        /// What went wrong.
82        message: String,
83    },
84
85    /// An underlying I/O failure.
86    #[error(transparent)]
87    Io(#[from] std::io::Error),
88}
89
/// One of the three canonical top-level layers of a db.md store.
///
/// The derived `Ord`/`PartialOrd` follow declaration order
/// (`Sources` < `Records` < `Wiki`); sibling modules rely on that when they key
/// `BTreeMap`s on `Layer` (e.g. `stats::Stats::files_per_layer`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// `sources/` — raw evidence; immutable; date-sharded at scale.
    Sources,
    /// `records/` — atomic typed data; entity types flat, event types sharded.
    Records,
    /// `wiki/` — curator-synthesized narrative; flat.
    Wiki,
}

impl Layer {
    /// The on-disk folder name of this layer: `"sources"`, `"records"`, or
    /// `"wiki"`.
    pub fn dir_name(self) -> &'static str {
        match self {
            Self::Sources => "sources",
            Self::Records => "records",
            Self::Wiki => "wiki",
        }
    }

    /// Inverse of [`Layer::dir_name`]: the layer whose folder name is exactly
    /// `name` (case-sensitive), or `None` when no layer uses that name.
    pub fn from_dir_name(name: &str) -> Option<Self> {
        // Derive the mapping from `dir_name` so the two can never drift apart.
        Self::all()
            .iter()
            .copied()
            .find(|layer| layer.dir_name() == name)
    }

    /// Every layer, in canonical (declaration) order.
    pub fn all() -> [Layer; 3] {
        [Self::Sources, Self::Records, Self::Wiki]
    }
}
131
132/// An opened db.md store: its root path plus the parsed `DB.md` [`Config`].
133///
134/// Construct via [`Store::open`]; that is the only path in, and it validates
135/// the `DB.md` marker so downstream code can assume a real store.
136#[derive(Debug, Clone)]
137pub struct Store {
138    /// The store root (the directory containing `DB.md`).
139    pub root: PathBuf,
140    /// The parsed `DB.md` config (agent instructions, policies, schemas).
141    pub config: Config,
142}
143
144impl Store {
145    /// True if `path` is a db.md store root: an uppercase `DB.md` file exists
146    /// at `path`. On case-sensitive filesystems a lowercase `db.md` must NOT
147    /// count (the lowercase name refers to the project/spec, not the marker).
148    pub fn is_db_md_store(path: &Path) -> bool {
149        // Read the directory and match the *stored* filename byte-for-byte.
150        // `path.join("DB.md").exists()` would lie on a case-insensitive
151        // filesystem (macOS default), where a lowercase `db.md` answers a
152        // `DB.md` probe. `read_dir` returns the real on-disk name, so the
153        // exact-match check is correct on both case-sensitive (Linux) and
154        // case-insensitive filesystems.
155        let entries = match std::fs::read_dir(path) {
156            Ok(entries) => entries,
157            Err(_) => return false,
158        };
159        for entry in entries.flatten() {
160            if entry.file_name() == "DB.md" {
161                // A directory literally named `DB.md` is not the marker.
162                match entry.file_type() {
163                    Ok(ft) if ft.is_dir() => return false,
164                    Ok(_) => return true,
165                    Err(_) => return false,
166                }
167            }
168        }
169        false
170    }
171
172    /// Open `path` as a db.md store and require `DB.md` to be readable and
173    /// parseable. Normal commands should enter through this strict gate so a
174    /// damaged config cannot silently disable schema or policy rules.
175    pub fn open_strict(path: &Path) -> crate::Result<Store> {
176        if !Store::is_db_md_store(path) {
177            return Err(NotAStore {
178                path: path.to_path_buf(),
179            }
180            .into());
181        }
182        let db_md = path.join("DB.md");
183        let text = std::fs::read_to_string(&db_md)?;
184        let config = parse_db_md(&text, &db_md)?;
185        Ok(Store {
186            root: path.to_path_buf(),
187            config,
188        })
189    }
190
191    /// Open `path` as a db.md store: confirm the `DB.md` marker (else
192    /// [`NotAStore`]) and parse the `DB.md` config when possible. This is the
193    /// lenient validation-oriented open path: a damaged `DB.md` still marks the
194    /// directory as a store so `dbmd validate` can report the config error as an
195    /// issue. Normal CLI commands should use [`Store::open_strict`] instead.
196    pub fn open(path: &Path) -> Result<Store, NotAStore> {
197        if !Store::is_db_md_store(path) {
198            return Err(NotAStore {
199                path: path.to_path_buf(),
200            });
201        }
202        let db_md = path.join("DB.md");
203        // The marker exists; parse its config. A read or parse failure leaves
204        // the store openable with default config rather than masquerading as
205        // NOT_A_STORE — the marker is present, so this *is* a store; a damaged
206        // DB.md is `dbmd validate`'s job to report, not `open`'s.
207        let config = match std::fs::read_to_string(&db_md) {
208            Ok(text) => parse_db_md(&text, &db_md).unwrap_or_default(),
209            Err(_) => Config::default(),
210        };
211        Ok(Store {
212            root: path.to_path_buf(),
213            config,
214        })
215    }
216
217    /// **SWEEP.** Recursively iterate every `.md` content file across
218    /// `sources/`, `records/`, and `wiki/`, skipping hidden dirs and `log/`.
219    /// Used only by `validate --all`, `index rebuild`, and `stats` — never on
220    /// the interactive loop.
221    pub fn walk(&self) -> Result<Vec<PathBuf>, StoreError> {
222        // Only the three content layers — never root meta files (`DB.md`,
223        // `index.md`, `log.md`) and never `log/`, which live at root and are
224        // outside every layer dir.
225        let mut out = Vec::new();
226        for layer in Layer::all() {
227            out.extend(self.walk_layer(layer)?);
228        }
229        out.sort();
230        Ok(out)
231    }
232
233    /// **SWEEP.** Like [`Store::walk`] but scoped to a single layer.
234    pub fn walk_layer(&self, layer: Layer) -> Result<Vec<PathBuf>, StoreError> {
235        let layer_root = self.root.join(layer.dir_name());
236        if !layer_root.is_dir() {
237            return Ok(Vec::new());
238        }
239        self.walk_content_md(&layer_root)
240    }
241
242    /// Enumerate every `.md` file in a single type-folder, **recursing through
243    /// its date-shards** (`sources/emails/**/*.md`). The unit the index builder
244    /// and per-folder rebuild operate on. SWEEP-class (scoped to one folder).
245    pub fn walk_type_folder(&self, type_folder: &Path) -> Result<Vec<PathBuf>, StoreError> {
246        let abs = self.resolve_under_root(type_folder);
247        if !abs.is_dir() {
248            return Ok(Vec::new());
249        }
250        self.walk_content_md(&abs)
251    }
252
253    /// The ≤`n` most-recent files in a type-folder by frontmatter `updated`
254    /// (descending), ties broken by store-relative path (ascending) — a total
255    /// order, so write-through and rebuild never disagree on #500 vs #501.
256    ///
257    /// Reads `updated` across the folder's shards — a SWEEP cost absorbed into
258    /// `index rebuild`. The write-through path never calls this. The
259    /// cap-selection primitive for the 500-entry `index.md` browse view.
260    pub fn recent_in_type_folder(
261        &self,
262        type_folder: &Path,
263        n: usize,
264    ) -> Result<Vec<PathBuf>, StoreError> {
265        let files = self.walk_type_folder(type_folder)?;
266        // (updated, rel-path) for each file. Files missing/unparseable
267        // `updated` sort *after* dated ones (None last), then by path — so they
268        // are deterministically the lowest-priority candidates for the cap, not
269        // dropped silently. The total order (updated desc, path asc) is what
270        // keeps write-through and rebuild agreeing on #500 vs #501.
271        let mut keyed: Vec<(Option<DateTime<FixedOffset>>, PathBuf)> = files
272            .into_iter()
273            .map(|rel| {
274                let updated = self.read_updated(&self.abs_path(&rel));
275                (updated, rel)
276            })
277            .collect();
278        keyed.sort_by(|a, b| {
279            // `updated` descending: newest first. `None` is treated as the
280            // oldest possible, so dated files always win a cap slot over
281            // undated ones.
282            let by_updated = b.0.cmp(&a.0);
283            by_updated.then_with(|| a.1.cmp(&b.1))
284        });
285        keyed.truncate(n);
286        Ok(keyed.into_iter().map(|(_, rel)| rel).collect())
287    }
288
289    /// The shard/flat predicate: true if the type date-shards, false if it
290    /// stays flat. True for source types and event record types
291    /// (`expense`/`invoice`/`meeting` + custom `order`/`ticket`/`transaction`),
292    /// or when `DB.md ## Schemas` declares `shard: by-date`. False for
293    /// dedup-bounded entity types (`contact`/`company`/`decision`) and `wiki/`.
294    pub fn type_shards(&self, type_: &str) -> bool {
295        // A `DB.md ## Schemas` `### <type>` block with a `shard:` directive is
296        // authoritative — it is the v0.2 generic-model way to declare sharding,
297        // so it overrides the built-in default below (in either direction).
298        if let Some(shard) = self.config.schemas.get(type_).and_then(|s| s.shard) {
299            return shard;
300        }
301        // Built-in default for the example types. Sharding is a property of the
302        // *type*:
303        //  - source types carry a primary date field and shard;
304        //  - event record types track business volume and shard;
305        //  - dedup-bounded entity types and curation-bounded wiki stay flat.
306        // Any type can override this via a `shard:` directive (above).
307        matches!(
308            type_,
309            // source types
310            "email" | "transcript" | "pdf-source"
311            // event record types (canonical)
312            | "expense" | "invoice" | "meeting"
313            // event record types (recognized custom, per the plan)
314            | "order" | "ticket" | "transaction"
315        )
316    }
317
318    /// Compute the canonical write path for a new file. For a sharding type
319    /// (per [`Store::type_shards`]) insert `<YYYY>/<MM>/` from the type's
320    /// primary date field (`email.date`, `expense.date`, … fallback `created`)
321    /// under the type folder; flat types and `wiki/` get no shard segment.
322    /// Deterministic + stable: same input → same path, so a record never moves
323    /// once written.
324    pub fn shard_path_for(
325        &self,
326        type_: &str,
327        frontmatter: &Frontmatter,
328        name: &str,
329    ) -> Result<PathBuf, StoreError> {
330        self.shard_path_in(&default_type_folder(type_), type_, frontmatter, name)
331    }
332
333    /// Like [`Store::shard_path_for`], but compute the path under an explicit,
334    /// caller-resolved type-folder rather than the canonical default. This lets a
335    /// write surface honour an agent-supplied conforming sub-folder — e.g.
336    /// `wiki/projects/`, `wiki/people/`, `wiki/synthesis/` (the SPEC files a
337    /// `wiki-page` under `wiki/<topic>/`, i.e. ANY topic sub-folder, not only the
338    /// `wiki/topics` default) — while still applying date-sharding for sharding
339    /// types. The folder must be a conforming `<layer>/<type-folder>` (2
340    /// components, recognized layer); the caller is responsible for that (see the
341    /// CLI's `resolve_write_path`), so it is taken as given here.
342    ///
343    /// Sharding is still a property of the *type*: a sharding type gets the
344    /// `<YYYY>/<MM>` segment under `folder`; a flat type lands directly in it.
345    pub fn shard_path_in(
346        &self,
347        folder: &Path,
348        type_: &str,
349        frontmatter: &Frontmatter,
350        name: &str,
351    ) -> Result<PathBuf, StoreError> {
352        let folder = folder.to_path_buf();
353        let filename = ensure_md_extension(name);
354
355        if !self.type_shards(type_) {
356            // Flat type (entity records, wiki, decisions): no shard segment.
357            return Ok(folder.join(filename));
358        }
359
360        // Sharding type: derive <YYYY>/<MM> from the primary date field, with
361        // `created` as the universal fallback. Reading the public `Frontmatter`
362        // fields directly (typed `created`/`updated` + raw `extra`) avoids the
363        // not-yet-implemented `Frontmatter::get`/`parse` and keeps this pure.
364        let (year, month) = self
365            .primary_shard_segment(type_, frontmatter)
366            .ok_or_else(|| StoreError::NoShardDate {
367                file: folder.join(&filename),
368            })?;
369
370        Ok(folder.join(year).join(month).join(filename))
371    }
372
373    /// Find files with an incoming wiki-link to `target`, via **embedded
374    /// ripgrep** for `[[target]]` across all layers. Loop-fast; no whole-graph
375    /// build. Returns store-relative paths.
376    pub fn find_links_to(&self, target: &Path) -> Result<Vec<PathBuf>, StoreError> {
377        // A single target is just the degenerate batch case — one alternation
378        // arm, one store scan. Routing through `find_links_to_any` keeps the
379        // pattern construction and the scan loop in exactly one place. The
380        // batch API takes `&[PathBuf]`, so the one-element slice is owned (a
381        // single alloc on this single-target convenience path; the batch path
382        // validate.rs rides is untouched).
383        self.find_links_to_any(&[target.to_path_buf()])
384    }
385
386    /// Find every file with an incoming wiki-link to **any** of `targets`, in a
387    /// **single embedded-ripgrep pass** over the store (one `.md` walk, one
388    /// presence-only scan per file). This is the batch incoming-linker finder the
389    /// working-set [`crate::validate::validate_working_set`] sits on: it must find
390    /// the linkers for the *whole* changed set without paying a full store read
391    /// per changed object. Cost is therefore one store scan (O(store)), NOT
392    /// `targets.len() × store` — calling [`find_links_to`](Self::find_links_to)
393    /// in a loop would reread every `.md` once per target and is the exact
394    /// `O(changed × store)` blow-up this method exists to prevent. Returns
395    /// store-relative paths (deduped, sorted).
396    ///
397    /// Why content scan and not the sidecar `links` field: the sidecar projects
398    /// only the frontmatter `links:` array, so it misses edges written in the
399    /// body or in typed fields (`company: [[…]]`). Finding an incoming link to an
400    /// arbitrary path therefore requires reading file content — the same reason
401    /// the single-target finder uses ripgrep.
402    pub fn find_links_to_any(&self, targets: &[PathBuf]) -> Result<Vec<PathBuf>, StoreError> {
403        // The wiki-link doctrine: a link is the full store-relative path, no
404        // `.md` extension. A reference to a target therefore appears literally
405        // as `[[<target>]]`, optionally with a `|display` suffix and (warned
406        // but accepted) a trailing `.md`. Build ONE regex that matches all
407        // accepted spellings of an incoming link to ANY target, escaping each
408        // target so path separators / dots stay literal and the alternation
409        // arms keep their boundaries (a link to `sarah` never matches
410        // `sarah-chen`).
411        let mut arms: Vec<String> = Vec::new();
412        for target in targets {
413            let target_str = path_to_link_str(target);
414            if target_str.is_empty() {
415                continue;
416            }
417            // [[ <target> (.md)? ( | display )? ]]
418            arms.push(format!(
419                r"\[\[{}(\.md)?(\|[^\]]*)?\]\]",
420                regex::escape(&target_str)
421            ));
422        }
423        // No usable targets → no possible incoming links, and an empty pattern
424        // would compile to a match-everything regex. Short-circuit instead.
425        if arms.is_empty() {
426            return Ok(Vec::new());
427        }
428        let pattern = arms.join("|");
429
430        let matcher = RegexMatcher::new(&pattern).map_err(|e| StoreError::Search {
431            root: self.root.clone(),
432            message: format!("invalid backlink pattern: {e}"),
433        })?;
434
435        let mut hits = std::collections::BTreeSet::new();
436        // Scan every `.md` file in the store (skip hidden + `log/`), including
437        // `index.md` catalogs — an incoming reference is wherever the literal
438        // link text lives; the caller decides relevance. ONE walk for the whole
439        // target set; per file we stop at the first hit (presence is all we
440        // need), so a file that links to several targets is read once, not once
441        // per target.
442        for rel in self.walk_all_md()? {
443            let abs = self.abs_path(&rel);
444            let mut matched_here = false;
445            let mut searcher = Searcher::new();
446            let res = searcher.search_path(
447                &matcher,
448                &abs,
449                UTF8(|_lnum, _line| {
450                    matched_here = true;
451                    // Stop at the first hit: presence is all we need.
452                    Ok(false)
453                }),
454            );
455            if let Err(e) = res {
456                return Err(StoreError::Search {
457                    root: self.root.clone(),
458                    message: format!("search failed in {}: {e}", abs.display()),
459                });
460            }
461            if matched_here {
462                hits.insert(rel);
463            }
464        }
465        Ok(hits.into_iter().collect())
466    }
467
468    /// Candidate set for a `type` query: read the relevant type-folder
469    /// `index.jsonl` sidecar(s) and return their records. Complete and
470    /// cold-cache-proof — NOT a walk-and-parse or a frontmatter ripgrep scan,
471    /// and **never a store-wide read**. The common path is one sequential read
472    /// of the canonical type-folder sidecar (O(entities)); when that sidecar is
473    /// absent the read is bounded to the type's single layer subtree
474    /// (O(entities-in-layer)), so a `--type proposal` query before that folder
475    /// has been indexed still stays inside the interactive loop's O(entities)
476    /// contract instead of fanning out across every sidecar in the store.
477    pub fn find_by_type(&self, type_: &str) -> Result<Vec<IndexRecord>, StoreError> {
478        // Read the type's canonical-folder sidecar when it exists (the common,
479        // O(entities) path). Otherwise fall back to the sidecars of the *one
480        // layer* the type belongs to and filter by `type` — complete for records
481        // filed under a non-canonical folder name within that layer (e.g. a
482        // custom `proposal` filed in `records/proposals/` when the canonical
483        // guess is the bare `records/proposal/`), without the whole-store
484        // sidecar fan-out that would break the interactive loop's O(entities)
485        // contract. A type lives in exactly one layer, and `default_type_folder`
486        // always encodes it (recognized → its SPEC layer; unrecognized →
487        // `records/`), so the fallback walk is bounded to that layer's subtree —
488        // O(entities-in-layer), never O(store). Either way: sequential, complete
489        // sidecar reads, never a walk-and-parse of the tree.
490        let canonical_folder = default_type_folder(type_);
491        let canonical = self.root.join(&canonical_folder).join(TYPE_INDEX_FILE);
492        let records = if canonical.is_file() {
493            self.read_type_index(&canonical)?
494        } else {
495            self.read_all_type_indexes_in(layer_of_folder(&canonical_folder))?
496        };
497        Ok(records.into_iter().filter(|r| r.type_ == type_).collect())
498    }
499
500    /// Candidate set for a `key=value` frontmatter query, **store-wide**: read
501    /// every type-folder `index.jsonl` sidecar and filter their records. The
502    /// unscoped pre-write dedup primitive; prefer [`Store::find_by_where_in`]
503    /// with a layer scope to stay O(entities-in-layer) on the interactive loop.
504    pub fn find_by_where(&self, key: &str, value: &str) -> Result<Vec<IndexRecord>, StoreError> {
505        self.find_by_where_in(key, value, None)
506    }
507
508    /// Candidate set for a `key=value` frontmatter query, **scoped to one
509    /// layer** when `layer` is `Some`: the sidecar walk is confined to that
510    /// layer's subtree (`<root>/<layer>/`), so the I/O is O(entities-in-layer),
511    /// not O(store records). `None` keeps the store-wide read.
512    ///
513    /// This is what makes `--in <layer>` an I/O scope, not just a result
514    /// filter: a `--where`-only query (no `--type`) used to read every sidecar
515    /// in the store and narrow by layer in memory, breaking the O(entities)
516    /// contract the interactive loop depends on. With a layer in hand we walk
517    /// only that layer's sidecars.
518    pub fn find_by_where_in(
519        &self,
520        key: &str,
521        value: &str,
522        layer: Option<Layer>,
523    ) -> Result<Vec<IndexRecord>, StoreError> {
524        // A `key=value` query can target any frontmatter field across any type,
525        // so within the chosen subtree we still read every type-folder sidecar
526        // and filter. The layer (when given) bounds *which* subtree, turning a
527        // whole-store walk into a single-layer walk.
528        let records = self.read_all_type_indexes_in(layer)?;
529        Ok(records
530            .into_iter()
531            .filter(|r| record_matches_field(r, key, value))
532            .collect())
533    }
534
535    /// Every record across the type-folder `index.jsonl` sidecars, scoped to one
536    /// layer when `layer` is `Some` (the walk is confined to `<root>/<layer>/`)
537    /// else store-wide. Sequential, complete sidecar reads — never a
538    /// walk-and-parse of the content tree.
539    ///
540    /// This is the unfiltered sidecar-enumeration primitive the relationship
541    /// loop sits on: [`crate::graph::backlinks_filtered`] uses it to bound its
542    /// candidate set to the relevant layer (or the whole store) without opening
543    /// the content tree, then confirms each candidate's edge by parsing the file.
544    pub fn sidecar_records(&self, layer: Option<Layer>) -> Result<Vec<IndexRecord>, StoreError> {
545        self.read_all_type_indexes_in(layer)
546    }
547
548    /// Parse a type-folder's `index.jsonl` into [`IndexRecord`]s, applying
549    /// last-write-wins by `path` over any un-compacted lines. The sidecar-read
550    /// primitive every structured query sits on.
551    pub fn read_type_index(&self, index_jsonl: &Path) -> Result<Vec<IndexRecord>, StoreError> {
552        let text = std::fs::read_to_string(index_jsonl).map_err(|e| StoreError::BadTypeIndex {
553            path: index_jsonl.to_path_buf(),
554            message: e.to_string(),
555        })?;
556
557        // Last-write-wins by `path` over un-compacted lines: a later line for
558        // the same path supersedes an earlier one (the jsonl is append-mostly
559        // and only compacted on rebuild). Blank lines are skipped; a non-blank
560        // line that is not a valid IndexRecord is a hard parse error.
561        let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
562        for (i, line) in text.lines().enumerate() {
563            let trimmed = line.trim();
564            if trimmed.is_empty() {
565                continue;
566            }
567            let record: IndexRecord =
568                serde_json::from_str(trimmed).map_err(|e| StoreError::BadTypeIndex {
569                    path: index_jsonl.to_path_buf(),
570                    message: format!("line {}: {e}", i + 1),
571                })?;
572            by_path.insert(record.path.clone(), record);
573        }
574        // BTreeMap keyed by path → records emerge sorted by path ascending,
575        // a deterministic order independent of line order in the file.
576        Ok(by_path.into_values().collect())
577    }
578
579    /// Resolve a store-relative path to its absolute on-disk path under
580    /// [`root`](Store::root).
581    pub fn abs_path(&self, store_relative: &Path) -> PathBuf {
582        // `Path::join` returns `store_relative` unchanged if it is already
583        // absolute, so passing an absolute path through is a no-op.
584        self.root.join(store_relative)
585    }
586
587    /// Convert an absolute path under the store into its store-relative form.
588    pub fn rel_path(&self, abs: &Path) -> Option<PathBuf> {
589        abs.strip_prefix(&self.root).ok().map(|p| p.to_path_buf())
590    }
591
592    // ── Private helpers ─────────────────────────────────────────────────────
593
594    /// Resolve a caller-supplied folder path (store-relative or absolute) to an
595    /// absolute path under the store root.
596    fn resolve_under_root(&self, folder: &Path) -> PathBuf {
597        if folder.is_absolute() {
598            folder.to_path_buf()
599        } else {
600            self.root.join(folder)
601        }
602    }
603
604    /// Walk a subtree for content `.md` files (skip hidden dirs, skip `index.md`
605    /// / `DB.md` / `log.md`), returning store-relative paths. Used by the layer
606    /// and type-folder walks.
607    fn walk_content_md(&self, root: &Path) -> Result<Vec<PathBuf>, StoreError> {
608        let mut out = Vec::new();
609        for entry in self.md_walker(root).build() {
610            let entry = entry.map_err(|e| StoreError::Search {
611                root: root.to_path_buf(),
612                message: e.to_string(),
613            })?;
614            if !is_file_entry(&entry) {
615                continue;
616            }
617            let path = entry.path();
618            if !has_md_extension(path) {
619                continue;
620            }
621            if is_non_content_basename(path) {
622                continue;
623            }
624            if let Some(rel) = self.rel_path(path) {
625                out.push(rel);
626            }
627        }
628        out.sort();
629        Ok(out)
630    }
631
632    /// Walk the whole store for **every** `.md` file (including `index.md`),
633    /// skipping hidden dirs and the `log/` archive tree. Used by the backlink
634    /// scan, where the literal link text can live in any markdown file.
635    fn walk_all_md(&self) -> Result<Vec<PathBuf>, StoreError> {
636        let mut out = Vec::new();
637        for entry in self.md_walker(&self.root).build() {
638            let entry = entry.map_err(|e| StoreError::Search {
639                root: self.root.clone(),
640                message: e.to_string(),
641            })?;
642            if !is_file_entry(&entry) {
643                continue;
644            }
645            let path = entry.path();
646            if !has_md_extension(path) {
647                continue;
648            }
649            if self.is_in_log_dir(path) {
650                continue;
651            }
652            if let Some(rel) = self.rel_path(path) {
653                out.push(rel);
654            }
655        }
656        out.sort();
657        Ok(out)
658    }
659
660    /// Read and merge every type-folder `index.jsonl` sidecar under `layer`
661    /// when given, else the whole store (skip hidden + `log/`). Each sidecar is
662    /// read with last-write-wins by path; across sidecars, paths are disjoint by
663    /// construction (one sidecar per folder), so a plain concatenation preserves
664    /// completeness. A layer scope confines the walk to `<root>/<layer>/`, which
665    /// is what keeps `find_by_where_in` O(entities-in-layer).
666    fn read_all_type_indexes_in(
667        &self,
668        layer: Option<Layer>,
669    ) -> Result<Vec<IndexRecord>, StoreError> {
670        let mut out = Vec::new();
671        for sidecar in self.find_type_index_files_in(layer)? {
672            out.extend(self.read_type_index(&self.abs_path(&sidecar))?);
673        }
674        Ok(out)
675    }
676
677    /// Locate every `index.jsonl` sidecar under `layer` (when given) else the
678    /// whole store (skip hidden + `log/`), returning store-relative paths. The
679    /// walk root is `<root>/<layer>/` for a scoped read and `self.root` for the
680    /// store-wide read; a non-existent layer subtree yields no sidecars rather
681    /// than walking a missing path.
682    fn find_type_index_files_in(&self, layer: Option<Layer>) -> Result<Vec<PathBuf>, StoreError> {
683        let walk_root = match layer {
684            Some(l) => self.root.join(l.dir_name()),
685            None => self.root.clone(),
686        };
687        // A scoped walk over a layer folder that does not exist yet must be an
688        // empty result, mirroring `walk_layer`'s missing-dir guard — not a walk
689        // error from `ignore` over a nonexistent path.
690        if !walk_root.is_dir() {
691            return Ok(Vec::new());
692        }
693        let mut out = Vec::new();
694        let mut builder = WalkBuilder::new(&walk_root);
695        builder.standard_filters(false).hidden(true);
696        for entry in builder.build() {
697            let entry = entry.map_err(|e| StoreError::Search {
698                root: walk_root.clone(),
699                message: e.to_string(),
700            })?;
701            if !is_file_entry(&entry) {
702                continue;
703            }
704            let path = entry.path();
705            if path.file_name().and_then(|n| n.to_str()) != Some(TYPE_INDEX_FILE) {
706                continue;
707            }
708            if self.is_in_log_dir(path) {
709                continue;
710            }
711            if let Some(rel) = self.rel_path(path) {
712                out.push(rel);
713            }
714        }
715        out.sort();
716        Ok(out)
717    }
718
719    /// A `WalkBuilder` configured for db.md SWEEPs: gitignore/global-ignore are
720    /// OFF (a SWEEP must see every file even if the store is a git repo with a
721    /// `.gitignore`), but hidden files/dirs are skipped.
722    fn md_walker(&self, root: &Path) -> WalkBuilder {
723        let mut builder = WalkBuilder::new(root);
724        builder.standard_filters(false).hidden(true);
725        builder
726    }
727
728    /// True if an absolute path lives under the store's root-level `log/`
729    /// rotation-archive directory.
730    fn is_in_log_dir(&self, abs: &Path) -> bool {
731        match self.rel_path(abs) {
732            Some(rel) => rel.components().next().map(|c| c.as_os_str()) == Some("log".as_ref()),
733            None => false,
734        }
735    }
736
737    /// Read a file's frontmatter `updated` field as an RFC3339 timestamp,
738    /// returning `None` when absent/unparseable. A self-contained reader (does
739    /// not depend on the not-yet-implemented `parser::read_file`); parses the
740    /// leading `---`-fenced YAML block with the same engine the parser uses.
741    fn read_updated(&self, abs: &Path) -> Option<DateTime<FixedOffset>> {
742        let text = std::fs::read_to_string(abs).ok()?;
743        let yaml = frontmatter_block(&text)?;
744        let value: serde_norway::Value = serde_norway::from_str(yaml).ok()?;
745        let raw = value.get("updated")?;
746        value_to_datetime(raw)
747    }
748
749    /// The `<YYYY>/<MM>` shard segment for a sharding type, from its primary
750    /// date field with a `created` fallback. Reads the public `Frontmatter`
751    /// fields directly. `None` when no usable date is present.
752    fn primary_shard_segment(&self, type_: &str, fm: &Frontmatter) -> Option<(String, String)> {
753        // Try the type's primary date field first.
754        if let Some(field) = primary_date_field(type_) {
755            if let Some(v) = fm.extra.get(field) {
756                if let Some(seg) = value_to_year_month(v) {
757                    return Some(seg);
758                }
759            }
760        }
761        // Universal fallback: the typed `created` timestamp.
762        fm.created
763            .map(|dt| (format!("{:04}", dt.year()), format!("{:02}", dt.month())))
764    }
765}
766
767// ── Free helpers (no `self`) ────────────────────────────────────────────────
768
769/// True if a walk entry is a regular file (not a dir / symlink-to-dir).
770fn is_file_entry(entry: &ignore::DirEntry) -> bool {
771    entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
772}
773
/// Whether `path` carries exactly the lowercase `md` extension. The match is
/// case-sensitive (db.md files use lowercase `.md`), so `.MD` does not count.
fn has_md_extension(path: &Path) -> bool {
    matches!(path.extension(), Some(ext) if ext == "md")
}
779
780/// True if the basename is a non-content meta file (`DB.md`, `index.md`,
781/// `log.md`) that the content walks must skip.
782fn is_non_content_basename(path: &Path) -> bool {
783    match path.file_name().and_then(|n| n.to_str()) {
784        Some(name) => NON_CONTENT_BASENAMES.contains(&name),
785        None => false,
786    }
787}
788
/// Return `name` with a `.md` suffix: appended when missing, left as-is when
/// the name already ends in `.md` (case-sensitive).
fn ensure_md_extension(name: &str) -> String {
    let mut file_name = name.to_string();
    if !file_name.ends_with(".md") {
        file_name.push_str(".md");
    }
    file_name
}
797
/// Turn a store-relative path into a wiki-link target string: normal path
/// components joined with `/` (never `\`), with a trailing `.md` removed.
/// Non-normal components (such as a leading `./`) and components that are not
/// valid UTF-8 are left out.
fn path_to_link_str(target: &Path) -> String {
    let segments: Vec<&str> = target
        .components()
        .filter_map(|component| match component {
            std::path::Component::Normal(os) => os.to_str(),
            _ => None,
        })
        .collect();
    let mut link = segments.join("/");
    if link.ends_with(".md") {
        link.truncate(link.len() - ".md".len());
    }
    link
}
815
/// Map a type name to its canonical default folder, following the SPEC type
/// table (`email → sources/emails`, `expense → records/expenses`, …). A type
/// that is not in the table falls back to `records/<type>`, using the bare
/// type name with no pluralization guess.
fn default_type_folder(type_: &str) -> PathBuf {
    let canonical = match type_ {
        // sources
        "email" => Some("sources/emails"),
        "transcript" => Some("sources/transcripts"),
        "pdf-source" => Some("sources/docs"),
        // records — entities
        "contact" => Some("records/contacts"),
        "company" => Some("records/companies"),
        // records — events
        "expense" => Some("records/expenses"),
        "meeting" => Some("records/meetings"),
        "decision" => Some("records/decisions"),
        "invoice" => Some("records/invoices"),
        // wiki — the SPEC files a wiki-page under `wiki/<topic>/`, always a
        // sub-folder and never flat under `wiki/`. A 2-component `wiki/<file>`
        // path is non-conforming: the `type_folder_of` helpers in `index` and
        // `validate` require `<layer>/<type-folder>/<file>`, so a flat page
        // either breaks write-through or is dropped from the catalogs. With
        // only the bare type available here, `wiki/topics` is the deterministic
        // default bucket (it matches the dogfood store).
        "wiki-page" => Some("wiki/topics"),
        _ => None,
    };
    match canonical {
        Some(folder) => PathBuf::from(folder),
        // unrecognized: bare type name under records/
        None => PathBuf::from("records").join(type_),
    }
}
849
850/// The canonical [`Layer`] a `type_` belongs to, derived from its default
851/// type-folder (`email` → `Sources`, `contact` → `Records`, `wiki-page` →
852/// `Wiki`, unrecognized → `Records`). The write path uses this to decide whether
853/// an agent-supplied folder is in the *right* layer for the type before honouring
854/// its sub-folder choice.
855pub fn layer_for_type(type_: &str) -> Layer {
856    layer_of_folder(&default_type_folder(type_)).unwrap_or(Layer::Records)
857}
858
859/// The [`Layer`] a type-folder path lives in, read from its first component
860/// (`sources/` → `Sources`, `records/` → `Records`, `wiki/` → `Wiki`). Used to
861/// bound [`Store::find_by_type`]'s canonical-folder-absent fallback to a single
862/// layer subtree. Returns `None` for a path with no recognized layer prefix;
863/// every value [`default_type_folder`] produces has one, so in practice this is
864/// always `Some` on the call path — `None` degrades to a store-wide read.
865fn layer_of_folder(folder: &Path) -> Option<Layer> {
866    let first = folder.components().next()?.as_os_str().to_str()?;
867    Layer::from_dir_name(first)
868}
869
/// Infer a content file's canonical `type` from its store-relative path — the
/// inverse of [`default_type_folder`] and the single source of truth for
/// path→type inference (the CLI's `fm init` calls this, never re-derives it).
///
/// The path must have at least the canonical `<layer>/<type-folder>/<file>`
/// shape; deeper paths (date shards) are fine. A shorter path (a file directly
/// under a layer), an unknown leading layer, or a layer / type-folder component
/// that is not valid UTF-8 yields `None`.
///
/// Components are read **positionally**: a non-UTF-8 component is never
/// skipped, so it cannot shift a later component into the layer or type-folder
/// slot. The file component only has to exist; its name is not inspected.
///
/// Recognized `(layer, folder)` pairs map back to their canonical type. For an
/// unrecognized folder the fallback is the **bare folder name verbatim** (no
/// pluralization/singularization) so it round-trips with `default_type_folder`,
/// whose unrecognized fallback is the bare type name (`task` ⇄ `records/task`).
/// `wiki/<topic>` always infers `wiki-page`, since every wiki page is filed
/// under a topic folder.
pub fn infer_type_from_path(rel: &Path) -> Option<String> {
    let mut comps = rel.components();
    let layer = comps.next()?.as_os_str().to_str()?;
    if !matches!(layer, "sources" | "records" | "wiki") {
        return None;
    }
    let folder = comps.next()?.as_os_str().to_str()?;
    // The file itself must be a third component (a real type-folder, not the
    // file sitting directly under the layer).
    comps.next()?;

    let mapped = match (layer, folder) {
        ("sources", "emails") => "email",
        ("sources", "transcripts") => "transcript",
        ("sources", "docs") => "pdf-source",
        ("records", "contacts") => "contact",
        ("records", "companies") => "company",
        ("records", "expenses") => "expense",
        ("records", "meetings") => "meeting",
        ("records", "decisions") => "decision",
        ("records", "invoices") => "invoice",
        // Every wiki page is filed under `wiki/<topic>/`; the type is always
        // `wiki-page` regardless of the topic-folder name.
        ("wiki", _) => "wiki-page",
        // Unrecognized folder: the bare name, verbatim. This is the inverse of
        // `default_type_folder`'s unrecognized fallback (`other → records/other`)
        // and the round-trip would break if we pluralized/singularized here.
        (_, other) => other,
    };
    Some(mapped.to_string())
}
916
/// Name of the frontmatter field whose value drives the `<YYYY>/<MM>` shard
/// for a sharding type. `None` means the type has no canonical date field and
/// sharding relies on the `created` fallback only (this includes custom event
/// types).
fn primary_date_field(type_: &str) -> Option<&'static str> {
    match type_ {
        "email" | "expense" | "invoice" | "meeting" => Some("date"),
        "transcript" => Some("recorded_at"),
        "pdf-source" => Some("received_at"),
        _ => None,
    }
}
930
931/// Parse a YAML value into an RFC3339 [`DateTime`], accepting both an explicit
932/// string and a YAML-native scalar rendered to string.
933fn value_to_datetime(value: &serde_norway::Value) -> Option<DateTime<FixedOffset>> {
934    let s = yaml_scalar_string(value)?;
935    DateTime::parse_from_rfc3339(s.trim()).ok()
936}
937
938/// Extract `(YYYY, MM)` from a YAML date/timestamp value. Lenient: matches a
939/// leading `YYYY-MM` so a bare `2026-05-22` date and a full
940/// `2026-05-22T10:00:00-07:00` timestamp both work.
941fn value_to_year_month(value: &serde_norway::Value) -> Option<(String, String)> {
942    let s = yaml_scalar_string(value)?;
943    year_month_from_str(s.trim())
944}
945
/// Extract `(YYYY, MM)` from a string that starts with `YYYY-MM`.
///
/// The prefix must be four ASCII digits, a `-`, and two ASCII digits forming a
/// month in `01..=12`; whatever follows is ignored. Anything else is `None`.
/// Parsed by hand so the write path does not pay for a regex compile.
fn year_month_from_str(s: &str) -> Option<(String, String)> {
    match s.as_bytes() {
        [y0, y1, y2, y3, b'-', m0, m1, ..]
            if [y0, y1, y2, y3, m0, m1].iter().all(|b| b.is_ascii_digit()) =>
        {
            let month = (m0 - b'0') * 10 + (m1 - b'0');
            // The first seven bytes are ASCII, so these slices fall on char
            // boundaries.
            (1..=12)
                .contains(&month)
                .then(|| (s[..4].to_string(), s[5..7].to_string()))
        }
        _ => None,
    }
}
971
972/// Render a YAML scalar as a string: a real `String` verbatim, otherwise the
973/// value's compact YAML serialization (covers timestamps that the YAML engine
974/// may surface as a non-string scalar).
975fn yaml_scalar_string(value: &serde_norway::Value) -> Option<String> {
976    if let Some(s) = value.as_str() {
977        return Some(s.to_string());
978    }
979    match value {
980        serde_norway::Value::Null => None,
981        serde_norway::Value::Mapping(_) | serde_norway::Value::Sequence(_) => None,
982        other => serde_norway::to_string(other)
983            .ok()
984            .map(|s| s.trim().to_string()),
985    }
986}
987
988/// The YAML frontmatter block of a file: the text between a leading `---` fence
989/// and the next `---` fence, exclusive. `None` if the file does not open with a
990/// `---` fence on its first line.
991fn frontmatter_block(text: &str) -> Option<&str> {
992    // Tolerate a UTF-8 BOM and CRLF, but the fence must be the very first line.
993    let body = text.strip_prefix('\u{feff}').unwrap_or(text);
994    let mut rest = body;
995    // First line must be exactly `---` (allowing trailing CR).
996    let (first, after_first) = split_first_line(rest);
997    if first.trim_end_matches('\r') != "---" {
998        return None;
999    }
1000    rest = after_first;
1001    let block_start = rest;
1002    let mut scanned = 0usize;
1003    loop {
1004        let (line, after) = split_first_line(rest);
1005        if line.trim_end_matches('\r') == "---" {
1006            return Some(&block_start[..scanned]);
1007        }
1008        if after.is_empty() && line.is_empty() {
1009            // Reached end of input without a closing fence.
1010            return None;
1011        }
1012        scanned += line.len() + 1; // +1 for the consumed '\n'
1013        if after.is_empty() {
1014            return None;
1015        }
1016        rest = after;
1017    }
1018}
1019
/// Cut `s` at its first `\n`, returning the line before it (newline excluded)
/// and everything after it. Without any newline the whole input is the line
/// and the remainder is empty.
fn split_first_line(s: &str) -> (&str, &str) {
    s.split_once('\n').unwrap_or((s, ""))
}
1029
1030/// True if an [`IndexRecord`] has a field `key` equal to `value`, checking the
1031/// typed columns first and then the flattened `fields` map.
1032fn record_matches_field(record: &IndexRecord, key: &str, value: &str) -> bool {
1033    match key {
1034        "type" => record.type_ == value,
1035        "summary" => record.summary == value,
1036        "path" => record.path.to_string_lossy() == value,
1037        "created" => timestamp_matches(record.created, value),
1038        "updated" => timestamp_matches(record.updated, value),
1039        "tags" => record.tags.iter().any(|t| t == value),
1040        "links" => record.links.iter().any(|l| l == value),
1041        other => record
1042            .fields
1043            .get(other)
1044            .map(|v| json_value_matches(v, value))
1045            .unwrap_or(false),
1046    }
1047}
1048
1049/// Compare a record's `created`/`updated` instant against a query `value`.
1050///
1051/// db.md files write timestamps in several equivalent RFC3339 spellings — most
1052/// commonly the `Z` UTC designator (`2026-05-01T00:00:00Z`) but also an explicit
1053/// offset (`...+00:00`, `...-07:00`). A naive `record.created.to_rfc3339() ==
1054/// value` reformats only one side: chrono renders a UTC instant as `+00:00`, so
1055/// the `Z` form an agent reads straight out of the file would never match. We
1056/// instead parse `value` as RFC3339 and compare instants, where `Z` and `+00:00`
1057/// (and any same-instant offset) are equal. A `value` that is not valid RFC3339
1058/// can never equal a real timestamp, so it falls through to `false`.
1059fn timestamp_matches(stored: Option<DateTime<FixedOffset>>, value: &str) -> bool {
1060    match (stored, DateTime::parse_from_rfc3339(value)) {
1061        (Some(stored), Ok(queried)) => stored == queried,
1062        _ => false,
1063    }
1064}
1065
1066/// Compare a JSON field value against a query string. A string matches
1067/// verbatim; scalars match their textual form; an array matches if any element
1068/// matches (so a list-valued frontmatter field is membership-queried).
1069fn json_value_matches(v: &serde_json::Value, value: &str) -> bool {
1070    match v {
1071        serde_json::Value::String(s) => s == value,
1072        serde_json::Value::Bool(b) => b.to_string() == value,
1073        serde_json::Value::Number(n) => n.to_string() == value,
1074        serde_json::Value::Array(items) => items.iter().any(|i| json_value_matches(i, value)),
1075        // A present-but-null field never matches — consistent with the in-memory
1076        // post-filter (`query::json_value_matches`, which the first `where`
1077        // clause is NOT re-checked against, so the two must agree here or a
1078        // `--where field=` query would return different rows than `--type X
1079        // --where field=`).
1080        serde_json::Value::Null => false,
1081        serde_json::Value::Object(_) => false,
1082    }
1083}
1084
1085#[cfg(test)]
1086mod tests {
1087    use super::*;
1088    use std::fs;
1089    use tempfile::{tempdir, TempDir};
1090
1091    // ── Fixtures ────────────────────────────────────────────────────────────
1092
1093    /// Write `contents` to `<root>/<rel>`, creating parent dirs. Returns the
1094    /// store-relative path for convenient assertions.
1095    fn write(root: &Path, rel: &str, contents: &str) -> PathBuf {
1096        let abs = root.join(rel);
1097        fs::create_dir_all(abs.parent().unwrap()).unwrap();
1098        fs::write(&abs, contents).unwrap();
1099        PathBuf::from(rel)
1100    }
1101
1102    /// A minimal content file with the given `updated` timestamp in frontmatter.
1103    fn content_md(updated: &str) -> String {
1104        format!(
1105            "---\ntype: note\ncreated: {updated}\nupdated: {updated}\nsummary: a note\n---\n\nbody\n"
1106        )
1107    }
1108
1109    /// A bare directory with a `DB.md` marker (valid `db-md` frontmatter so the
1110    /// real parser is exercised).
1111    fn empty_store() -> TempDir {
1112        let dir = tempdir().unwrap();
1113        fs::write(
1114            dir.path().join("DB.md"),
1115            "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n",
1116        )
1117        .unwrap();
1118        dir
1119    }
1120
1121    /// Open a store rooted at a TempDir; panics if `open` rejects it.
1122    fn open(dir: &TempDir) -> Store {
1123        Store::open(dir.path()).expect("fixture should be a valid store")
1124    }
1125
1126    fn rels(paths: &[PathBuf]) -> Vec<String> {
1127        paths
1128            .iter()
1129            .map(|p| p.to_string_lossy().replace('\\', "/"))
1130            .collect()
1131    }
1132
1133    // ── Layer ───────────────────────────────────────────────────────────────
1134
1135    #[test]
1136    fn layer_dir_name_and_parse_are_inverse() {
1137        for layer in Layer::all() {
1138            assert_eq!(Layer::from_dir_name(layer.dir_name()), Some(layer));
1139        }
1140        assert_eq!(Layer::Sources.dir_name(), "sources");
1141        assert_eq!(Layer::Records.dir_name(), "records");
1142        assert_eq!(Layer::Wiki.dir_name(), "wiki");
1143        assert_eq!(Layer::from_dir_name("log"), None);
1144        assert_eq!(Layer::from_dir_name("Sources"), None); // case-sensitive
1145    }
1146
1147    #[test]
1148    fn layer_order_is_canonical() {
1149        // stats keys a BTreeMap on Layer; the sort order must be sources<records<wiki.
1150        let mut v = [Layer::Wiki, Layer::Sources, Layer::Records];
1151        v.sort();
1152        assert_eq!(v, [Layer::Sources, Layer::Records, Layer::Wiki]);
1153    }
1154
1155    // ── is_db_md_store / open ────────────────────────────────────────────────
1156
1157    #[test]
1158    fn is_store_true_only_with_uppercase_marker() {
1159        let dir = tempdir().unwrap();
1160        assert!(
1161            !Store::is_db_md_store(dir.path()),
1162            "no marker → not a store"
1163        );
1164
1165        fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").unwrap();
1166        assert!(Store::is_db_md_store(dir.path()), "uppercase DB.md → store");
1167    }
1168
1169    #[test]
1170    fn is_store_false_for_lowercase_db_md() {
1171        // The case-sensitivity contract: a lowercase db.md is the spec name, not
1172        // a marker — even on a case-insensitive filesystem where Path::exists
1173        // would lie. This test must pass on macOS (case-insensitive) too.
1174        let dir = tempdir().unwrap();
1175        fs::write(dir.path().join("db.md"), "---\ntype: db-md\n---\n").unwrap();
1176        assert!(
1177            !Store::is_db_md_store(dir.path()),
1178            "lowercase db.md must NOT be treated as a store marker"
1179        );
1180        assert!(Store::open(dir.path()).is_err());
1181    }
1182
1183    #[test]
1184    fn is_store_false_when_db_md_is_a_directory() {
1185        let dir = tempdir().unwrap();
1186        fs::create_dir(dir.path().join("DB.md")).unwrap();
1187        assert!(
1188            !Store::is_db_md_store(dir.path()),
1189            "a directory named DB.md is not the file marker"
1190        );
1191    }
1192
1193    #[test]
1194    fn open_rejects_non_store_with_path() {
1195        let dir = tempdir().unwrap();
1196        let err = Store::open(dir.path()).unwrap_err();
1197        assert_eq!(err.path, dir.path());
1198    }
1199
1200    #[test]
1201    fn open_succeeds_and_parses_config() {
1202        let dir = tempdir().unwrap();
1203        // A DB.md whose ## Policies declares a frozen page — proves open()
1204        // actually parsed the config rather than substituting a default.
1205        fs::write(
1206            dir.path().join("DB.md"),
1207            "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n\n\
1208             ## Policies\n\n### Frozen pages\n- records/decisions/q1.md\n",
1209        )
1210        .unwrap();
1211        let store = Store::open(dir.path()).unwrap();
1212        assert_eq!(store.root, dir.path());
1213        assert!(
1214            store
1215                .config
1216                .frozen_pages
1217                .iter()
1218                .any(|p| p == Path::new("records/decisions/q1.md")),
1219            "open() must surface DB.md ## Policies, got {:?}",
1220            store.config.frozen_pages
1221        );
1222    }
1223
1224    // ── walk / walk_layer / walk_type_folder ─────────────────────────────────
1225
1226    #[test]
1227    fn walk_collects_content_across_layers_skipping_meta_and_log() {
1228        let dir = empty_store();
1229        let root = dir.path();
1230        write(
1231            root,
1232            "sources/emails/2026/05/a.md",
1233            &content_md("2026-05-01T00:00:00Z"),
1234        );
1235        write(
1236            root,
1237            "records/contacts/sarah.md",
1238            &content_md("2026-05-02T00:00:00Z"),
1239        );
1240        write(
1241            root,
1242            "wiki/people/sarah.md",
1243            &content_md("2026-05-03T00:00:00Z"),
1244        );
1245        // Things walk() must SKIP:
1246        write(root, "sources/emails/index.md", "---\ntype: index\n---\n"); // catalog
1247        write(root, "index.md", "---\ntype: index\n---\n"); // root catalog
1248        write(root, "log.md", "---\ntype: log\n---\n"); // log
1249        write(root, "log/2026-04.md", "---\ntype: log\n---\n"); // rotated log archive
1250        write(
1251            root,
1252            "sources/.hidden/secret.md",
1253            &content_md("2026-05-09T00:00:00Z"),
1254        ); // hidden dir
1255        write(root, "records/contacts/notes.txt", "not markdown"); // non-md
1256
1257        let store = open(&dir);
1258        let got = rels(&store.walk().unwrap());
1259        assert_eq!(
1260            got,
1261            vec![
1262                "records/contacts/sarah.md".to_string(),
1263                "sources/emails/2026/05/a.md".to_string(),
1264                "wiki/people/sarah.md".to_string(),
1265            ]
1266        );
1267    }
1268
1269    #[test]
1270    fn walk_includes_content_named_log_md_or_db_md_inside_a_layer() {
1271        let dir = empty_store();
1272        let root = dir.path();
1273        // A content file that merely happens to be named log.md / DB.md INSIDE a
1274        // layer is real content — those names are reserved only at the store root.
1275        write(
1276            root,
1277            "records/configs/log.md",
1278            &content_md("2026-05-01T00:00:00Z"),
1279        );
1280        write(
1281            root,
1282            "sources/docs/DB.md",
1283            &content_md("2026-05-02T00:00:00Z"),
1284        );
1285        // The derived catalog twin is still skipped at any depth.
1286        write(root, "records/configs/index.md", "---\ntype: index\n---\n");
1287        let store = open(&dir);
1288        let got = rels(&store.walk().unwrap());
1289        assert!(
1290            got.contains(&"records/configs/log.md".to_string()),
1291            "layer-internal log.md is content: {got:?}"
1292        );
1293        assert!(
1294            got.contains(&"sources/docs/DB.md".to_string()),
1295            "layer-internal DB.md is content: {got:?}"
1296        );
1297        assert!(
1298            !got.iter().any(|p| p.ends_with("index.md")),
1299            "index.md is still skipped: {got:?}"
1300        );
1301    }
1302
1303    #[test]
1304    fn walk_layer_is_scoped() {
1305        let dir = empty_store();
1306        let root = dir.path();
1307        write(
1308            root,
1309            "sources/emails/2026/05/a.md",
1310            &content_md("2026-05-01T00:00:00Z"),
1311        );
1312        write(
1313            root,
1314            "records/contacts/sarah.md",
1315            &content_md("2026-05-02T00:00:00Z"),
1316        );
1317        let store = open(&dir);
1318
1319        assert_eq!(
1320            rels(&store.walk_layer(Layer::Sources).unwrap()),
1321            vec!["sources/emails/2026/05/a.md".to_string()]
1322        );
1323        assert_eq!(
1324            rels(&store.walk_layer(Layer::Records).unwrap()),
1325            vec!["records/contacts/sarah.md".to_string()]
1326        );
1327        // A layer with no directory is empty, not an error.
1328        assert!(store.walk_layer(Layer::Wiki).unwrap().is_empty());
1329    }
1330
1331    #[test]
1332    fn walk_type_folder_recurses_shards_and_accepts_abs_or_rel() {
1333        let dir = empty_store();
1334        let root = dir.path();
1335        write(
1336            root,
1337            "sources/emails/2026/05/a.md",
1338            &content_md("2026-05-01T00:00:00Z"),
1339        );
1340        write(
1341            root,
1342            "sources/emails/2026/06/b.md",
1343            &content_md("2026-06-01T00:00:00Z"),
1344        );
1345        write(root, "sources/emails/index.md", "---\ntype: index\n---\n"); // skipped
1346                                                                           // A different type folder must not leak in.
1347        write(
1348            root,
1349            "sources/docs/2026/05/c.md",
1350            &content_md("2026-05-04T00:00:00Z"),
1351        );
1352        let store = open(&dir);
1353
1354        let expected = vec![
1355            "sources/emails/2026/05/a.md".to_string(),
1356            "sources/emails/2026/06/b.md".to_string(),
1357        ];
1358        // Relative folder arg.
1359        assert_eq!(
1360            rels(&store.walk_type_folder(Path::new("sources/emails")).unwrap()),
1361            expected
1362        );
1363        // Absolute folder arg under the store resolves identically.
1364        assert_eq!(
1365            rels(
1366                &store
1367                    .walk_type_folder(&root.join("sources/emails"))
1368                    .unwrap()
1369            ),
1370            expected
1371        );
1372    }
1373
1374    // ── recent_in_type_folder ────────────────────────────────────────────────
1375
/// `recent_in_type_folder` must list the newest `updated` first, break ties
/// by ascending path, and return no more than the requested number of rows.
#[test]
fn recent_orders_by_updated_desc_then_path_and_caps() {
    let dir = empty_store();
    let root = dir.path();
    // Written in this order: the newest file, two files tied on the timestamp
    // (path asc must put `a` before `b`), then the oldest file.
    let fixtures = [
        ("records/meetings/2026/05/c.md", "2026-05-03T00:00:00Z"),
        ("records/meetings/2026/05/a.md", "2026-05-02T00:00:00Z"),
        ("records/meetings/2026/05/b.md", "2026-05-02T00:00:00Z"),
        ("records/meetings/2026/04/z.md", "2026-04-01T00:00:00Z"),
    ];
    for (rel, stamp) in fixtures {
        write(root, rel, &content_md(stamp));
    }
    let store = open(&dir);
    let folder = Path::new("records/meetings");

    // A cap larger than the folder returns every file, fully ordered.
    let everything = store.recent_in_type_folder(folder, 10).unwrap();
    assert_eq!(
        rels(&everything),
        vec![
            "records/meetings/2026/05/c.md".to_owned(), // newest
            "records/meetings/2026/05/a.md".to_owned(), // tie, path asc
            "records/meetings/2026/05/b.md".to_owned(),
            "records/meetings/2026/04/z.md".to_owned(), // oldest
        ]
    );

    // A smaller cap keeps only the n most-recent entries.
    let capped = store.recent_in_type_folder(folder, 2).unwrap();
    assert_eq!(
        rels(&capped),
        vec![
            "records/meetings/2026/05/c.md".to_owned(),
            "records/meetings/2026/05/a.md".to_owned(),
        ]
    );
}
1434
/// A file whose frontmatter has no `updated` must rank below a dated one.
#[test]
fn recent_sorts_undated_files_last() {
    let dir = empty_store();
    let root = dir.path();
    let dated_body = content_md("2026-05-01T00:00:00Z");
    write(root, "records/contacts/dated.md", &dated_body);
    // This frontmatter carries no `updated` key whatsoever.
    write(
        root,
        "records/contacts/undated.md",
        "---\ntype: contact\nsummary: x\n---\nbody\n",
    );
    let store = open(&dir);

    let listed = store
        .recent_in_type_folder(Path::new("records/contacts"), 10)
        .unwrap();
    let expected = vec![
        "records/contacts/dated.md".to_owned(),
        "records/contacts/undated.md".to_owned(),
    ];
    assert_eq!(
        rels(&listed),
        expected,
        "a file with a real `updated` must outrank one with none"
    );
}
1465
1466    // ── type_shards ──────────────────────────────────────────────────────────
1467
/// Pins which type names `type_shards` reports as sharded and which as flat
/// on a freshly opened, empty store.
#[test]
fn type_shards_classification() {
    // Types the test expects `type_shards` to report as sharded.
    const SHARDED: [&str; 9] = [
        "email",
        "transcript",
        "pdf-source",
        "expense",
        "invoice",
        "meeting",
        "order",
        "ticket",
        "transaction",
    ];
    // Types the test expects to stay flat.
    const FLAT: [&str; 8] = [
        "contact",
        "company",
        "decision",
        "wiki-page",
        "index",
        "log",
        "db-md",
        "proposal",
    ];

    let dir = empty_store();
    let store = open(&dir);
    for t in SHARDED {
        assert!(store.type_shards(t), "{t} should shard");
    }
    for t in FLAT {
        assert!(!store.type_shards(t), "{t} should stay flat");
    }
}
1498
/// A schema's `shard` setting must override the built-in default in both
/// directions, while a schema without one leaves the default untouched.
#[test]
fn type_shards_respects_schema_directive_both_directions() {
    use crate::parser::{Config, Schema};

    // Builds a schema whose only non-default setting is the shard override.
    let with_shard = |flag: bool| Schema {
        shard: Some(flag),
        ..Schema::default()
    };

    let dir = empty_store();
    let mut store = open(&dir);
    let mut config = Config::default();
    // CUSTOM type (absent from the built-in list) opting INTO date-sharding;
    // without the override `type_shards` would answer false for it.
    config
        .schemas
        .insert("shipment".to_string(), with_shard(true));
    // BUILT-IN event type opting OUT (flat): the override must beat the
    // built-in default.
    config
        .schemas
        .insert("expense".to_string(), with_shard(false));
    // A schema that sets no `shard:` directive keeps the built-in default.
    config
        .schemas
        .insert("meeting".to_string(), Schema::default());
    store.config = config;

    // (type, expected answer, failure message)
    let cases = [
        (
            "shipment",
            true,
            "custom type with `shard: by-date` must shard",
        ),
        (
            "expense",
            false,
            "built-in event type with `shard: flat` must go flat",
        ),
        (
            "meeting",
            true,
            "schema without a `shard:` directive keeps the built-in default",
        ),
        ("contact", false, "unconfigured entity type stays flat"),
    ];
    for (type_name, want, why) in cases {
        assert!(store.type_shards(type_name) == want, "{why}");
    }
}
1546
1547    // ── shard_path_for ───────────────────────────────────────────────────────
1548
1549    fn fm_with_extra(key: &str, value: &str) -> Frontmatter {
1550        let mut fm = Frontmatter::default();
1551        fm.extra.insert(
1552            key.to_string(),
1553            serde_norway::Value::String(value.to_string()),
1554        );
1555        fm
1556    }
1557
1558    fn fm_with_created(rfc3339: &str) -> Frontmatter {
1559        Frontmatter {
1560            created: Some(DateTime::parse_from_rfc3339(rfc3339).unwrap()),
1561            ..Default::default()
1562        }
1563    }
1564
/// Each sharding type derives its `<YYYY>/<MM>` segment from its own primary
/// date field, supplied here through the frontmatter's `extra` map.
#[test]
fn shard_path_uses_primary_date_field_per_type() {
    let dir = empty_store();
    let store = open(&dir);

    // (type, primary date field, field value, file name, expected path)
    let cases = [
        // expense.date → records/expenses/<YYYY>/<MM>/
        (
            "expense",
            "date",
            "2026-05-22",
            "lunch",
            "records/expenses/2026/05/lunch.md",
        ),
        // email.date → sources/emails/<YYYY>/<MM>/
        (
            "email",
            "date",
            "2026-11-02T09:00:00-07:00",
            "e1",
            "sources/emails/2026/11/e1.md",
        ),
        // transcript.recorded_at → sources/transcripts/<YYYY>/<MM>/
        (
            "transcript",
            "recorded_at",
            "2025-01-15T12:00:00Z",
            "t1",
            "sources/transcripts/2025/01/t1.md",
        ),
    ];
    for (type_name, field, value, name, want) in cases {
        let fm = fm_with_extra(field, value);
        let got = store.shard_path_for(type_name, &fm, name).unwrap();
        assert_eq!(got, PathBuf::from(want));
    }
}
1596
/// With no primary date field present, the shard segment comes from `created`.
#[test]
fn shard_path_falls_back_to_created() {
    let dir = empty_store();
    let store = open(&dir);
    // A meeting that lacks `date` but does carry a `created` timestamp.
    let fm = fm_with_created("2024-07-09T08:30:00-04:00");
    let got = store.shard_path_for("meeting", &fm, "sync").unwrap();
    let want = PathBuf::from("records/meetings/2024/07/sync.md");
    assert_eq!(got, want);
}
1611
/// When both a primary date field and `created` exist, the primary field
/// decides the shard.
#[test]
fn shard_path_primary_field_wins_over_created() {
    let dir = empty_store();
    let store = open(&dir);
    // The two dates are far apart so the chosen one is unmistakable.
    let mut fm = fm_with_created("2020-01-01T00:00:00Z");
    let primary = serde_norway::Value::String("2026-05-22".into());
    fm.extra.insert("date".into(), primary);

    let got = store.shard_path_for("expense", &fm, "x").unwrap();
    // Shard must follow the primary `date` (2026/05), never `created` (2020/01).
    assert_eq!(got, PathBuf::from("records/expenses/2026/05/x.md"));
}
1625
/// Flat types get no date segment in their path, even when a date is
/// available.
#[test]
fn shard_path_flat_types_have_no_shard_segment() {
    let dir = empty_store();
    let store = open(&dir);

    // Contacts stay flat even though this one carries a `created` date.
    let contact_fm = fm_with_created("2026-05-22T00:00:00Z");
    let contact = store
        .shard_path_for("contact", &contact_fm, "sarah-chen")
        .unwrap();
    assert_eq!(contact, PathBuf::from("records/contacts/sarah-chen.md"));

    // A wiki-page has no date shard either, yet it must still sit inside a
    // type-folder (`wiki/topics/<name>.md`), NEVER directly at
    // `wiki/<name>.md`: a 2-component path is invisible to the index/validate
    // type-folder model.
    let page_fm = Frontmatter::default();
    let page = store
        .shard_path_for("wiki-page", &page_fm, "renewal-theme")
        .unwrap();
    assert_eq!(page, PathBuf::from("wiki/topics/renewal-theme.md"));
}
1648
/// Regression guard for wiki-page paths produced by the toolkit itself.
///
/// `shard_path_for("wiki-page", …)` used to yield a 2-component
/// `wiki/<file>` path. `type_folder_of` (in both `index` and `validate`)
/// reads that as "no type-folder", so the page either crashed
/// `Index::on_write` (which tried to create `index.md` inside a file) or
/// was silently left out of every catalog by `Index::rebuild_all`. The
/// computed path must therefore have exactly three components:
/// `<layer>/<type-folder>/<file>`.
#[test]
fn shard_path_wiki_page_is_indexable_three_component_path() {
    let dir = empty_store();
    let store = open(&dir);
    let p = store
        .shard_path_for("wiki-page", &Frontmatter::default(), "renewal-theme")
        .unwrap();

    // Split into UTF-8 components. The shape `type_folder_of` requires is
    // `comps.len() >= 3` with `comps[0]` a known layer: layer first, a
    // non-empty type-folder second, the file third.
    let parts: Vec<&str> = p
        .components()
        .filter_map(|part| part.as_os_str().to_str())
        .collect();
    assert_eq!(
        parts.len(),
        3,
        "wiki-page path must be <layer>/<type-folder>/<file>, got {p:?}"
    );

    let (layer, type_folder, file) = (parts[0], parts[1], parts[2]);
    assert_eq!(layer, "wiki", "first component must be the wiki layer");
    assert!(
        !type_folder.is_empty() && type_folder != "renewal-theme.md",
        "second component must be a real type-folder, not the file: {p:?}"
    );
    assert!(
        file.ends_with(".md"),
        "third component must be the .md file: {p:?}"
    );
}
1683
/// A name given with or without `.md` resolves to the same `.md` path.
#[test]
fn shard_path_preserves_and_adds_md_extension() {
    let dir = empty_store();
    let store = open(&dir);
    let fm = Frontmatter::default();

    // Name already ends in `.md`: the extension must not be doubled.
    let suffixed = store.shard_path_for("contact", &fm, "sarah.md").unwrap();
    // Bare name: the extension must be appended.
    let bare = store.shard_path_for("contact", &fm, "sarah").unwrap();

    let want = PathBuf::from("records/contacts/sarah.md");
    assert_eq!(suffixed, want);
    assert_eq!(bare, want);
}
1697
/// A sharding type with no usable date must fail with `NoShardDate`, and the
/// error must name the un-sharded path of the offending file.
#[test]
fn shard_path_errors_when_sharding_type_has_no_date() {
    let dir = empty_store();
    let store = open(&dir);
    // `expense` shards, yet the frontmatter has neither `date` nor `created`.
    let err = store
        .shard_path_for("expense", &Frontmatter::default(), "mystery")
        .unwrap_err();

    if let StoreError::NoShardDate { file } = &err {
        assert_eq!(*file, PathBuf::from("records/expenses/mystery.md"));
    } else {
        panic!("expected NoShardDate, got {other:?}", other = err);
    }
}
1713
1714    // ── find_links_to ────────────────────────────────────────────────────────
1715
/// `find_links_to` must report every file that links to the full-path target
/// in any accepted spelling, and nothing that merely resembles it.
#[test]
fn find_links_to_matches_all_accepted_spellings() {
    let dir = empty_store();
    let root = dir.path();
    let target = "records/contacts/sarah-chen";

    let fixtures: [(&str, String); 7] = [
        // Plain link.
        (
            "wiki/people/sarah.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\nSee [[{target}]].\n"),
        ),
        // Link carrying display text.
        (
            "records/meetings/2026/05/m.md",
            format!("---\ntype: meeting\nsummary: s\n---\nWith [[{target}|Sarah]].\n"),
        ),
        // Link spelled with the `.md` extension (accepted; validate warns).
        (
            "wiki/themes/t.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}.md]]\n"),
        ),
        // A catalog/index file holding the link literally counts as well.
        (
            "records/contacts/index.md",
            format!("---\ntype: index\n---\n- [[{target}]] — Sarah\n"),
        ),
        // A file with no link to the target.
        (
            "wiki/people/elena.md",
            "---\ntype: wiki-page\nsummary: s\n---\nNo links here.\n".to_string(),
        ),
        // Short-form link: must NOT match the full-path target.
        (
            "wiki/people/bob.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[sarah-chen]]\n".to_string(),
        ),
        // Boundary correctness: a longer path that only starts with the
        // target (`sarah-chen-jr` vs `sarah-chen`) must NOT match.
        (
            "wiki/people/jr.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}-jr]]\n"),
        ),
    ];
    for (rel, content) in fixtures {
        write(root, rel, &content);
    }

    let store = open(&dir);
    let linkers = store.find_links_to(Path::new(target)).unwrap();
    assert_eq!(
        rels(&linkers),
        vec![
            "records/contacts/index.md".to_owned(),
            "records/meetings/2026/05/m.md".to_owned(),
            "wiki/people/sarah.md".to_owned(),
            "wiki/themes/t.md".to_owned(),
        ]
    );
}
1778
/// Two targets whose paths share a prefix must each report only their own
/// linker.
#[test]
fn find_links_to_distinguishes_sibling_paths() {
    let dir = empty_store();
    let root = dir.path();
    // `a.md` links to `sarah`, `b.md` to the prefix-sharing `sarah-chen`.
    let pages = [
        (
            "wiki/a.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah]]\n",
        ),
        (
            "wiki/b.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
    ];
    for (rel, content) in pages {
        write(root, rel, content);
    }
    let store = open(&dir);

    // (target, its only expected linker)
    let expectations = [
        ("records/contacts/sarah", "wiki/a.md"),
        ("records/contacts/sarah-chen", "wiki/b.md"),
    ];
    for (target, linker) in expectations {
        let found = store.find_links_to(Path::new(target)).unwrap();
        assert_eq!(rels(&found), vec![linker.to_string()]);
    }
}
1814
1815    // ── find_links_to_any (batch — the O(changed × store) fix) ─────────────────
1816
/// Pins the batch contract of `find_links_to_any`.
///
/// The working-set validate discovers incoming linkers by handing the WHOLE
/// changed set to `find_links_to_any` in a single pass. That is only correct
/// if the result is the union of linkers over every target while each target
/// keeps its own boundary (no alternation arm may bleed into a
/// prefix-sharing sibling). A regression back to a per-object loop would
/// still satisfy the union check; the boundary and union-equivalence
/// assertions are what protect the *correctness* of folding N scans into one
/// regex.
#[test]
fn find_links_to_any_returns_the_union_with_boundary_correctness() {
    let dir = empty_store();
    let root = dir.path();

    let fixtures = [
        // Each of the two targets has a linker of its own.
        (
            "wiki/links-sarah.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
        (
            "wiki/links-acme.md",
            "---\ntype: wiki-page\nsummary: s\n---\nDeal with [[records/companies/acme|Acme]].\n",
        ),
        // This file links to BOTH targets and must be listed exactly once:
        // hits on several targets collapse into a single result row.
        (
            "records/meetings/2026/05/m.md",
            "---\ntype: meeting\nsummary: s\n---\n[[records/contacts/sarah-chen]] re [[records/companies/acme]]\n",
        ),
        // Prefix-sharing sibling: a link to `sarah-chen-jr` must NOT count as
        // a link to `sarah-chen`, even with `sarah-chen` as an alternation arm.
        (
            "wiki/links-jr.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen-jr]]\n",
        ),
        // Links to neither requested target.
        (
            "wiki/unrelated.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[wiki/themes/spend]]\n",
        ),
    ];
    for (rel, content) in fixtures {
        write(root, rel, content);
    }

    let store = open(&dir);
    let targets = [
        PathBuf::from("records/contacts/sarah-chen"),
        PathBuf::from("records/companies/acme"),
    ];

    let got = rels(&store.find_links_to_any(&targets).unwrap());
    let expected = vec![
        "records/meetings/2026/05/m.md".to_owned(),
        "wiki/links-acme.md".to_owned(),
        "wiki/links-sarah.md".to_owned(),
    ];
    assert_eq!(
        got,
        expected,
        "batch finder must return the deduped union of linkers across all targets, excluding the prefix-sibling and the unrelated file"
    );

    // Equivalence: the batch answer must equal the union of the single-target
    // finder run once per target. The working-set path relies on exactly
    // this when it replaces one scan per object with one scan for the set.
    let per_target: std::collections::BTreeSet<PathBuf> = targets
        .iter()
        .flat_map(|t| store.find_links_to(t).unwrap())
        .collect();
    assert_eq!(
        rels(&per_target.into_iter().collect::<Vec<_>>()),
        got,
        "find_links_to_any must equal the union of per-target find_links_to"
    );
}
1899
/// With no targets there is nothing to find. Above all, the finder must NOT
/// build a match-everything empty regex, which would report every `.md` as a
/// linker. `validate` takes this empty-working-set fast path whenever
/// nothing changed.
#[test]
fn find_links_to_any_empty_targets_matches_nothing() {
    let dir = empty_store();
    let root = dir.path();
    // One real linker exists, so a match-everything pattern would be caught.
    write(
        root,
        "wiki/a.md",
        "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
    );
    let store = open(&dir);

    let for_no_targets = store.find_links_to_any(&[]).unwrap();
    assert!(
        for_no_targets.is_empty(),
        "no targets ⇒ no linkers (an empty pattern must not match every file)"
    );

    // Targets that are only empty/non-link paths are a no-op as well, never a
    // match-everything.
    let blank_targets = [PathBuf::from(""), PathBuf::from("./")];
    let for_blank_targets = store.find_links_to_any(&blank_targets).unwrap();
    assert!(
        for_blank_targets.is_empty(),
        "targets that render to empty link text contribute no alternation arm"
    );
}
1929
1930    // ── read_type_index ──────────────────────────────────────────────────────
1931
/// `read_type_index` must parse each JSONL line into a record, expose the
/// typed columns, and gather the remaining keys under `fields`.
#[test]
fn read_type_index_parses_records_and_flattens_fields() {
    let dir = empty_store();
    let root = dir.path();
    // Two records, one per line, each line newline-terminated.
    let jsonl = concat!(
        r#"{"path":"records/expenses/2026/05/a.md","type":"expense","summary":"lunch","tags":["meals"],"links":["records/companies/acme"],"created":"2026-05-01T00:00:00Z","updated":"2026-05-01T00:00:00Z","vendor":"acme","amount":42}"#,
        "\n",
        r#"{"path":"records/expenses/2026/05/b.md","type":"expense","summary":"taxi","created":null,"updated":null,"vendor":"yellow"}"#,
        "\n",
    );
    let p = write(root, "records/expenses/index.jsonl", jsonl);
    let store = open(&dir);
    let sidecar = store.abs_path(&p);
    let recs = store.read_type_index(&sidecar).unwrap();

    assert_eq!(recs.len(), 2);
    // Records come back sorted by path ascending, so `a.md` is first.
    let (first, second) = (&recs[0], &recs[1]);
    assert_eq!(first.path, PathBuf::from("records/expenses/2026/05/a.md"));
    assert_eq!(first.type_, "expense");
    assert_eq!(first.summary, "lunch");
    assert_eq!(first.tags, vec!["meals".to_string()]);
    assert_eq!(first.links, vec!["records/companies/acme".to_string()]);
    assert!(first.created.is_some());
    // Keys outside the typed columns are flattened into `fields`.
    assert_eq!(
        first.fields.get("vendor"),
        Some(&serde_json::json!("acme"))
    );
    assert_eq!(first.fields.get("amount"), Some(&serde_json::json!(42)));
    // Absent `tags`/`links` default to empty lists.
    assert!(second.tags.is_empty());
    assert!(second.links.is_empty());
}
1962
/// Duplicate paths collapse to the later line, and blank lines are skipped
/// instead of being reported as errors.
#[test]
fn read_type_index_last_write_wins_and_skips_blanks() {
    let dir = empty_store();
    let root = dir.path();
    // The same path appears twice with a blank line in between; the second
    // entry must supersede the first.
    let jsonl = concat!(
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"old","created":null,"updated":null}"#,
        "\n",
        "\n",
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"new","created":null,"updated":null}"#,
        "\n",
    );
    let p = write(root, "records/contacts/index.jsonl", jsonl);
    let store = open(&dir);
    let sidecar = store.abs_path(&p);
    let recs = store.read_type_index(&sidecar).unwrap();

    assert_eq!(recs.len(), 1, "duplicate path collapses to one record");
    assert_eq!(recs[0].summary, "new", "later line must win");
}
1980
/// A line that is not valid JSON must surface as `StoreError::BadTypeIndex`.
#[test]
fn read_type_index_errors_on_malformed_line() {
    let dir = empty_store();
    let root = dir.path();
    let p = write(root, "records/contacts/index.jsonl", "{not valid json}\n");
    let store = open(&dir);

    let sidecar = store.abs_path(&p);
    let outcome = store.read_type_index(&sidecar);
    assert!(matches!(
        outcome.unwrap_err(),
        StoreError::BadTypeIndex { .. }
    ));
}
1990
1991    // ── find_by_type / find_by_where ─────────────────────────────────────────
1992
/// Test helper: renders one newline-terminated `index.jsonl` line with null
/// `created`/`updated`.
///
/// `extra` is spliced verbatim just before the closing brace, so a non-empty
/// value must begin with a comma (for example `,"vendor":"acme"`). No JSON
/// escaping is applied to any argument.
fn jsonl_line(path: &str, type_: &str, summary: &str, extra: &str) -> String {
    [
        "{\"path\":\"",
        path,
        "\",\"type\":\"",
        type_,
        "\",\"summary\":\"",
        summary,
        "\",\"created\":null,\"updated\":null",
        extra,
        "}\n",
    ]
    .concat()
}
1998
/// `find_by_type` must answer from the type's canonical folder sidecar and
/// return only records of that type, sorted by path.
#[test]
fn find_by_type_reads_canonical_folder_sidecar() {
    let dir = empty_store();
    let root = dir.path();
    // `records/contacts` is the canonical folder for `contact`.
    let contacts = [
        jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
        jsonl_line("records/contacts/elena.md", "contact", "Elena", ""),
    ]
    .concat();
    write(root, "records/contacts/index.jsonl", &contacts);
    // Another type's sidecar, which must not leak into a contact query.
    let companies = jsonl_line("records/companies/acme.md", "company", "Acme", "");
    write(root, "records/companies/index.jsonl", &companies);

    let store = open(&dir);
    let recs = store.find_by_type("contact").unwrap();

    // Path-sorted: `elena.md` precedes `sarah.md`.
    let summaries: Vec<_> = recs.iter().map(|rec| rec.summary.clone()).collect();
    assert_eq!(summaries, vec!["Elena".to_string(), "Sarah".to_string()]);
    assert!(!recs.iter().any(|rec| rec.type_ != "contact"));
}
2022
/// When the canonical sidecar is missing, `find_by_type` falls back to the
/// type's own layer and must not return records from another layer.
#[test]
fn find_by_type_canonical_absent_falls_back_within_the_layer_only() {
    let dir = empty_store();
    let root = dir.path();

    // A custom `proposal` record filed in the records layer under the
    // natural plural folder `records/proposals/`. That is NOT the canonical
    // name: `default_type_folder("proposal")` is `records/proposal` (the bare
    // type, no pluralization guess). The canonical sidecar is therefore
    // absent and `find_by_type` falls back. The fallback is bounded to the
    // type's layer (records), so this record is still found and completeness
    // within the layer holds.
    let in_layer = jsonl_line("records/proposals/p1.md", "proposal", "Q3 proposal", "");
    write(root, "records/proposals/index.jsonl", &in_layer);

    // DECOY: the SAME type in a DIFFERENT layer (sources/). The old
    // whole-store fallback read every sidecar in the store and would have
    // leaked this record; the layer-bounded fallback must not. This also
    // pins the fallback as O(entities-in-layer), never O(store).
    let decoy = jsonl_line(
        "sources/proposals/leak.md",
        "proposal",
        "cross-layer decoy",
        "",
    );
    write(root, "sources/proposals/index.jsonl", &decoy);

    let store = open(&dir);
    let recs = store.find_by_type("proposal").unwrap();
    assert_eq!(
        recs.len(),
        1,
        "only the records-layer proposal, not the sources decoy"
    );
    let found = &recs[0];
    assert_eq!(found.summary, "Q3 proposal");
    assert_eq!(found.path, PathBuf::from("records/proposals/p1.md"));
}
2063
/// The missing-sidecar fallback for a Sources-layer type must leave the
/// Records layer out of scope.
#[test]
fn find_by_type_canonical_absent_does_not_read_other_layers() {
    let dir = empty_store();
    let root = dir.path();
    // `email` canonically lives in `sources/emails` (layer Sources). With no
    // sidecar there, `find_by_type("email")` falls back, but only inside the
    // Sources layer. The populated Records sidecar written here must never be
    // touched. The old `read_all_type_indexes_in(None)` fallback would have
    // read and filtered it (wasted O(store) I/O); now it lies outside the
    // walk root entirely.
    let contact = jsonl_line("records/contacts/sarah.md", "contact", "Sarah", "");
    write(root, "records/contacts/index.jsonl", &contact);

    let store = open(&dir);
    // There is no email anywhere, so the answer is empty.
    let emails = store.find_by_type("email").unwrap();
    assert!(emails.is_empty());
}
2084
/// `find_by_where` must match flat `fields` entries, the typed `type`
/// column, and membership in the typed `tags` list.
#[test]
fn find_by_where_matches_typed_columns_and_flat_fields() {
    let dir = empty_store();
    let root = dir.path();

    // Two expenses with distinct vendors; only the first carries tags.
    let expenses = [
        jsonl_line(
            "records/expenses/a.md",
            "expense",
            "lunch",
            r#","vendor":"acme","tags":["meals"]"#,
        ),
        jsonl_line(
            "records/expenses/b.md",
            "expense",
            "taxi",
            r#","vendor":"yellow""#,
        ),
    ]
    .concat();
    write(root, "records/expenses/index.jsonl", &expenses);
    // One contact, tagged `customer`.
    let contacts = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        r#","tags":["customer"]"#,
    );
    write(root, "records/contacts/index.jsonl", &contacts);

    let store = open(&dir);
    let query = |key: &str, value: &str| store.find_by_where(key, value).unwrap();

    // Flat field stored under `fields`.
    let acme = query("vendor", "acme");
    assert_eq!(acme.len(), 1);
    assert_eq!(acme[0].path, PathBuf::from("records/expenses/a.md"));

    // Typed column `type`: both expense records match.
    assert_eq!(query("type", "expense").len(), 2);

    // Typed list column `tags`: matching is by membership.
    let customers = query("tags", "customer");
    assert_eq!(customers.len(), 1);
    assert_eq!(
        customers[0].path,
        PathBuf::from("records/contacts/sarah.md")
    );

    // A value held by no record yields an empty result.
    assert!(query("vendor", "nobody").is_empty());
}
2135
/// Timestamp columns must match on the instant, so any RFC 3339 spelling of
/// the same moment is a hit and anything else is not.
#[test]
fn find_by_where_matches_timestamps_across_rfc3339_spellings() {
    let dir = empty_store();
    let root = dir.path();
    // db.md files most often use the `Z` UTC spelling, and an index.jsonl
    // serialized from such a file keeps it verbatim. `updated` uses a
    // non-UTC offset on purpose.
    let sidecar = concat!(
        r#"{"path":"records/meetings/kickoff.md","type":"meeting","summary":"kickoff","created":"2026-05-01T00:00:00Z","updated":"2026-05-02T09:30:00-07:00"}"#,
        "\n",
    );
    write(root, "records/meetings/index.jsonl", sidecar);
    let store = open(&dir);
    let query = |key: &str, value: &str| store.find_by_where(key, value).unwrap();

    // The exact `Z` form an agent reads out of the file must match.
    let by_z = query("created", "2026-05-01T00:00:00Z");
    assert_eq!(by_z.len(), 1);
    assert_eq!(by_z[0].path, PathBuf::from("records/meetings/kickoff.md"));

    // So must the explicit-offset spelling of the same instant.
    assert_eq!(query("created", "2026-05-01T00:00:00+00:00").len(), 1);

    // A stored non-UTC value matches its own offset spelling and also the
    // same instant written as `Z`: instants are compared, not strings.
    assert_eq!(query("updated", "2026-05-02T09:30:00-07:00").len(), 1);
    assert_eq!(query("updated", "2026-05-02T16:30:00Z").len(), 1);

    // One second later is a different instant: no match.
    assert!(query("created", "2026-05-01T00:00:01Z").is_empty());
    // A query value that is not RFC 3339 never matches a real timestamp.
    assert!(query("created", "2026-05-01").is_empty());
}
2195
#[test]
fn find_by_where_in_layer_reads_only_that_layers_sidecars() {
    // Contract under test: a layer-scoped `find_by_where_in` reads sidecars
    // from the named layer's subtree and nowhere else. The proof is
    // structural: a deliberately corrupt sidecar sits in a *different* layer,
    // so any read that touched it would fail. A scoped read that succeeds
    // (and omits that layer's record) therefore never did the other layer's
    // I/O.
    let dir = empty_store();
    let store_root = dir.path();

    // A well-formed records-layer sidecar carrying the queried field.
    let acme_line = jsonl_line(
        "records/companies/acme.md",
        "company",
        "Acme",
        ",\"domain\":\"acme.com\"",
    );
    write(store_root, "records/companies/index.jsonl", &acme_line);

    // The sources layer holds nothing but an unparseable sidecar.
    let corrupt_sidecar = "{ this is not valid json and would error if read }\n";
    write(store_root, "sources/emails/index.jsonl", corrupt_sidecar);

    let store = open(&dir);

    // Records scope: the corrupt sources sidecar lies outside the subtree, so
    // the read succeeds and yields exactly the records-layer match.
    let records_hits = store
        .find_by_where_in("domain", "acme.com", Some(Layer::Records))
        .expect("a records-scoped read must not touch the sources sidecar");
    let hit_paths: Vec<_> = records_hits.iter().map(|rec| rec.path.clone()).collect();
    assert_eq!(
        rels(&hit_paths),
        vec![String::from("records/companies/acme.md")]
    );

    // No scope: every layer is walked, the corrupt sidecar is reached, and it
    // surfaces as a parse error. This confirms the bad file really is in the
    // tree and that only the layer scope kept it from being read above.
    let unscoped = store.find_by_where("domain", "acme.com");
    assert!(
        matches!(unscoped, Err(StoreError::BadTypeIndex { .. })),
        "unscoped read walks every layer and hits the corrupt sidecar"
    );

    // Sources scope: the subtree now includes the corrupt sidecar, so the
    // read must fail. The scope is a genuine subtree bound, not a quiet
    // "ignore whatever fails to parse".
    let in_sources = store.find_by_where_in("domain", "acme.com", Some(Layer::Sources));
    assert!(matches!(in_sources, Err(StoreError::BadTypeIndex { .. })));
}
2253
#[test]
fn find_by_where_in_missing_layer_is_empty_not_an_error() {
    // Scoping a where-read to a layer whose folder has not been created must
    // give back an empty result (the same guard `walk_layer` applies to a
    // missing directory) instead of a walk error from `ignore` over a path
    // that does not exist.
    let dir = empty_store();
    let sarah_line = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        ",\"city\":\"denver\"",
    );
    write(dir.path(), "records/contacts/index.jsonl", &sarah_line);
    let store = open(&dir);

    // Nothing ever created `wiki/`, so this scope has no subtree to walk.
    let in_wiki = store
        .find_by_where_in("city", "denver", Some(Layer::Wiki))
        .expect("missing layer subtree is empty, not an error");
    assert!(in_wiki.is_empty());

    // The identical query against the populated layer still finds the record.
    let records_hits = store
        .find_by_where_in("city", "denver", Some(Layer::Records))
        .unwrap();
    assert_eq!(records_hits.len(), 1);
}
2285
2286    // ── abs_path / rel_path ──────────────────────────────────────────────────
2287
#[test]
fn abs_and_rel_path_roundtrip() {
    // `abs_path` and `rel_path` must be inverses for paths inside the store.
    let dir = empty_store();
    let store = open(&dir);

    let relative = Path::new("records/contacts/sarah.md");
    let absolute = store.abs_path(relative);

    // Relative -> absolute anchors at the store root; absolute -> relative
    // recovers the original store-relative path.
    assert_eq!(absolute, dir.path().join(relative));
    assert_eq!(store.rel_path(&absolute).as_deref(), Some(relative));

    // Feeding an already-absolute path to `abs_path` returns it untouched.
    assert_eq!(store.abs_path(&absolute), absolute);

    // A path that lives outside the store has no store-relative spelling.
    let outside = Path::new("/somewhere/else.md");
    assert_eq!(store.rel_path(outside), None);
}
2303
2304    // ── infer_type_from_path (inverse of default_type_folder) ────────────────
2305
#[test]
fn infer_type_maps_every_recognized_folder_back_to_its_type() {
    // Table of (store-relative path, type expected from its folder).
    let cases: &[(&str, &str)] = &[
        // sources layer
        ("sources/emails/x.md", "email"),
        ("sources/transcripts/x.md", "transcript"),
        ("sources/docs/x.md", "pdf-source"),
        // records layer
        ("records/contacts/x.md", "contact"),
        ("records/companies/x.md", "company"),
        ("records/expenses/x.md", "expense"),
        ("records/meetings/x.md", "meeting"),
        ("records/decisions/x.md", "decision"),
        ("records/invoices/x.md", "invoice"),
        // wiki layer: the topic folder's name is irrelevant, every sub-folder
        // infers `wiki-page`.
        ("wiki/topics/x.md", "wiki-page"),
        ("wiki/pricing/x.md", "wiki-page"),
    ];

    for &(path, expected) in cases {
        let inferred = infer_type_from_path(Path::new(path));
        assert_eq!(
            inferred.as_deref(),
            Some(expected),
            "path {path} should infer type {expected}"
        );
    }
}
2330
#[test]
fn infer_type_round_trips_with_default_type_folder() {
    // Invariant: `infer_type_from_path` undoes `default_type_folder`. Sending
    // any recognized type forward to its folder and then inferring from a
    // file inside that folder must give the type back unchanged. `wiki-page`
    // is the only many-to-one mapping (all topic folders infer `wiki-page`),
    // and its forward folder is still expected to round-trip.
    let recognized = [
        "email",
        "transcript",
        "pdf-source",
        "contact",
        "company",
        "expense",
        "meeting",
        "decision",
        "invoice",
        "wiki-page",
    ];

    for &type_ in recognized.iter() {
        let folder = default_type_folder(type_);
        let file = folder.join("x.md");
        let inferred = infer_type_from_path(&file);
        assert_eq!(
            inferred.as_deref(),
            Some(type_),
            "recognized type {type_} (folder {folder:?}) must round-trip"
        );
    }
}
2360
#[test]
fn infer_type_round_trips_custom_types_verbatim_no_singularization() {
    // Guards against the CLI/core divergence regression. For a type it does
    // not recognize, `default_type_folder` falls back to the BARE type name
    // (`task → records/task`, `tasks → records/tasks`). Inference therefore
    // must not singularize: turning `records/tasks` into `task` would collide
    // with `default_type_folder("task") → records/task` and break the
    // round-trip for custom types.
    let customs = ["task", "tasks", "playbook", "process", "okrs", "ticket"];
    for &custom in customs.iter() {
        // Forward map: the folder is `records/<type name, verbatim>`.
        let folder = default_type_folder(custom);
        assert_eq!(folder, Path::new("records").join(custom));

        // Inverse map: the folder name comes back unchanged as the type.
        let file = folder.join("x.md");
        let inferred = infer_type_from_path(&file);
        assert_eq!(
            inferred.as_deref(),
            Some(custom),
            "custom type {custom} must round-trip verbatim (no singularization)"
        );
    }

    // The exact case from the finding, spelled out: a plural custom folder
    // retains its trailing `s` and is never reduced to `task`.
    let plural = infer_type_from_path(Path::new("records/tasks/x.md"));
    assert_eq!(
        plural.as_deref(),
        Some("tasks"),
        "records/tasks must infer `tasks`, not `task`"
    );
}
2387
#[test]
fn infer_type_requires_three_component_layer_folder_file_shape() {
    // Shorthand so each case reads as a single path literal.
    let infer = |p: &str| infer_type_from_path(Path::new(p));

    // Under three components there is no type-folder segment: a file sitting
    // directly in a layer (or at the root) infers nothing, matching the old
    // CLI contract.
    assert_eq!(infer("records/x.md"), None);
    assert_eq!(infer("sources/x.md"), None);
    assert_eq!(infer("wiki/x.md"), None);
    assert_eq!(infer("x.md"), None);

    // A leading component that is not a known layer never infers a type.
    assert_eq!(infer("foo/bar/x.md"), None);

    // Extra depth is fine: the first type-folder segment decides, as for a
    // sharded record stored at records/expenses/2026/05/x.md.
    assert_eq!(
        infer("records/expenses/2026/05/x.md").as_deref(),
        Some("expense"),
    );
}
2405}