Skip to main content

dbmd_core/
store.rs

1//! `store` — walk, locate, and shard a db.md store.
2//!
3//! A db.md store is one directory marked by an uppercase `DB.md` at its root.
4//! [`Store::open`] is the single gate every store-walking subcommand goes
5//! through; a missing `DB.md` is the [`NotAStore`] error (`NOT_A_STORE`). The
6//! toolkit never guesses a store root.
7//!
8//! Scale discipline lives here: [`Store::walk`] and the layer/type-folder
9//! walks are **SWEEP** primitives used only by `validate --all`,
10//! `index rebuild`, and `stats`. The interactive loop instead uses
11//! [`Store::find_links_to`] / [`Store::find_links_to_any`] (embedded ripgrep,
12//! presence-only) and the `index.jsonl` sidecar readers
13//! ([`Store::find_by_type`] / [`Store::find_by_where`] /
14//! [`Store::read_type_index`]) — never a whole-store parse. The batch
15//! [`Store::find_links_to_any`] is what keeps the working-set validate's
16//! incoming-linker discovery a single store scan rather than one scan per
17//! changed object.
18
19use std::collections::BTreeMap;
20use std::path::{Path, PathBuf};
21
22use chrono::{DateTime, Datelike, FixedOffset};
23use grep::regex::RegexMatcher;
24use grep::searcher::sinks::Lossy;
25use grep::searcher::Searcher;
26use ignore::WalkBuilder;
27
28use crate::index::IndexRecord;
29use crate::parser::{parse_db_md, Config, Frontmatter};
30
/// Basenames that are never content files. At present the list holds exactly
/// one entry, `index.md` (a catalog, not a record), so a SWEEP over the content
/// layers never mistakes a catalog for a record.
///
/// Only `index.md` is excluded by basename, because the content walks traverse
/// the layer dirs (`sources/`/`records/`/`wiki/`) and `index.md` is the only
/// meta file that appears INSIDE them. The root `DB.md` / `log.md` (and the
/// `log/` archive) live at the store root, outside every layer, so they are
/// never reached by these walks — and a content file that merely happens to be
/// named `DB.md` or `log.md` inside a layer (e.g. `records/docs/DB.md`) is real
/// content the SPEC does NOT reserve at type-folder depth.
const NON_CONTENT_BASENAMES: [&str; 1] = ["index.md"];
43
/// Basename of the per-type-folder sidecar: the complete machine-twin
/// (`index.jsonl`) that backs every structured read.
const TYPE_INDEX_FILE: &str = "index.jsonl";
46
/// Returned when a path is opened as a store but has no `DB.md` at its root.
/// Surfaced as the structured code `NOT_A_STORE` with a non-zero exit.
///
/// [`Store::open`] returns this type directly; [`Store::open_strict`] converts
/// it into the crate-level error via `.into()`.
#[derive(Debug, thiserror::Error)]
#[error("not a db.md store: {path} has no DB.md")]
pub struct NotAStore {
    /// The path that was inspected.
    pub path: PathBuf,
}
55
/// Errors from store-level operations (walk, locate, shard, sidecar read).
#[derive(Debug, thiserror::Error)]
pub enum StoreError {
    /// A sidecar `index.jsonl` could not be read or parsed. `message` carries
    /// either the underlying read error or a `line N: …` parse error.
    #[error("failed to read type index {path}: {message}")]
    BadTypeIndex {
        /// The sidecar file.
        path: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// A required date field for sharding was absent or unparseable, and there
    /// was no usable fallback.
    #[error("cannot compute shard path for {file}: no usable date field")]
    NoShardDate {
        /// The file being placed.
        file: PathBuf,
    },

    /// An embedded-ripgrep scan failed to start or run. Also used for
    /// directory-walk failures and an invalid backlink pattern.
    #[error("search failed under {root}: {message}")]
    Search {
        /// The root the scan ran under.
        root: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// An underlying I/O failure. `#[from]` lets `?` convert a
    /// `std::io::Error` into this variant automatically.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
89
/// One of the three canonical top-level layers of a db.md store.
///
/// The derived `Ord`/`PartialOrd` follow declaration order
/// (`Sources` < `Records` < `Wiki`), which is what sibling modules rely on when
/// they key `BTreeMap`s on `Layer` (e.g. `stats::Stats::files_per_layer`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// `sources/` — raw evidence; immutable; date-sharded at scale.
    Sources,
    /// `records/` — atomic typed data; entity types flat, event types sharded.
    Records,
    /// `wiki/` — curator-synthesized narrative; flat.
    Wiki,
}

impl Layer {
    /// The directory name this layer occupies on disk: `"sources"`,
    /// `"records"`, or `"wiki"`.
    pub fn dir_name(self) -> &'static str {
        match self {
            Layer::Wiki => "wiki",
            Layer::Records => "records",
            Layer::Sources => "sources",
        }
    }

    /// Inverse of [`Layer::dir_name`]: the layer whose folder is exactly
    /// `name`, or `None` when `name` is not a layer folder. The comparison is
    /// case-sensitive.
    pub fn from_dir_name(name: &str) -> Option<Self> {
        Layer::all()
            .iter()
            .copied()
            .find(|layer| layer.dir_name() == name)
    }

    /// All layers in canonical (declaration) order.
    pub fn all() -> [Layer; 3] {
        [Layer::Sources, Layer::Records, Layer::Wiki]
    }
}
131
/// An opened db.md store: its root path plus the parsed `DB.md` [`Config`].
///
/// Construct via [`Store::open`] (lenient: a damaged `DB.md` falls back to the
/// default config) or [`Store::open_strict`] (a damaged `DB.md` is an error).
/// Both validate the `DB.md` marker first, so downstream code can assume a
/// real store.
#[derive(Debug, Clone)]
pub struct Store {
    /// The store root (the directory containing `DB.md`).
    pub root: PathBuf,
    /// The parsed `DB.md` config (agent instructions, policies, schemas).
    pub config: Config,
}
143
144impl Store {
145    /// True if `path` is a db.md store root: an uppercase `DB.md` file exists
146    /// at `path`. On case-sensitive filesystems a lowercase `db.md` must NOT
147    /// count (the lowercase name refers to the project/spec, not the marker).
148    pub fn is_db_md_store(path: &Path) -> bool {
149        // Read the directory and match the *stored* filename byte-for-byte.
150        // `path.join("DB.md").exists()` would lie on a case-insensitive
151        // filesystem (macOS default), where a lowercase `db.md` answers a
152        // `DB.md` probe. `read_dir` returns the real on-disk name, so the
153        // exact-match check is correct on both case-sensitive (Linux) and
154        // case-insensitive filesystems.
155        let entries = match std::fs::read_dir(path) {
156            Ok(entries) => entries,
157            Err(_) => return false,
158        };
159        for entry in entries.flatten() {
160            if entry.file_name() == "DB.md" {
161                // A directory literally named `DB.md` is not the marker.
162                match entry.file_type() {
163                    Ok(ft) if ft.is_dir() => return false,
164                    Ok(_) => return true,
165                    Err(_) => return false,
166                }
167            }
168        }
169        false
170    }
171
172    /// Open `path` as a db.md store and require `DB.md` to be readable and
173    /// parseable. Normal commands should enter through this strict gate so a
174    /// damaged config cannot silently disable schema or policy rules.
175    pub fn open_strict(path: &Path) -> crate::Result<Store> {
176        if !Store::is_db_md_store(path) {
177            return Err(NotAStore {
178                path: path.to_path_buf(),
179            }
180            .into());
181        }
182        let db_md = path.join("DB.md");
183        let text = std::fs::read_to_string(&db_md)?;
184        let config = parse_db_md(&text, &db_md)?;
185        Ok(Store {
186            root: path.to_path_buf(),
187            config,
188        })
189    }
190
191    /// Open `path` as a db.md store: confirm the `DB.md` marker (else
192    /// [`NotAStore`]) and parse the `DB.md` config when possible. This is the
193    /// lenient validation-oriented open path: a damaged `DB.md` still marks the
194    /// directory as a store so `dbmd validate` can report the config error as an
195    /// issue. Normal CLI commands should use [`Store::open_strict`] instead.
196    pub fn open(path: &Path) -> Result<Store, NotAStore> {
197        if !Store::is_db_md_store(path) {
198            return Err(NotAStore {
199                path: path.to_path_buf(),
200            });
201        }
202        let db_md = path.join("DB.md");
203        // The marker exists; parse its config. A read or parse failure leaves
204        // the store openable with default config rather than masquerading as
205        // NOT_A_STORE — the marker is present, so this *is* a store; a damaged
206        // DB.md is `dbmd validate`'s job to report, not `open`'s.
207        let config = match std::fs::read_to_string(&db_md) {
208            Ok(text) => parse_db_md(&text, &db_md).unwrap_or_default(),
209            Err(_) => Config::default(),
210        };
211        Ok(Store {
212            root: path.to_path_buf(),
213            config,
214        })
215    }
216
217    /// **SWEEP.** Recursively iterate every `.md` content file across
218    /// `sources/`, `records/`, and `wiki/`, skipping hidden dirs and `log/`.
219    /// Used only by `validate --all`, `index rebuild`, and `stats` — never on
220    /// the interactive loop.
221    pub fn walk(&self) -> Result<Vec<PathBuf>, StoreError> {
222        // Only the three content layers — never root meta files (`DB.md`,
223        // `index.md`, `log.md`) and never `log/`, which live at root and are
224        // outside every layer dir.
225        let mut out = Vec::new();
226        for layer in Layer::all() {
227            out.extend(self.walk_layer(layer)?);
228        }
229        out.sort();
230        Ok(out)
231    }
232
233    /// **SWEEP.** Like [`Store::walk`] but scoped to a single layer.
234    pub fn walk_layer(&self, layer: Layer) -> Result<Vec<PathBuf>, StoreError> {
235        let layer_root = self.root.join(layer.dir_name());
236        if !layer_root.is_dir() {
237            return Ok(Vec::new());
238        }
239        self.walk_content_md(&layer_root)
240    }
241
242    /// Enumerate every `.md` file in a single type-folder, **recursing through
243    /// its date-shards** (`sources/emails/**/*.md`). The unit the index builder
244    /// and per-folder rebuild operate on. SWEEP-class (scoped to one folder).
245    pub fn walk_type_folder(&self, type_folder: &Path) -> Result<Vec<PathBuf>, StoreError> {
246        let abs = self.resolve_under_root(type_folder);
247        if !abs.is_dir() {
248            return Ok(Vec::new());
249        }
250        self.walk_content_md(&abs)
251    }
252
253    /// The ≤`n` most-recent files in a type-folder by frontmatter `updated`
254    /// (descending), ties broken by store-relative path (ascending) — a total
255    /// order, so write-through and rebuild never disagree on #500 vs #501.
256    ///
257    /// Reads `updated` across the folder's shards — a SWEEP cost absorbed into
258    /// `index rebuild`. The write-through path never calls this. The
259    /// cap-selection primitive for the 500-entry `index.md` browse view.
260    pub fn recent_in_type_folder(
261        &self,
262        type_folder: &Path,
263        n: usize,
264    ) -> Result<Vec<PathBuf>, StoreError> {
265        let files = self.walk_type_folder(type_folder)?;
266        // (updated, rel-path) for each file. Files missing/unparseable
267        // `updated` sort *after* dated ones (None last), then by path — so they
268        // are deterministically the lowest-priority candidates for the cap, not
269        // dropped silently. The total order (updated desc, path asc) is what
270        // keeps write-through and rebuild agreeing on #500 vs #501.
271        let mut keyed: Vec<(Option<DateTime<FixedOffset>>, PathBuf)> = files
272            .into_iter()
273            .map(|rel| {
274                let updated = self.read_updated(&self.abs_path(&rel));
275                (updated, rel)
276            })
277            .collect();
278        keyed.sort_by(|a, b| {
279            // `updated` descending: newest first. `None` is treated as the
280            // oldest possible, so dated files always win a cap slot over
281            // undated ones.
282            let by_updated = b.0.cmp(&a.0);
283            by_updated.then_with(|| a.1.cmp(&b.1))
284        });
285        keyed.truncate(n);
286        Ok(keyed.into_iter().map(|(_, rel)| rel).collect())
287    }
288
289    /// The shard/flat predicate: true if the type date-shards, false if it
290    /// stays flat. True for source types and event record types
291    /// (`expense`/`invoice`/`meeting` + custom `order`/`ticket`/`transaction`),
292    /// or when `DB.md ## Schemas` declares `shard: by-date`. False for
293    /// dedup-bounded entity types (`contact`/`company`/`decision`) and `wiki/`.
294    pub fn type_shards(&self, type_: &str) -> bool {
295        // A `DB.md ## Schemas` `### <type>` block with a `shard:` directive is
296        // authoritative — it is the v0.2 generic-model way to declare sharding,
297        // so it overrides the built-in default below (in either direction).
298        if let Some(shard) = self.config.schemas.get(type_).and_then(|s| s.shard) {
299            return shard;
300        }
301        // Built-in default for the example types. Sharding is a property of the
302        // *type*:
303        //  - source types carry a primary date field and shard;
304        //  - event record types track business volume and shard;
305        //  - dedup-bounded entity types and curation-bounded wiki stay flat.
306        // Any type can override this via a `shard:` directive (above).
307        matches!(
308            type_,
309            // source types
310            "email" | "transcript" | "pdf-source"
311            // event record types (canonical)
312            | "expense" | "invoice" | "meeting"
313            // event record types (recognized custom, per the plan)
314            | "order" | "ticket" | "transaction"
315        )
316    }
317
318    /// Compute the canonical write path for a new file. For a sharding type
319    /// (per [`Store::type_shards`]) insert `<YYYY>/<MM>/` from the type's
320    /// primary date field (`email.date`, `expense.date`, … fallback `created`)
321    /// under the type folder; flat types and `wiki/` get no shard segment.
322    /// Deterministic + stable: same input → same path, so a record never moves
323    /// once written.
324    pub fn shard_path_for(
325        &self,
326        type_: &str,
327        frontmatter: &Frontmatter,
328        name: &str,
329    ) -> Result<PathBuf, StoreError> {
330        self.shard_path_in(&default_type_folder(type_), type_, frontmatter, name)
331    }
332
333    /// Like [`Store::shard_path_for`], but compute the path under an explicit,
334    /// caller-resolved type-folder rather than the canonical default. This lets a
335    /// write surface honour an agent-supplied conforming sub-folder — e.g.
336    /// `wiki/projects/`, `wiki/people/`, `wiki/synthesis/` (the SPEC files a
337    /// `wiki-page` under `wiki/<topic>/`, i.e. ANY topic sub-folder, not only the
338    /// `wiki/topics` default) — while still applying date-sharding for sharding
339    /// types. The folder must be a conforming `<layer>/<type-folder>` (2
340    /// components, recognized layer); the caller is responsible for that (see the
341    /// CLI's `resolve_write_path`), so it is taken as given here.
342    ///
343    /// Sharding is still a property of the *type*: a sharding type gets the
344    /// `<YYYY>/<MM>` segment under `folder`; a flat type lands directly in it.
345    pub fn shard_path_in(
346        &self,
347        folder: &Path,
348        type_: &str,
349        frontmatter: &Frontmatter,
350        name: &str,
351    ) -> Result<PathBuf, StoreError> {
352        let folder = folder.to_path_buf();
353        let filename = ensure_md_extension(name);
354
355        if !self.type_shards(type_) {
356            // Flat type (entity records, wiki, decisions): no shard segment.
357            return Ok(folder.join(filename));
358        }
359
360        // Sharding type: derive <YYYY>/<MM> from the primary date field, with
361        // `created` as the universal fallback. Reading the public `Frontmatter`
362        // fields directly (typed `created`/`updated` + raw `extra`) avoids the
363        // not-yet-implemented `Frontmatter::get`/`parse` and keeps this pure.
364        let (year, month) = self
365            .primary_shard_segment(type_, frontmatter)
366            .ok_or_else(|| StoreError::NoShardDate {
367                file: folder.join(&filename),
368            })?;
369
370        Ok(folder.join(year).join(month).join(filename))
371    }
372
373    /// Find files with an incoming wiki-link to `target`, via **embedded
374    /// ripgrep** for `[[target]]` across all layers. Loop-fast; no whole-graph
375    /// build. Returns store-relative paths.
376    pub fn find_links_to(&self, target: &Path) -> Result<Vec<PathBuf>, StoreError> {
377        // A single target is just the degenerate batch case — one alternation
378        // arm, one store scan. Routing through `find_links_to_any` keeps the
379        // pattern construction and the scan loop in exactly one place. The
380        // batch API takes `&[PathBuf]`, so the one-element slice is owned (a
381        // single alloc on this single-target convenience path; the batch path
382        // validate.rs rides is untouched).
383        self.find_links_to_any(&[target.to_path_buf()])
384    }
385
386    /// Find every file with an incoming wiki-link to **any** of `targets`, in a
387    /// **single embedded-ripgrep pass** over the store (one `.md` walk, one
388    /// presence-only scan per file). This is the batch incoming-linker finder the
389    /// working-set [`crate::validate::validate_working_set`] sits on: it must find
390    /// the linkers for the *whole* changed set without paying a full store read
391    /// per changed object. Cost is therefore one store scan (O(store)), NOT
392    /// `targets.len() × store` — calling [`find_links_to`](Self::find_links_to)
393    /// in a loop would reread every `.md` once per target and is the exact
394    /// `O(changed × store)` blow-up this method exists to prevent. Returns
395    /// store-relative paths (deduped, sorted).
396    ///
397    /// Why content scan and not the sidecar `links` field: the sidecar projects
398    /// only the frontmatter `links:` array, so it misses edges written in the
399    /// body or in typed fields (`company: [[…]]`). Finding an incoming link to an
400    /// arbitrary path therefore requires reading file content — the same reason
401    /// the single-target finder uses ripgrep.
402    pub fn find_links_to_any(&self, targets: &[PathBuf]) -> Result<Vec<PathBuf>, StoreError> {
403        // The wiki-link doctrine: a link is the full store-relative path, no
404        // `.md` extension. A reference to a target therefore appears literally
405        // as `[[<target>]]`, optionally with a `|display` suffix and (warned
406        // but accepted) a trailing `.md`. Build ONE regex that matches all
407        // accepted spellings of an incoming link to ANY target, escaping each
408        // target so path separators / dots stay literal and the alternation
409        // arms keep their boundaries (a link to `sarah` never matches
410        // `sarah-chen`).
411        let mut arms: Vec<String> = Vec::new();
412        for target in targets {
413            let target_str = path_to_link_str(target);
414            if target_str.is_empty() {
415                continue;
416            }
417            // [[ <target> (.md)? ( | display )? ]]
418            arms.push(format!(
419                r"\[\[{}(\.md)?(\|[^\]]*)?\]\]",
420                regex::escape(&target_str)
421            ));
422        }
423        // No usable targets → no possible incoming links, and an empty pattern
424        // would compile to a match-everything regex. Short-circuit instead.
425        if arms.is_empty() {
426            return Ok(Vec::new());
427        }
428        let pattern = arms.join("|");
429
430        let matcher = RegexMatcher::new(&pattern).map_err(|e| StoreError::Search {
431            root: self.root.clone(),
432            message: format!("invalid backlink pattern: {e}"),
433        })?;
434
435        let mut hits = std::collections::BTreeSet::new();
436        // Scan every `.md` file in the store (skip hidden + `log/`), including
437        // `index.md` catalogs — an incoming reference is wherever the literal
438        // link text lives; the caller decides relevance. ONE walk for the whole
439        // target set; per file we stop at the first hit (presence is all we
440        // need), so a file that links to several targets is read once, not once
441        // per target.
442        for rel in self.walk_all_md()? {
443            let abs = self.abs_path(&rel);
444            let mut matched_here = false;
445            let mut searcher = Searcher::new();
446            // `Lossy`, not `UTF8`: a `.md` file verbatim-ingested into
447            // `sources/` can carry a stray non-UTF-8 byte (e.g. a mis-decoded
448            // Latin-1 import). The `UTF8` sink runs `std::str::from_utf8` on
449            // each matched line and returns an `io::Error` on invalid bytes,
450            // which propagated out of `search_path` and aborted the *entire*
451            // store scan for every caller (`graph backlinks`, the working-set
452            // validate incoming-linker pass) — one bad byte on a single
453            // link-bearing line took the whole batch down. `Lossy` substitutes
454            // replacement characters instead of erroring; the closure ignores
455            // the line text entirely (presence is all we need), so the lossy
456            // conversion has no downside and the scan degrades to "still finds
457            // the link" rather than failing hard.
458            let res = searcher.search_path(
459                &matcher,
460                &abs,
461                Lossy(|_lnum, _line| {
462                    matched_here = true;
463                    // Stop at the first hit: presence is all we need.
464                    Ok(false)
465                }),
466            );
467            if let Err(e) = res {
468                return Err(StoreError::Search {
469                    root: self.root.clone(),
470                    message: format!("search failed in {}: {e}", abs.display()),
471                });
472            }
473            if matched_here {
474                hits.insert(rel);
475            }
476        }
477        Ok(hits.into_iter().collect())
478    }
479
480    /// Candidate set for a `type` query: read every type-folder `index.jsonl`
481    /// sidecar in the type's single layer and return the records of that
482    /// `type`. Complete and cold-cache-proof — NOT a walk-and-parse or a
483    /// frontmatter ripgrep scan, and **never a store-wide read**.
484    ///
485    /// The read is bounded to the type's one layer subtree
486    /// (O(entities-in-layer)): a type lives in exactly one layer, and
487    /// `default_type_folder` always encodes it (recognized → its SPEC layer;
488    /// unrecognized → `records/`), so the walk never fans out across every
489    /// sidecar in the store and stays inside the interactive loop's
490    /// O(entities) contract.
491    ///
492    /// The whole-layer read — rather than reading only the type's canonical
493    /// folder sidecar when it happens to exist — is what makes the result
494    /// *complete*. A single `type` can legitimately be filed across several
495    /// folders within its layer: `wiki-page` under `wiki/<topic>/` for any
496    /// topic (SPEC), or a `contact` filed in `records/clients/` alongside the
497    /// canonical `records/contacts/`. The previous code read only the
498    /// canonical-guess sidecar whenever it was a file, which silently dropped
499    /// those non-canonical records the moment the canonical sidecar existed —
500    /// returning an incomplete set, and a *different* set as the store grew
501    /// (the omission flipped on once one canonical record was added). That
502    /// broke the dedup/enumeration premise this primitive backs and disagreed
503    /// with `find_by_where_in`, which already walks the whole layer. Filtering
504    /// the layer read by `type` keeps the result complete regardless of how the
505    /// type's records are foldered.
506    pub fn find_by_type(&self, type_: &str) -> Result<Vec<IndexRecord>, StoreError> {
507        let canonical_folder = default_type_folder(type_);
508        let records = self.read_all_type_indexes_in(layer_of_folder(&canonical_folder))?;
509        Ok(records.into_iter().filter(|r| r.type_ == type_).collect())
510    }
511
512    /// Candidate set for a `key=value` frontmatter query, **store-wide**: read
513    /// every type-folder `index.jsonl` sidecar and filter their records. The
514    /// unscoped pre-write dedup primitive; prefer [`Store::find_by_where_in`]
515    /// with a layer scope to stay O(entities-in-layer) on the interactive loop.
516    pub fn find_by_where(&self, key: &str, value: &str) -> Result<Vec<IndexRecord>, StoreError> {
517        self.find_by_where_in(key, value, None)
518    }
519
520    /// Candidate set for a `key=value` frontmatter query, **scoped to one
521    /// layer** when `layer` is `Some`: the sidecar walk is confined to that
522    /// layer's subtree (`<root>/<layer>/`), so the I/O is O(entities-in-layer),
523    /// not O(store records). `None` keeps the store-wide read.
524    ///
525    /// This is what makes `--in <layer>` an I/O scope, not just a result
526    /// filter: a `--where`-only query (no `--type`) used to read every sidecar
527    /// in the store and narrow by layer in memory, breaking the O(entities)
528    /// contract the interactive loop depends on. With a layer in hand we walk
529    /// only that layer's sidecars.
530    pub fn find_by_where_in(
531        &self,
532        key: &str,
533        value: &str,
534        layer: Option<Layer>,
535    ) -> Result<Vec<IndexRecord>, StoreError> {
536        // A `key=value` query can target any frontmatter field across any type,
537        // so within the chosen subtree we still read every type-folder sidecar
538        // and filter. The layer (when given) bounds *which* subtree, turning a
539        // whole-store walk into a single-layer walk.
540        let records = self.read_all_type_indexes_in(layer)?;
541        Ok(records
542            .into_iter()
543            .filter(|r| record_matches_field(r, key, value))
544            .collect())
545    }
546
547    /// Every record across the type-folder `index.jsonl` sidecars, scoped to one
548    /// layer when `layer` is `Some` (the walk is confined to `<root>/<layer>/`)
549    /// else store-wide. Sequential, complete sidecar reads — never a
550    /// walk-and-parse of the content tree.
551    ///
552    /// This is the unfiltered sidecar-enumeration primitive the relationship
553    /// loop sits on: [`crate::graph::backlinks_filtered`] uses it to bound its
554    /// candidate set to the relevant layer (or the whole store) without opening
555    /// the content tree, then confirms each candidate's edge by parsing the file.
556    pub fn sidecar_records(&self, layer: Option<Layer>) -> Result<Vec<IndexRecord>, StoreError> {
557        self.read_all_type_indexes_in(layer)
558    }
559
560    /// Parse a type-folder's `index.jsonl` into [`IndexRecord`]s, applying
561    /// last-write-wins by `path` over any un-compacted lines. The sidecar-read
562    /// primitive every structured query sits on.
563    pub fn read_type_index(&self, index_jsonl: &Path) -> Result<Vec<IndexRecord>, StoreError> {
564        let text = std::fs::read_to_string(index_jsonl).map_err(|e| StoreError::BadTypeIndex {
565            path: index_jsonl.to_path_buf(),
566            message: e.to_string(),
567        })?;
568
569        // Last-write-wins by `path` over un-compacted lines: a later line for
570        // the same path supersedes an earlier one (the jsonl is append-mostly
571        // and only compacted on rebuild). Blank lines are skipped; a non-blank
572        // line that is not a valid IndexRecord is a hard parse error.
573        let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
574        for (i, line) in text.lines().enumerate() {
575            let trimmed = line.trim();
576            if trimmed.is_empty() {
577                continue;
578            }
579            let record: IndexRecord =
580                serde_json::from_str(trimmed).map_err(|e| StoreError::BadTypeIndex {
581                    path: index_jsonl.to_path_buf(),
582                    message: format!("line {}: {e}", i + 1),
583                })?;
584            by_path.insert(record.path.clone(), record);
585        }
586        // BTreeMap keyed by path → records emerge sorted by path ascending,
587        // a deterministic order independent of line order in the file.
588        Ok(by_path.into_values().collect())
589    }
590
591    /// Resolve a store-relative path to its absolute on-disk path under
592    /// [`root`](Store::root).
593    pub fn abs_path(&self, store_relative: &Path) -> PathBuf {
594        // `Path::join` returns `store_relative` unchanged if it is already
595        // absolute, so passing an absolute path through is a no-op.
596        self.root.join(store_relative)
597    }
598
599    /// Convert an absolute path under the store into its store-relative form.
600    pub fn rel_path(&self, abs: &Path) -> Option<PathBuf> {
601        abs.strip_prefix(&self.root).ok().map(|p| p.to_path_buf())
602    }
603
604    // ── Private helpers ─────────────────────────────────────────────────────
605
606    /// Resolve a caller-supplied folder path (store-relative or absolute) to an
607    /// absolute path under the store root.
608    fn resolve_under_root(&self, folder: &Path) -> PathBuf {
609        if folder.is_absolute() {
610            folder.to_path_buf()
611        } else {
612            self.root.join(folder)
613        }
614    }
615
616    /// Walk a subtree for content `.md` files (skip hidden dirs, skip `index.md`
617    /// / `DB.md` / `log.md`), returning store-relative paths. Used by the layer
618    /// and type-folder walks.
619    fn walk_content_md(&self, root: &Path) -> Result<Vec<PathBuf>, StoreError> {
620        let mut out = Vec::new();
621        for entry in self.md_walker(root).build() {
622            let entry = entry.map_err(|e| StoreError::Search {
623                root: root.to_path_buf(),
624                message: e.to_string(),
625            })?;
626            if !is_file_entry(&entry) {
627                continue;
628            }
629            let path = entry.path();
630            if !has_md_extension(path) {
631                continue;
632            }
633            if is_non_content_basename(path) {
634                continue;
635            }
636            if let Some(rel) = self.rel_path(path) {
637                out.push(rel);
638            }
639        }
640        out.sort();
641        Ok(out)
642    }
643
644    /// Walk the whole store for **every** `.md` file (including `index.md`),
645    /// skipping hidden dirs and the `log/` archive tree. Used by the backlink
646    /// scan, where the literal link text can live in any markdown file.
647    fn walk_all_md(&self) -> Result<Vec<PathBuf>, StoreError> {
648        let mut out = Vec::new();
649        for entry in self.md_walker(&self.root).build() {
650            let entry = entry.map_err(|e| StoreError::Search {
651                root: self.root.clone(),
652                message: e.to_string(),
653            })?;
654            if !is_file_entry(&entry) {
655                continue;
656            }
657            let path = entry.path();
658            if !has_md_extension(path) {
659                continue;
660            }
661            if self.is_in_log_dir(path) {
662                continue;
663            }
664            if let Some(rel) = self.rel_path(path) {
665                out.push(rel);
666            }
667        }
668        out.sort();
669        Ok(out)
670    }
671
672    /// Read and merge every type-folder `index.jsonl` sidecar under `layer`
673    /// when given, else the whole store (skip hidden + `log/`). Each sidecar is
674    /// read with last-write-wins by path; across sidecars, paths are disjoint by
675    /// construction (one sidecar per folder), so a plain concatenation preserves
676    /// completeness. A layer scope confines the walk to `<root>/<layer>/`, which
677    /// is what keeps `find_by_where_in` O(entities-in-layer).
678    fn read_all_type_indexes_in(
679        &self,
680        layer: Option<Layer>,
681    ) -> Result<Vec<IndexRecord>, StoreError> {
682        let mut out = Vec::new();
683        for sidecar in self.find_type_index_files_in(layer)? {
684            out.extend(self.read_type_index(&self.abs_path(&sidecar))?);
685        }
686        Ok(out)
687    }
688
689    /// Locate every `index.jsonl` sidecar under `layer` (when given) else the
690    /// whole store (skip hidden + `log/`), returning store-relative paths. The
691    /// walk root is `<root>/<layer>/` for a scoped read and `self.root` for the
692    /// store-wide read; a non-existent layer subtree yields no sidecars rather
693    /// than walking a missing path.
694    fn find_type_index_files_in(&self, layer: Option<Layer>) -> Result<Vec<PathBuf>, StoreError> {
695        let walk_root = match layer {
696            Some(l) => self.root.join(l.dir_name()),
697            None => self.root.clone(),
698        };
699        // A scoped walk over a layer folder that does not exist yet must be an
700        // empty result, mirroring `walk_layer`'s missing-dir guard — not a walk
701        // error from `ignore` over a nonexistent path.
702        if !walk_root.is_dir() {
703            return Ok(Vec::new());
704        }
705        let mut out = Vec::new();
706        let mut builder = WalkBuilder::new(&walk_root);
707        builder.standard_filters(false).hidden(true);
708        for entry in builder.build() {
709            let entry = entry.map_err(|e| StoreError::Search {
710                root: walk_root.clone(),
711                message: e.to_string(),
712            })?;
713            if !is_file_entry(&entry) {
714                continue;
715            }
716            let path = entry.path();
717            if path.file_name().and_then(|n| n.to_str()) != Some(TYPE_INDEX_FILE) {
718                continue;
719            }
720            if self.is_in_log_dir(path) {
721                continue;
722            }
723            if let Some(rel) = self.rel_path(path) {
724                out.push(rel);
725            }
726        }
727        out.sort();
728        Ok(out)
729    }
730
731    /// A `WalkBuilder` configured for db.md SWEEPs: gitignore/global-ignore are
732    /// OFF (a SWEEP must see every file even if the store is a git repo with a
733    /// `.gitignore`), but hidden files/dirs are skipped.
734    fn md_walker(&self, root: &Path) -> WalkBuilder {
735        let mut builder = WalkBuilder::new(root);
736        builder.standard_filters(false).hidden(true);
737        builder
738    }
739
740    /// True if an absolute path lives under the store's root-level `log/`
741    /// rotation-archive directory.
742    fn is_in_log_dir(&self, abs: &Path) -> bool {
743        match self.rel_path(abs) {
744            Some(rel) => rel.components().next().map(|c| c.as_os_str()) == Some("log".as_ref()),
745            None => false,
746        }
747    }
748
749    /// Read a file's frontmatter `updated` field as an RFC3339 timestamp,
750    /// returning `None` when absent/unparseable. A self-contained reader (does
751    /// not depend on the not-yet-implemented `parser::read_file`); parses the
752    /// leading `---`-fenced YAML block with the same engine the parser uses.
753    fn read_updated(&self, abs: &Path) -> Option<DateTime<FixedOffset>> {
754        let text = std::fs::read_to_string(abs).ok()?;
755        let yaml = frontmatter_block(&text)?;
756        let value: serde_norway::Value = serde_norway::from_str(yaml).ok()?;
757        let raw = value.get("updated")?;
758        value_to_datetime(raw)
759    }
760
761    /// The `<YYYY>/<MM>` shard segment for a sharding type, from its primary
762    /// date field with a `created` fallback. Reads the public `Frontmatter`
763    /// fields directly. `None` when no usable date is present.
764    fn primary_shard_segment(&self, type_: &str, fm: &Frontmatter) -> Option<(String, String)> {
765        // Try the type's primary date field first.
766        if let Some(field) = primary_date_field(type_) {
767            if let Some(v) = fm.extra.get(field) {
768                if let Some(seg) = value_to_year_month(v) {
769                    return Some(seg);
770                }
771            }
772        }
773        // Universal fallback: the typed `created` timestamp.
774        fm.created
775            .map(|dt| (format!("{:04}", dt.year()), format!("{:02}", dt.month())))
776    }
777}
778
779// ── Free helpers (no `self`) ────────────────────────────────────────────────
780
781/// True if a walk entry is a regular file (not a dir / symlink-to-dir).
782fn is_file_entry(entry: &ignore::DirEntry) -> bool {
783    entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
784}
785
/// Whether `path` carries exactly the lowercase `md` extension. The check is
/// case-sensitive (`README.MD` does not qualify), and a path with no
/// extension — including a bare dotfile such as `.md` — is rejected.
fn has_md_extension(path: &Path) -> bool {
    path.extension().map_or(false, |ext| ext == "md")
}
791
792/// True if the basename is a non-content meta file (`DB.md`, `index.md`,
793/// `log.md`) that the content walks must skip.
794fn is_non_content_basename(path: &Path) -> bool {
795    match path.file_name().and_then(|n| n.to_str()) {
796        Some(name) => NON_CONTENT_BASENAMES.contains(&name),
797        None => false,
798    }
799}
800
/// Return `name` with a `.md` suffix guaranteed: a name that already ends in
/// `.md` (case-sensitive) comes back unchanged, anything else gets `.md`
/// appended.
fn ensure_md_extension(name: &str) -> String {
    let mut file_name = String::from(name);
    if !file_name.ends_with(".md") {
        file_name.push_str(".md");
    }
    file_name
}
809
/// Render a store-relative path as a wiki-link target string.
///
/// Only the path's normal components are kept (so a leading `./`, a root, or
/// `..` contribute nothing), joined with `/` regardless of platform separator.
/// A single trailing `.md` is then removed. Components that are not valid
/// UTF-8 are left out.
fn path_to_link_str(target: &Path) -> String {
    let segments: Vec<&str> = target
        .components()
        .filter_map(|comp| match comp {
            std::path::Component::Normal(os) => os.to_str(),
            _ => None,
        })
        .collect();

    let mut link = segments.join("/");
    if link.ends_with(".md") {
        // Drop exactly one `.md` suffix, in place.
        link.truncate(link.len() - ".md".len());
    }
    link
}
827
/// The canonical default folder for a `type_`, per the SPEC type table
/// (`email → sources/emails`, `expense → records/expenses`, …).
///
/// A type that is not in the table falls back to `records/<type>`, using the
/// bare type name verbatim — no pluralization guess.
fn default_type_folder(type_: &str) -> PathBuf {
    // (type, canonical folder) pairs, grouped by layer.
    const CANONICAL_FOLDERS: [(&str, &str); 10] = [
        // sources
        ("email", "sources/emails"),
        ("transcript", "sources/transcripts"),
        ("pdf-source", "sources/docs"),
        // records — entities
        ("contact", "records/contacts"),
        ("company", "records/companies"),
        // records — events
        ("expense", "records/expenses"),
        ("meeting", "records/meetings"),
        ("decision", "records/decisions"),
        ("invoice", "records/invoices"),
        // wiki — the SPEC type table files a wiki-page under `wiki/<topic>/`,
        // i.e. ALWAYS a sub-folder, never flat under `wiki/`. A 2-component
        // `wiki/<file>` path is non-conforming: `index::type_folder_of` /
        // `validate::type_folder_of` require `<layer>/<type-folder>/<file>`
        // (3 components), so a flat wiki page either crashes write-through
        // (`on_write` tries to create `index.md` *inside* a file) or is
        // silently dropped from every catalog by `rebuild_all`. With only the
        // bare type in hand here, `wiki/topics` is the deterministic default
        // folder (matches the dogfood store).
        ("wiki-page", "wiki/topics"),
    ];

    for (known, folder) in CANONICAL_FOLDERS.iter() {
        if *known == type_ {
            return PathBuf::from(*folder);
        }
    }
    // Unrecognized: bare type name under records/.
    PathBuf::from("records").join(type_)
}
861
862/// The canonical [`Layer`] a `type_` belongs to, derived from its default
863/// type-folder (`email` → `Sources`, `contact` → `Records`, `wiki-page` →
864/// `Wiki`, unrecognized → `Records`). The write path uses this to decide whether
865/// an agent-supplied folder is in the *right* layer for the type before honouring
866/// its sub-folder choice.
867pub fn layer_for_type(type_: &str) -> Layer {
868    layer_of_folder(&default_type_folder(type_)).unwrap_or(Layer::Records)
869}
870
871/// The [`Layer`] a type-folder path lives in, read from its first component
872/// (`sources/` → `Sources`, `records/` → `Records`, `wiki/` → `Wiki`). Used to
873/// bound [`Store::find_by_type`]'s whole-layer sidecar read to a single layer
874/// subtree. Returns `None` for a path with no recognized layer prefix; every
875/// value [`default_type_folder`] produces has one, so in practice this is
876/// always `Some` on the call path — `None` degrades to a store-wide read.
877fn layer_of_folder(folder: &Path) -> Option<Layer> {
878    let first = folder.components().next()?.as_os_str().to_str()?;
879    Layer::from_dir_name(first)
880}
881
/// Infer a content file's canonical `type` from its store-relative path — the
/// inverse of [`default_type_folder`] and the single source of truth for
/// path→type inference (the CLI's `fm init` calls this, never re-derives it).
///
/// Requires the canonical `<layer>/<type-folder>/<file>` shape: the first
/// three components are read *positionally* as layer, type-folder, and "there
/// is something below the folder". A leading `./` is tolerated. The result is
/// `None` when:
///
/// * the path is shorter than three components (a file directly under a layer);
/// * the leading layer is not `sources`, `records`, or `wiki`;
/// * any of those three components is not a plain name — `..`, a filesystem
///   root, a Windows prefix — or is not valid UTF-8. Such components are
///   rejected rather than skipped, so they can neither shift a later component
///   into the layer/folder slot nor be returned as a "type" (`records/../x.md`
///   must not infer the type `..`).
///
/// Recognized `(layer, folder)` pairs map back to their canonical type. For an
/// unrecognized folder the fallback is the **bare folder name verbatim** (no
/// pluralization/singularization) so it round-trips with `default_type_folder`,
/// whose unrecognized fallback is the bare type name (`task` ⇄ `records/task`).
/// Singularizing here would break that round-trip (`records/tasks` → `task`
/// while `default_type_folder("task")` → `records/task`). `wiki/<topic>` always
/// infers `wiki-page`, since every wiki page is filed under a topic folder.
pub fn infer_type_from_path(rel: &Path) -> Option<String> {
    // Each remaining component becomes `Some(name)` if it is a plain UTF-8
    // name and `None` otherwise, so an unusable component stays in its slot
    // and fails the `??` below instead of silently disappearing.
    let mut names = rel
        .components()
        // `Path::components` only ever yields `.` at the very start.
        .skip_while(|c| matches!(c, std::path::Component::CurDir))
        .map(|c| match c {
            std::path::Component::Normal(os) => os.to_str(),
            _ => None,
        });

    let layer = names.next()??;
    if !matches!(layer, "sources" | "records" | "wiki") {
        return None;
    }
    let folder = names.next()??;
    // The file itself must be a third component (a real type-folder, not the
    // file sitting directly under the layer).
    let _below_folder = names.next()??;

    let mapped = match (layer, folder) {
        ("sources", "emails") => "email",
        ("sources", "transcripts") => "transcript",
        ("sources", "docs") => "pdf-source",
        ("records", "contacts") => "contact",
        ("records", "companies") => "company",
        ("records", "expenses") => "expense",
        ("records", "meetings") => "meeting",
        ("records", "decisions") => "decision",
        ("records", "invoices") => "invoice",
        // Every wiki page is filed under `wiki/<topic>/`; the type is always
        // `wiki-page` regardless of the topic-folder name.
        ("wiki", _) => "wiki-page",
        // Unrecognized folder: the bare name, verbatim. This is the inverse of
        // `default_type_folder`'s unrecognized fallback (`other → records/other`)
        // and the round-trip would break if we pluralized/singularized here.
        (_, other) => other,
    };
    Some(mapped.to_string())
}
928
/// Name of the frontmatter field whose value drives the `<YYYY>/<MM>` shard
/// for a sharding type. `None` means the type has no canonical date field
/// name and the caller should rely on the `created` fallback alone.
fn primary_date_field(type_: &str) -> Option<&'static str> {
    let field = match type_ {
        "email" | "expense" | "invoice" | "meeting" => "date",
        "transcript" => "recorded_at",
        "pdf-source" => "received_at",
        // Everything else (including custom event types) has no canonical
        // date field name; it falls back to `created`.
        _ => return None,
    };
    Some(field)
}
942
943/// Parse a YAML value into an RFC3339 [`DateTime`], accepting both an explicit
944/// string and a YAML-native scalar rendered to string.
945fn value_to_datetime(value: &serde_norway::Value) -> Option<DateTime<FixedOffset>> {
946    let s = yaml_scalar_string(value)?;
947    DateTime::parse_from_rfc3339(s.trim()).ok()
948}
949
950/// Extract `(YYYY, MM)` from a YAML date/timestamp value. Lenient: matches a
951/// leading `YYYY-MM` so a bare `2026-05-22` date and a full
952/// `2026-05-22T10:00:00-07:00` timestamp both work.
953fn value_to_year_month(value: &serde_norway::Value) -> Option<(String, String)> {
954    let s = yaml_scalar_string(value)?;
955    year_month_from_str(s.trim())
956}
957
/// `(YYYY, MM)` taken from the leading `YYYY-MM` of a date string.
///
/// The first seven bytes must be four ASCII digits, a `-`, and two ASCII
/// digits forming a month in `01..=12`; whatever follows is ignored. Anything
/// else — including input shorter than seven bytes — yields `None`.
fn year_month_from_str(s: &str) -> Option<(String, String)> {
    // Hand-rolled on purpose: no regex compile on the write path.
    let head = s.as_bytes().get(..7)?;

    let well_formed = head.iter().enumerate().all(|(idx, byte)| {
        if idx == 4 {
            *byte == b'-'
        } else {
            byte.is_ascii_digit()
        }
    });
    if !well_formed {
        return None;
    }

    let month = (head[5] - b'0') * 10 + (head[6] - b'0');
    if month == 0 || month > 12 {
        return None;
    }

    // All seven bytes are ASCII, so these byte offsets are char boundaries.
    Some((s[..4].to_owned(), s[5..7].to_owned()))
}
983
984/// Render a YAML scalar as a string: a real `String` verbatim, otherwise the
985/// value's compact YAML serialization (covers timestamps that the YAML engine
986/// may surface as a non-string scalar).
987fn yaml_scalar_string(value: &serde_norway::Value) -> Option<String> {
988    if let Some(s) = value.as_str() {
989        return Some(s.to_string());
990    }
991    match value {
992        serde_norway::Value::Null => None,
993        serde_norway::Value::Mapping(_) | serde_norway::Value::Sequence(_) => None,
994        other => serde_norway::to_string(other)
995            .ok()
996            .map(|s| s.trim().to_string()),
997    }
998}
999
1000/// The YAML frontmatter block of a file: the text between a leading `---` fence
1001/// and the next `---` fence, exclusive. `None` if the file does not open with a
1002/// `---` fence on its first line.
1003fn frontmatter_block(text: &str) -> Option<&str> {
1004    // Tolerate a UTF-8 BOM and CRLF, but the fence must be the very first line.
1005    let body = text.strip_prefix('\u{feff}').unwrap_or(text);
1006    let mut rest = body;
1007    // First line must be exactly `---` (allowing trailing CR).
1008    let (first, after_first) = split_first_line(rest);
1009    if first.trim_end_matches('\r') != "---" {
1010        return None;
1011    }
1012    rest = after_first;
1013    let block_start = rest;
1014    let mut scanned = 0usize;
1015    loop {
1016        let (line, after) = split_first_line(rest);
1017        if line.trim_end_matches('\r') == "---" {
1018            return Some(&block_start[..scanned]);
1019        }
1020        if after.is_empty() && line.is_empty() {
1021            // Reached end of input without a closing fence.
1022            return None;
1023        }
1024        scanned += line.len() + 1; // +1 for the consumed '\n'
1025        if after.is_empty() {
1026            return None;
1027        }
1028        rest = after;
1029    }
1030}
1031
/// Split `s` at its first `\n` into (the first line, without that `\n`) and
/// (everything after it). With no newline present, the whole string is the
/// line and the remainder is empty. A `\r` before the `\n` is left on the line.
fn split_first_line(s: &str) -> (&str, &str) {
    s.split_once('\n').unwrap_or((s, ""))
}
1041
1042/// True if an [`IndexRecord`] has a field `key` equal to `value`, checking the
1043/// typed columns first and then the flattened `fields` map.
1044fn record_matches_field(record: &IndexRecord, key: &str, value: &str) -> bool {
1045    match key {
1046        "type" => record.type_ == value,
1047        "summary" => record.summary == value,
1048        "path" => record.path.to_string_lossy() == value,
1049        "created" => timestamp_matches(record.created, value),
1050        "updated" => timestamp_matches(record.updated, value),
1051        "tags" => record.tags.iter().any(|t| t == value),
1052        "links" => record.links.iter().any(|l| l == value),
1053        other => record
1054            .fields
1055            .get(other)
1056            .map(|v| json_value_matches(v, value))
1057            .unwrap_or(false),
1058    }
1059}
1060
1061/// Compare a record's `created`/`updated` instant against a query `value`.
1062///
1063/// db.md files write timestamps in several equivalent RFC3339 spellings — most
1064/// commonly the `Z` UTC designator (`2026-05-01T00:00:00Z`) but also an explicit
1065/// offset (`...+00:00`, `...-07:00`). A naive `record.created.to_rfc3339() ==
1066/// value` reformats only one side: chrono renders a UTC instant as `+00:00`, so
1067/// the `Z` form an agent reads straight out of the file would never match. We
1068/// instead parse `value` as RFC3339 and compare instants, where `Z` and `+00:00`
1069/// (and any same-instant offset) are equal. A `value` that is not valid RFC3339
1070/// can never equal a real timestamp, so it falls through to `false`.
1071fn timestamp_matches(stored: Option<DateTime<FixedOffset>>, value: &str) -> bool {
1072    match (stored, DateTime::parse_from_rfc3339(value)) {
1073        (Some(stored), Ok(queried)) => stored == queried,
1074        _ => false,
1075    }
1076}
1077
1078/// Compare a JSON field value against a query string. A string matches
1079/// verbatim; scalars match their textual form; an array matches if any element
1080/// matches (so a list-valued frontmatter field is membership-queried).
1081fn json_value_matches(v: &serde_json::Value, value: &str) -> bool {
1082    match v {
1083        serde_json::Value::String(s) => s == value,
1084        serde_json::Value::Bool(b) => b.to_string() == value,
1085        serde_json::Value::Number(n) => n.to_string() == value,
1086        serde_json::Value::Array(items) => items.iter().any(|i| json_value_matches(i, value)),
1087        // A present-but-null field never matches — consistent with the in-memory
1088        // post-filter (`query::json_value_matches`, which the first `where`
1089        // clause is NOT re-checked against, so the two must agree here or a
1090        // `--where field=` query would return different rows than `--type X
1091        // --where field=`).
1092        serde_json::Value::Null => false,
1093        serde_json::Value::Object(_) => false,
1094    }
1095}
1096
1097#[cfg(test)]
1098mod tests {
1099    use super::*;
1100    use std::fs;
1101    use tempfile::{tempdir, TempDir};
1102
1103    // ── Fixtures ────────────────────────────────────────────────────────────
1104
/// Fixture writer: put `contents` at `<root>/<rel>`, creating any missing
/// parent directories first, and hand back `rel` as a `PathBuf` for
/// convenient assertions. Panics on any I/O failure.
fn write(root: &Path, rel: &str, contents: &str) -> PathBuf {
    let target = root.join(rel);
    let parent = target.parent().unwrap();
    fs::create_dir_all(parent).unwrap();
    fs::write(&target, contents).unwrap();
    PathBuf::from(rel)
}
1113
/// Text of a minimal content file of type `note` whose `created` and
/// `updated` frontmatter fields are both set to the given timestamp.
fn content_md(updated: &str) -> String {
    let rendered = format!(
        "---\ntype: note\ncreated: {updated}\nupdated: {updated}\nsummary: a note\n---\n\nbody\n"
    );
    rendered
}
1120
1121    /// A bare directory with a `DB.md` marker (valid `db-md` frontmatter so the
1122    /// real parser is exercised).
1123    fn empty_store() -> TempDir {
1124        let dir = tempdir().unwrap();
1125        fs::write(
1126            dir.path().join("DB.md"),
1127            "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n",
1128        )
1129        .unwrap();
1130        dir
1131    }
1132
1133    /// Open a store rooted at a TempDir; panics if `open` rejects it.
1134    fn open(dir: &TempDir) -> Store {
1135        Store::open(dir.path()).expect("fixture should be a valid store")
1136    }
1137
/// Render paths as strings with `/` separators (any `\` is replaced), so
/// assertions read the same on every platform.
fn rels(paths: &[PathBuf]) -> Vec<String> {
    let mut rendered = Vec::with_capacity(paths.len());
    for path in paths {
        rendered.push(path.to_string_lossy().replace('\\', "/"));
    }
    rendered
}
1144
1145    // ── Layer ───────────────────────────────────────────────────────────────
1146
#[test]
fn layer_dir_name_and_parse_are_inverse() {
    // Round trip: every layer's directory name parses back to that layer.
    for layer in Layer::all() {
        assert_eq!(Layer::from_dir_name(layer.dir_name()), Some(layer));
    }

    // The directory names themselves are pinned.
    for (layer, expected) in [
        (Layer::Sources, "sources"),
        (Layer::Records, "records"),
        (Layer::Wiki, "wiki"),
    ] {
        assert_eq!(layer.dir_name(), expected);
    }

    // `log` is not a layer, and parsing is case-sensitive.
    for not_a_layer in ["log", "Sources"] {
        assert_eq!(Layer::from_dir_name(not_a_layer), None);
    }
}
1158
#[test]
fn layer_order_is_canonical() {
    // stats keys a BTreeMap on Layer, so `Ord` must yield
    // sources < records < wiki.
    let canonical = [Layer::Sources, Layer::Records, Layer::Wiki];
    let mut shuffled = [Layer::Wiki, Layer::Sources, Layer::Records];
    shuffled.sort();
    assert_eq!(shuffled, canonical);
}
1166
1167    // ── is_db_md_store / open ────────────────────────────────────────────────
1168
#[test]
fn is_store_true_only_with_uppercase_marker() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // Before the marker exists the directory is not a store…
    let detected_before = Store::is_db_md_store(root);
    assert!(!detected_before, "no marker → not a store");

    // …and it becomes one as soon as an uppercase `DB.md` appears.
    fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n").unwrap();
    let detected_after = Store::is_db_md_store(root);
    assert!(detected_after, "uppercase DB.md → store");
}
1180
#[test]
fn is_store_false_for_lowercase_db_md() {
    // The case-sensitivity contract: a lowercase db.md is the spec name, not
    // a marker — even on a case-insensitive filesystem where Path::exists
    // would lie. This test must pass on macOS (case-insensitive) too.
    let dir = tempdir().unwrap();
    let root = dir.path();
    let lowercase_marker = root.join("db.md");
    fs::write(lowercase_marker, "---\ntype: db-md\n---\n").unwrap();

    let detected = Store::is_db_md_store(root);
    assert!(
        !detected,
        "lowercase db.md must NOT be treated as a store marker"
    );
    assert!(Store::open(root).is_err());
}
1194
#[test]
fn is_store_false_when_db_md_is_a_directory() {
    // The marker must be a file; a directory of the same name does not count.
    let dir = tempdir().unwrap();
    let root = dir.path();
    fs::create_dir(root.join("DB.md")).unwrap();

    let detected = Store::is_db_md_store(root);
    assert!(!detected, "a directory named DB.md is not the file marker");
}
1204
#[test]
fn open_rejects_non_store_with_path() {
    // Opening a directory without a marker fails, and the error names the
    // directory that was tried.
    let dir = tempdir().unwrap();
    let root = dir.path();
    let err = Store::open(root).unwrap_err();
    assert_eq!(err.path, root);
}
1211
#[test]
fn open_succeeds_and_parses_config() {
    let dir = tempdir().unwrap();
    let root = dir.path();
    // A DB.md whose ## Policies declares a frozen page — proves open()
    // actually parsed the config rather than substituting a default.
    let marker = root.join("DB.md");
    fs::write(
        marker,
        "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n\n\
         ## Policies\n\n### Frozen pages\n- records/decisions/q1.md\n",
    )
    .unwrap();

    let store = Store::open(root).unwrap();
    assert_eq!(store.root, root);

    let frozen = &store.config.frozen_pages;
    let declared = Path::new("records/decisions/q1.md");
    assert!(
        frozen.iter().any(|page| page == declared),
        "open() must surface DB.md ## Policies, got {:?}",
        frozen
    );
}
1235
1236    // ── walk / walk_layer / walk_type_folder ─────────────────────────────────
1237
#[test]
fn walk_collects_content_across_layers_skipping_meta_and_log() {
    let dir = empty_store();
    let root = dir.path();

    // One real content file per layer — these are what walk() must return.
    for (rel, updated) in [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("records/contacts/sarah.md", "2026-05-02T00:00:00Z"),
        ("wiki/people/sarah.md", "2026-05-03T00:00:00Z"),
    ] {
        write(root, rel, &content_md(updated));
    }

    // Things walk() must SKIP:
    for (rel, contents) in [
        ("sources/emails/index.md", "---\ntype: index\n---\n"), // catalog
        ("index.md", "---\ntype: index\n---\n"),                // root catalog
        ("log.md", "---\ntype: log\n---\n"),                    // log
        ("log/2026-04.md", "---\ntype: log\n---\n"),            // rotated log archive
    ] {
        write(root, rel, contents);
    }
    // hidden dir
    write(
        root,
        "sources/.hidden/secret.md",
        &content_md("2026-05-09T00:00:00Z"),
    );
    // non-md
    write(root, "records/contacts/notes.txt", "not markdown");

    let store = open(&dir);
    let found = rels(&store.walk().unwrap());
    assert_eq!(
        found,
        [
            "records/contacts/sarah.md",
            "sources/emails/2026/05/a.md",
            "wiki/people/sarah.md",
        ]
    );
}
1280
#[test]
fn walk_includes_content_named_log_md_or_db_md_inside_a_layer() {
    let dir = empty_store();
    let root = dir.path();

    // A content file that merely happens to be named log.md / DB.md INSIDE a
    // layer is real content — those names are reserved only at the store root.
    for (rel, updated) in [
        ("records/configs/log.md", "2026-05-01T00:00:00Z"),
        ("sources/docs/DB.md", "2026-05-02T00:00:00Z"),
    ] {
        write(root, rel, &content_md(updated));
    }
    // The derived catalog twin is still skipped at any depth.
    write(root, "records/configs/index.md", "---\ntype: index\n---\n");

    let store = open(&dir);
    let got = rels(&store.walk().unwrap());

    let has = |wanted: &str| got.iter().any(|p| p == wanted);
    assert!(
        has("records/configs/log.md"),
        "layer-internal log.md is content: {got:?}"
    );
    assert!(
        has("sources/docs/DB.md"),
        "layer-internal DB.md is content: {got:?}"
    );
    assert!(
        got.iter().all(|p| !p.ends_with("index.md")),
        "index.md is still skipped: {got:?}"
    );
}
1314
#[test]
fn walk_layer_is_scoped() {
    let dir = empty_store();
    let root = dir.path();
    for (rel, updated) in [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("records/contacts/sarah.md", "2026-05-02T00:00:00Z"),
    ] {
        write(root, rel, &content_md(updated));
    }
    let store = open(&dir);

    // Each layer walk sees only its own subtree.
    let sources = rels(&store.walk_layer(Layer::Sources).unwrap());
    assert_eq!(sources, ["sources/emails/2026/05/a.md"]);

    let records = rels(&store.walk_layer(Layer::Records).unwrap());
    assert_eq!(records, ["records/contacts/sarah.md"]);

    // A layer with no directory is empty, not an error.
    let wiki = store.walk_layer(Layer::Wiki).unwrap();
    assert!(wiki.is_empty());
}
1342
#[test]
fn walk_type_folder_recurses_shards_and_accepts_abs_or_rel() {
    let dir = empty_store();
    let root = dir.path();

    // Two shards of the same type folder.
    for (rel, updated) in [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("sources/emails/2026/06/b.md", "2026-06-01T00:00:00Z"),
    ] {
        write(root, rel, &content_md(updated));
    }
    // The folder's catalog is skipped.
    write(root, "sources/emails/index.md", "---\ntype: index\n---\n");
    // A different type folder must not leak in.
    write(
        root,
        "sources/docs/2026/05/c.md",
        &content_md("2026-05-04T00:00:00Z"),
    );
    let store = open(&dir);

    let expected = [
        "sources/emails/2026/05/a.md",
        "sources/emails/2026/06/b.md",
    ];

    // Relative folder arg.
    let via_rel = rels(&store.walk_type_folder(Path::new("sources/emails")).unwrap());
    assert_eq!(via_rel, expected);

    // Absolute folder arg under the store resolves identically.
    let abs_folder = root.join("sources/emails");
    let via_abs = rels(&store.walk_type_folder(&abs_folder).unwrap());
    assert_eq!(via_abs, expected);
}
1385
1386    // ── recent_in_type_folder ────────────────────────────────────────────────
1387
#[test]
fn recent_orders_by_updated_desc_then_path_and_caps() {
    let dir = empty_store();
    let root = dir.path();
    for (rel, updated) in [
        // newest
        ("records/meetings/2026/05/c.md", "2026-05-03T00:00:00Z"),
        // tie on updated — path asc decides (a before b)
        ("records/meetings/2026/05/a.md", "2026-05-02T00:00:00Z"),
        ("records/meetings/2026/05/b.md", "2026-05-02T00:00:00Z"),
        // oldest
        ("records/meetings/2026/04/z.md", "2026-04-01T00:00:00Z"),
    ] {
        write(root, rel, &content_md(updated));
    }
    let store = open(&dir);
    let folder = Path::new("records/meetings");

    // A generous cap returns everything, newest first.
    let all = rels(&store.recent_in_type_folder(folder, 10).unwrap());
    assert_eq!(
        all,
        [
            "records/meetings/2026/05/c.md", // newest
            "records/meetings/2026/05/a.md", // tie, path asc
            "records/meetings/2026/05/b.md",
            "records/meetings/2026/04/z.md", // oldest
        ]
    );

    // Cap takes the n most-recent.
    let top2 = rels(&store.recent_in_type_folder(folder, 2).unwrap());
    assert_eq!(
        top2,
        [
            "records/meetings/2026/05/c.md",
            "records/meetings/2026/05/a.md",
        ]
    );
}
1446
/// A file whose frontmatter has no `updated` field must sort after a file
/// that carries one.
#[test]
fn recent_sorts_undated_files_last() {
    let dir = empty_store();
    let root = dir.path();

    let dated_body = content_md("2026-05-01T00:00:00Z");
    write(root, "records/contacts/dated.md", &dated_body);
    // This frontmatter deliberately omits `updated` entirely.
    let undated_body = "---\ntype: contact\nsummary: x\n---\nbody\n";
    write(root, "records/contacts/undated.md", undated_body);

    let store = open(&dir);
    let listing = store
        .recent_in_type_folder(Path::new("records/contacts"), 10)
        .unwrap();
    let got = rels(&listing);

    let expected = vec![
        String::from("records/contacts/dated.md"),
        String::from("records/contacts/undated.md"),
    ];
    assert_eq!(
        got, expected,
        "a file with a real `updated` must outrank one with none"
    );
}
1477
1478    // ── type_shards ──────────────────────────────────────────────────────────
1479
/// Pins which built-in type names `type_shards` reports as sharding and
/// which it reports as flat, on a store with no schema overrides set here.
#[test]
fn type_shards_classification() {
    let dir = empty_store();
    let store = open(&dir);

    // Types expected to shard.
    let sharding = [
        "email",
        "transcript",
        "pdf-source",
        "expense",
        "invoice",
        "meeting",
        "order",
        "ticket",
        "transaction",
    ];
    // Types expected to stay flat.
    let flat = [
        "contact",
        "company",
        "decision",
        "wiki-page",
        "index",
        "log",
        "db-md",
        "proposal",
    ];

    for ty in sharding {
        assert!(store.type_shards(ty), "{ty} should shard");
    }
    for ty in flat {
        assert!(!store.type_shards(ty), "{ty} should stay flat");
    }
}
1510
/// A schema's `shard` directive overrides the built-in classification in
/// both directions, while a schema without one leaves the default alone.
#[test]
fn type_shards_respects_schema_directive_both_directions() {
    use crate::parser::{Config, Schema};

    // A schema that sets only the shard directive; every other field default.
    let with_directive = |shard: bool| Schema {
        shard: Some(shard),
        ..Schema::default()
    };

    let mut config = Config::default();
    // A CUSTOM type (absent from the built-in list) opts INTO sharding.
    config
        .schemas
        .insert("shipment".to_string(), with_directive(true));
    // A BUILT-IN event type opts OUT; the override must beat the default.
    config
        .schemas
        .insert("expense".to_string(), with_directive(false));
    // A schema that says nothing about sharding keeps the built-in default.
    config
        .schemas
        .insert("meeting".to_string(), Schema::default());

    let dir = empty_store();
    let mut store = open(&dir);
    store.config = config;

    // (type, expected `type_shards` answer, failure message)
    let expectations = [
        (
            "shipment",
            true,
            "custom type with `shard: by-date` must shard",
        ),
        (
            "expense",
            false,
            "built-in event type with `shard: flat` must go flat",
        ),
        (
            "meeting",
            true,
            "schema without a `shard:` directive keeps the built-in default",
        ),
        ("contact", false, "unconfigured entity type stays flat"),
    ];
    for (ty, should_shard, why) in expectations {
        assert!(store.type_shards(ty) == should_shard, "{why}");
    }
}
1558
1559    // ── shard_path_for ───────────────────────────────────────────────────────
1560
1561    fn fm_with_extra(key: &str, value: &str) -> Frontmatter {
1562        let mut fm = Frontmatter::default();
1563        fm.extra.insert(
1564            key.to_string(),
1565            serde_norway::Value::String(value.to_string()),
1566        );
1567        fm
1568    }
1569
1570    fn fm_with_created(rfc3339: &str) -> Frontmatter {
1571        Frontmatter {
1572            created: Some(DateTime::parse_from_rfc3339(rfc3339).unwrap()),
1573            ..Default::default()
1574        }
1575    }
1576
/// Each sharding type takes its `<YYYY>/<MM>` segment from its own primary
/// date field, supplied here through the frontmatter's `extra` map.
#[test]
fn shard_path_uses_primary_date_field_per_type() {
    let dir = empty_store();
    let store = open(&dir);

    // (type, date field, field value, file name, expected path)
    let cases = [
        // expense.date → records/expenses/<YYYY>/<MM>/
        (
            "expense",
            "date",
            "2026-05-22",
            "lunch",
            "records/expenses/2026/05/lunch.md",
        ),
        // email.date → sources/emails/<YYYY>/<MM>/
        (
            "email",
            "date",
            "2026-11-02T09:00:00-07:00",
            "e1",
            "sources/emails/2026/11/e1.md",
        ),
        // transcript.recorded_at → sources/transcripts/<YYYY>/<MM>/
        (
            "transcript",
            "recorded_at",
            "2025-01-15T12:00:00Z",
            "t1",
            "sources/transcripts/2025/01/t1.md",
        ),
    ];

    for (type_, field, value, name, expected) in cases {
        let frontmatter = fm_with_extra(field, value);
        let computed = store.shard_path_for(type_, &frontmatter, name).unwrap();
        assert_eq!(computed, PathBuf::from(expected));
    }
}
1608
/// With no primary date field present, the shard segment is derived from
/// the frontmatter's `created` timestamp.
#[test]
fn shard_path_falls_back_to_created() {
    let dir = empty_store();
    let store = open(&dir);

    // A meeting with no `date` entry, only a `created` timestamp.
    let frontmatter = fm_with_created("2024-07-09T08:30:00-04:00");
    let computed = store
        .shard_path_for("meeting", &frontmatter, "sync")
        .unwrap();

    let expected = PathBuf::from("records/meetings/2024/07/sync.md");
    assert_eq!(computed, expected);
}
1623
/// When both `date` and `created` are present, `date` picks the shard.
#[test]
fn shard_path_primary_field_wins_over_created() {
    let dir = empty_store();
    let store = open(&dir);

    // `created` points at 2020/01 while the primary `date` points at 2026/05.
    let mut frontmatter = fm_with_created("2020-01-01T00:00:00Z");
    let primary_date = serde_norway::Value::String("2026-05-22".into());
    frontmatter.extra.insert("date".into(), primary_date);

    let computed = store.shard_path_for("expense", &frontmatter, "x").unwrap();
    // The primary `date` (2026/05), not `created` (2020/01), drives the shard.
    assert_eq!(computed, PathBuf::from("records/expenses/2026/05/x.md"));
}
1637
/// Flat types are filed directly under their type-folder, with no
/// `<YYYY>/<MM>` segment even when a date is available.
#[test]
fn shard_path_flat_types_have_no_shard_segment() {
    let dir = empty_store();
    let store = open(&dir);

    // A contact carries a `created` date, yet contacts stay flat.
    let contact_fm = fm_with_created("2026-05-22T00:00:00Z");
    let contact_path = store
        .shard_path_for("contact", &contact_fm, "sarah-chen")
        .unwrap();
    assert_eq!(
        contact_path,
        PathBuf::from("records/contacts/sarah-chen.md")
    );

    // wiki-page is flat (no date shard) but still lands in a type-folder:
    // `wiki/topics/<name>.md`, NEVER `wiki/<name>.md`. A 2-component path is
    // invisible to the index/validate type-folder model.
    let wiki_fm = Frontmatter::default();
    let wiki_path = store
        .shard_path_for("wiki-page", &wiki_fm, "renewal-theme")
        .unwrap();
    assert_eq!(wiki_path, PathBuf::from("wiki/topics/renewal-theme.md"));
}
1660
/// Regression: the path the toolkit computes for a wiki-page must be one
/// the index + validate type-folder model accepts.
/// `shard_path_for("wiki-page", …)` used to return a 2-component
/// `wiki/<file>` path, which `type_folder_of` (in both `index` and
/// `validate`) treats as "no type-folder" — so the page either crashed
/// `Index::on_write` (it tried to create `index.md` inside a file) or was
/// silently dropped from every catalog by `Index::rebuild_all`. The computed
/// path must therefore have 3 components: `<layer>/<type-folder>/<file>`.
#[test]
fn shard_path_wiki_page_is_indexable_three_component_path() {
    let dir = empty_store();
    let store = open(&dir);
    let p = store
        .shard_path_for("wiki-page", &Frontmatter::default(), "renewal-theme")
        .unwrap();

    // Layer, then a non-empty type-folder segment, then the file: exactly
    // the shape `type_folder_of` (`comps.len() >= 3`, `comps[0]` a known
    // layer) requires.
    let comps: Vec<&str> = p.iter().filter_map(std::ffi::OsStr::to_str).collect();
    assert_eq!(
        comps.len(),
        3,
        "wiki-page path must be <layer>/<type-folder>/<file>, got {p:?}"
    );

    let (layer, type_folder, file) = (comps[0], comps[1], comps[2]);
    assert_eq!(layer, "wiki", "first component must be the wiki layer");
    assert!(
        !(type_folder.is_empty() || type_folder == "renewal-theme.md"),
        "second component must be a real type-folder, not the file: {p:?}"
    );
    assert!(
        file.ends_with(".md"),
        "third component must be the .md file: {p:?}"
    );
}
1695
/// A name given with or without `.md` resolves to the same `.md` path.
#[test]
fn shard_path_preserves_and_adds_md_extension() {
    let dir = empty_store();
    let store = open(&dir);
    let frontmatter = Frontmatter::default();
    let expected = PathBuf::from("records/contacts/sarah.md");

    // Extension already present: kept as-is rather than doubled.
    let with = store
        .shard_path_for("contact", &frontmatter, "sarah.md")
        .unwrap();
    // Extension missing: appended.
    let without = store
        .shard_path_for("contact", &frontmatter, "sarah")
        .unwrap();

    assert_eq!(with, expected);
    assert_eq!(without, expected);
}
1709
/// A sharding type with neither a primary date nor `created` must fail
/// with `StoreError::NoShardDate`, naming the unsharded path.
#[test]
fn shard_path_errors_when_sharding_type_has_no_date() {
    let dir = empty_store();
    let store = open(&dir);

    // expense shards, but default frontmatter has no `date` and no `created`.
    let outcome = store.shard_path_for("expense", &Frontmatter::default(), "mystery");
    let err = outcome.unwrap_err();

    if let StoreError::NoShardDate { file } = err {
        assert_eq!(file, PathBuf::from("records/expenses/mystery.md"));
    } else {
        panic!("expected NoShardDate, got {err:?}");
    }
}
1725
1726    // ── find_links_to ────────────────────────────────────────────────────────
1727
/// `find_links_to` must report every file containing an accepted spelling
/// of a full-path link to the target, and nothing that merely resembles one.
#[test]
fn find_links_to_matches_all_accepted_spellings() {
    let dir = empty_store();
    let root = dir.path();
    let target = "records/contacts/sarah-chen";
    // Frontmatter shared by every wiki-page fixture below.
    let wiki = "---\ntype: wiki-page\nsummary: s\n---\n";

    let fixtures: Vec<(&str, String)> = vec![
        // Plain link.
        ("wiki/people/sarah.md", format!("{wiki}See [[{target}]].\n")),
        // Link with display text.
        (
            "records/meetings/2026/05/m.md",
            format!("---\ntype: meeting\nsummary: s\n---\nWith [[{target}|Sarah]].\n"),
        ),
        // Link with .md extension (accepted, warned by validate).
        ("wiki/themes/t.md", format!("{wiki}[[{target}.md]]\n")),
        // A catalog/index file also contains the link literally — included.
        (
            "records/contacts/index.md",
            format!("---\ntype: index\n---\n- [[{target}]] — Sarah\n"),
        ),
        // No link to the target.
        ("wiki/people/elena.md", format!("{wiki}No links here.\n")),
        // Short-form link must NOT match the full-path target.
        ("wiki/people/bob.md", format!("{wiki}[[sarah-chen]]\n")),
        // A longer path that merely starts with the target must NOT match
        // (boundary correctness): target `sarah-chen` vs `sarah-chen-jr`.
        ("wiki/people/jr.md", format!("{wiki}[[{target}-jr]]\n")),
    ];
    for (rel, content) in &fixtures {
        write(root, rel, content);
    }

    let store = open(&dir);
    let linkers = store.find_links_to(Path::new(target)).unwrap();
    let got = rels(&linkers);

    let expected: Vec<String> = [
        "records/contacts/index.md",
        "records/meetings/2026/05/m.md",
        "wiki/people/sarah.md",
        "wiki/themes/t.md",
    ]
    .iter()
    .map(|p| p.to_string())
    .collect();
    assert_eq!(got, expected);
}
1790
/// Two targets whose paths share a prefix must not be confused: a link to
/// one is never reported as a link to the other.
#[test]
fn find_links_to_distinguishes_sibling_paths() {
    let dir = empty_store();
    let root = dir.path();

    // (linking page, the one target that page links to)
    let cases = [
        ("wiki/a.md", "records/contacts/sarah"),
        ("wiki/b.md", "records/contacts/sarah-chen"),
    ];
    for (page, target) in cases {
        let content = format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}]]\n");
        write(root, page, &content);
    }
    let store = open(&dir);

    // Each target is linked from exactly its own page.
    for (page, target) in cases {
        let linkers = store.find_links_to(Path::new(target)).unwrap();
        assert_eq!(rels(&linkers), vec![page.to_string()]);
    }
}
1826
/// Regression: a stray non-UTF-8 byte on the same line as a `[[target]]`
/// link must not abort the backlink scan.
#[test]
fn regression_find_links_to_tolerates_invalid_utf8_on_a_matched_line() {
    // Regression: the scan used the `UTF8` sink, which ran
    // `std::str::from_utf8` on every matched line and returned an
    // `io::Error` when a `.md` file carried a stray non-UTF-8 byte on the
    // SAME line as a `[[target]]` link. That error propagated out and
    // aborted the WHOLE store scan — `find_links_to` / `find_links_to_any`
    // (and `graph backlinks` + the working-set validate incoming-linker
    // pass) returned an error instead of the legitimate UTF-8 linkers.
    // Verbatim-ingested `sources/` artifacts can carry such bytes, so this
    // is reachable. The `Lossy` sink must let the scan still report the link.
    let dir = empty_store();
    let root = dir.path();
    let target = "records/contacts/sarah-chen";

    // A clean, fully-UTF-8 linker that MUST be returned regardless.
    write(
        root,
        "wiki/people/clean.md",
        &format!("---\ntype: wiki-page\nsummary: s\n---\nSee [[{target}]].\n"),
    );

    // A linker whose link line ALSO carries a stray 0xFF byte (a mis-decoded
    // Latin-1 import). The fixture is assembled as raw bytes so the invalid
    // byte survives — a `&str` fixture could not express it. The UTF-8 head
    // is derived from `target` so the link can never drift from the path
    // being searched for. The byte-level regex still matches `[[target]]`
    // on this line; pre-fix the UTF8 sink aborted here.
    let head = format!("---\ntype: email\nsummary: s\n---\nSee [[{target}]] ");
    let bytes: Vec<u8> = [head.as_bytes(), &b"\xFF here\n"[..]].concat();
    let dirty_abs = root.join("sources/emails/2026/05/raw.md");
    fs::create_dir_all(dirty_abs.parent().unwrap()).unwrap();
    fs::write(&dirty_abs, &bytes).unwrap();
    // Defensive: confirm the fixture really is invalid UTF-8 (so the test
    // exercises the bug, not a coincidentally-valid file).
    assert!(
        std::str::from_utf8(&bytes).is_err(),
        "fixture must contain invalid UTF-8 to exercise the regression"
    );

    let store = open(&dir);
    let got = rels(
        &store
            .find_links_to(Path::new(target))
            .expect("a stray non-UTF-8 byte must not abort the backlink scan"),
    );
    assert_eq!(
        got,
        vec![
            "sources/emails/2026/05/raw.md".to_string(),
            "wiki/people/clean.md".to_string(),
        ],
        "both the clean linker and the one with an invalid byte on the link \
         line are reported; the scan degrades, it does not fail"
    );
}
1883
1884    // ── find_links_to_any (batch — the O(changed × store) fix) ─────────────────
1885
/// Pins the batch contract of `find_links_to_any`, which the working-set
/// validate uses to discover incoming linkers for the WHOLE changed set in
/// one pass: the result is the deduplicated union of linkers across every
/// target, with per-target boundary correctness intact (no alternation arm
/// bleeds into a prefix-sharing sibling). A regression back to a per-object
/// loop would still satisfy the union — the boundary and union-equivalence
/// assertions are what guard the *correctness* of folding N scans into one
/// regex.
#[test]
fn find_links_to_any_returns_the_union_with_boundary_correctness() {
    use std::collections::BTreeSet;

    let dir = empty_store();
    let root = dir.path();
    // Wraps a single body line in wiki-page frontmatter.
    let wiki_page =
        |line: &str| format!("---\ntype: wiki-page\nsummary: s\n---\n{line}\n");

    // Two distinct targets, each with its own linker.
    write(
        root,
        "wiki/links-sarah.md",
        &wiki_page("[[records/contacts/sarah-chen]]"),
    );
    write(
        root,
        "wiki/links-acme.md",
        &wiki_page("Deal with [[records/companies/acme|Acme]]."),
    );
    // One file links to BOTH targets — it must appear exactly once
    // (deduped), proving multiple-target hits fold into a single result row
    // rather than one row per matched target.
    write(
        root,
        "records/meetings/2026/05/m.md",
        "---\ntype: meeting\nsummary: s\n---\n[[records/contacts/sarah-chen]] re \
         [[records/companies/acme]]\n",
    );
    // A prefix-sharing sibling of a target: a link to `sarah-chen-jr` must
    // NOT count as a link to `sarah-chen`, even though the alternation now
    // carries `sarah-chen` as one arm.
    write(
        root,
        "wiki/links-jr.md",
        &wiki_page("[[records/contacts/sarah-chen-jr]]"),
    );
    // A file that links to neither requested target.
    write(
        root,
        "wiki/unrelated.md",
        &wiki_page("[[wiki/themes/spend]]"),
    );

    let store = open(&dir);
    let targets = [
        PathBuf::from("records/contacts/sarah-chen"),
        PathBuf::from("records/companies/acme"),
    ];

    let got = rels(&store.find_links_to_any(&targets).unwrap());
    let expected: Vec<String> = [
        "records/meetings/2026/05/m.md",
        "wiki/links-acme.md",
        "wiki/links-sarah.md",
    ]
    .iter()
    .map(|p| p.to_string())
    .collect();
    assert_eq!(
        got, expected,
        "batch finder must return the deduped union of linkers across all \
         targets, excluding the prefix-sibling and the unrelated file"
    );

    // Equivalence: the batch result must equal the union of the per-target
    // single finder. The working-set path relies on this property when it
    // folds one-scan-per-object into one scan for the whole set.
    let union: BTreeSet<PathBuf> = targets
        .iter()
        .flat_map(|t| store.find_links_to(t).unwrap())
        .collect();
    let union_rows: Vec<PathBuf> = union.into_iter().collect();
    assert_eq!(
        rels(&union_rows),
        got,
        "find_links_to_any must equal the union of per-target find_links_to"
    );
}
1968
/// With no usable targets the batch finder must report no linkers. In
/// particular it must NOT degrade into a match-everything empty regex, which
/// would report every `.md` as a linker. This is the empty-working-set fast
/// path the `validate` loop hits when nothing changed.
#[test]
fn find_links_to_any_empty_targets_matches_nothing() {
    let dir = empty_store();
    let root = dir.path();
    // One real linker exists, so a match-everything bug would surface it.
    write(
        root,
        "wiki/a.md",
        "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
    );
    let store = open(&dir);

    let for_no_targets = store.find_links_to_any(&[]).unwrap();
    assert!(
        for_no_targets.is_empty(),
        "no targets ⇒ no linkers (an empty pattern must not match every file)"
    );

    // A set made up solely of empty/non-link targets is likewise a no-op,
    // not a match-everything.
    let degenerate = [PathBuf::from(""), PathBuf::from("./")];
    let for_degenerate = store.find_links_to_any(&degenerate).unwrap();
    assert!(
        for_degenerate.is_empty(),
        "targets that render to empty link text contribute no alternation arm"
    );
}
1998
1999    // ── read_type_index ──────────────────────────────────────────────────────
2000
/// `read_type_index` parses each JSONL line into a record, keeps the typed
/// fields, and flattens any remaining keys into `fields`.
#[test]
fn read_type_index_parses_records_and_flattens_fields() {
    let dir = empty_store();
    let root = dir.path();

    // Record with every typed field plus two extra keys (`vendor`, `amount`).
    let full = "{\"path\":\"records/expenses/2026/05/a.md\",\"type\":\"expense\",\"summary\":\"lunch\",\"tags\":[\"meals\"],\"links\":[\"records/companies/acme\"],\"created\":\"2026-05-01T00:00:00Z\",\"updated\":\"2026-05-01T00:00:00Z\",\"vendor\":\"acme\",\"amount\":42}";
    // Record with no `tags`/`links` keys and null timestamps.
    let sparse = "{\"path\":\"records/expenses/2026/05/b.md\",\"type\":\"expense\",\"summary\":\"taxi\",\"created\":null,\"updated\":null,\"vendor\":\"yellow\"}";
    let jsonl = format!("{full}\n{sparse}\n");

    let sidecar = write(root, "records/expenses/index.jsonl", &jsonl);
    let store = open(&dir);
    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();

    assert_eq!(recs.len(), 2);
    let (first, second) = (&recs[0], &recs[1]);

    // Sorted by path asc.
    assert_eq!(first.path, PathBuf::from("records/expenses/2026/05/a.md"));
    assert_eq!(first.type_, "expense");
    assert_eq!(first.summary, "lunch");
    assert_eq!(first.tags, vec!["meals".to_string()]);
    assert_eq!(first.links, vec!["records/companies/acme".to_string()]);
    assert!(first.created.is_some());
    // Extra (non-typed) frontmatter flattens into `fields`.
    assert_eq!(first.fields.get("vendor"), Some(&serde_json::json!("acme")));
    assert_eq!(first.fields.get("amount"), Some(&serde_json::json!(42)));
    // Defaults: missing tags/links → empty.
    assert!(second.tags.is_empty());
    assert!(second.links.is_empty());
}
2031
/// A path appearing twice collapses to its last line, and a blank line
/// between records is ignored rather than treated as an error.
#[test]
fn read_type_index_last_write_wins_and_skips_blanks() {
    let dir = empty_store();
    let root = dir.path();

    // Same path on both lines; only the summary differs.
    let older = "{\"path\":\"records/contacts/sarah.md\",\"type\":\"contact\",\"summary\":\"old\",\"created\":null,\"updated\":null}";
    let newer = "{\"path\":\"records/contacts/sarah.md\",\"type\":\"contact\",\"summary\":\"new\",\"created\":null,\"updated\":null}";
    // The doubled newline puts an empty line between the two records.
    let jsonl = format!("{older}\n\n{newer}\n");

    let sidecar = write(root, "records/contacts/index.jsonl", &jsonl);
    let store = open(&dir);
    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();

    assert_eq!(recs.len(), 1, "duplicate path collapses to one record");
    assert_eq!(recs[0].summary, "new", "later line must win");
}
2049
/// A line that is not valid JSON must surface as
/// `StoreError::BadTypeIndex`.
#[test]
fn read_type_index_errors_on_malformed_line() {
    let dir = empty_store();
    let root = dir.path();
    let sidecar = write(root, "records/contacts/index.jsonl", "{not valid json}\n");
    let store = open(&dir);

    let sidecar_abs = store.abs_path(&sidecar);
    let failure = store.read_type_index(&sidecar_abs).unwrap_err();
    assert!(matches!(failure, StoreError::BadTypeIndex { .. }));
}
2059
2060    // ── find_by_type / find_by_where ─────────────────────────────────────────
2061
/// Renders one newline-terminated `index.jsonl` record with the given
/// `path`, `type` and `summary`, and `created`/`updated` set to `null`.
///
/// `extra` is spliced in verbatim just before the closing brace, so a
/// non-empty value must start with a comma (e.g. `,"vendor":"acme"`).
/// Nothing is JSON-escaped; callers pass plain literals.
fn jsonl_line(path: &str, type_: &str, summary: &str, extra: &str) -> String {
    [
        "{\"path\":\"",
        path,
        "\",\"type\":\"",
        type_,
        "\",\"summary\":\"",
        summary,
        "\",\"created\":null,\"updated\":null",
        extra,
        "}\n",
    ]
    .concat()
}
2067
/// `find_by_type` returns the records of the canonical type-folder sidecar,
/// path-sorted, without leaking records of another type.
#[test]
fn find_by_type_reads_canonical_folder_sidecar() {
    let dir = empty_store();
    let root = dir.path();

    // Canonical folder for `contact` is records/contacts.
    let contacts = [
        jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
        jsonl_line("records/contacts/elena.md", "contact", "Elena", ""),
    ]
    .concat();
    write(root, "records/contacts/index.jsonl", &contacts);
    // A different type's sidecar must not leak into a contact query.
    let companies = jsonl_line("records/companies/acme.md", "company", "Acme", "");
    write(root, "records/companies/index.jsonl", &companies);

    let store = open(&dir);
    let recs = store.find_by_type("contact").unwrap();

    let mut names = Vec::new();
    for rec in &recs {
        names.push(rec.summary.clone());
    }
    // Path-sorted: elena.md precedes sarah.md.
    assert_eq!(names, vec!["Elena".to_string(), "Sarah".to_string()]);
    assert!(recs.iter().all(|rec| rec.type_ == "contact"));
}
2091
/// Regression: same-type records filed in a non-canonical folder of the
/// same layer must still be returned once the canonical sidecar exists.
#[test]
fn regression_find_by_type_includes_non_canonical_folder_when_canonical_exists() {
    use std::collections::BTreeSet;

    // Regression for the silent-incompleteness bug: once the canonical
    // type-folder sidecar exists, `find_by_type` used to read ONLY that
    // sidecar and drop same-type records filed in a non-canonical folder in
    // the SAME layer — so the result flipped to incomplete the moment a
    // canonical record was added. The write path actively enables such a
    // layout (`records/clients/` for a `contact`, `wiki/<topic>/` for any
    // `wiki-page`), so this is a reachable, dedup-breaking omission.
    let dir = empty_store();
    let root = dir.path();

    // (sidecar, record path, type, summary)
    let sidecars = [
        // CANONICAL folder sidecar (`records/contacts/` for `contact`):
        // its existence is exactly the condition that triggered the bug.
        (
            "records/contacts/index.jsonl",
            "records/contacts/sarah.md",
            "contact",
            "Sarah",
        ),
        // A `contact` in a NON-canonical folder of the same (Records)
        // layer. Pre-fix it was silently dropped; it must now come back.
        (
            "records/clients/index.jsonl",
            "records/clients/elena.md",
            "contact",
            "Elena",
        ),
        // A different type in the same layer must NOT leak in (proves the
        // read is type-filtered, not a blind whole-layer dump).
        (
            "records/companies/index.jsonl",
            "records/companies/acme.md",
            "company",
            "Acme",
        ),
    ];
    for (sidecar, record, type_, summary) in sidecars {
        write(root, sidecar, &jsonl_line(record, type_, summary, ""));
    }

    let store = open(&dir);
    let mut got: BTreeSet<String> = BTreeSet::new();
    for rec in store.find_by_type("contact").unwrap() {
        got.insert(rec.path.to_string_lossy().into_owned());
    }

    let expected: BTreeSet<String> =
        ["records/clients/elena.md", "records/contacts/sarah.md"]
            .iter()
            .map(|p| p.to_string())
            .collect();
    assert_eq!(
        got, expected,
        "both the canonical-folder and the non-canonical-folder contact must \
         be returned; the company record must be excluded"
    );
}
2144
/// Regression: a `wiki-page` query must surface pages from every folder
/// under `wiki/`, not only the canonical `wiki/topics/`.
#[test]
fn regression_find_by_type_wiki_page_spans_multiple_topic_folders() {
    use std::collections::BTreeSet;

    // Regression for the scoped-backlinks variant of the same bug
    // (`graph backlinks --type wiki-page`): `wiki-page`'s canonical folder
    // is `wiki/topics`, but the SPEC files wiki pages under `wiki/<topic>/`
    // for ANY topic. With a `wiki/topics/index.jsonl` present, the old code
    // read only that folder and dropped pages in `wiki/people/`,
    // `wiki/projects/`, etc. — under-reporting dependents in a blast-radius
    // check. The whole-`wiki/`-layer read must surface all of them.
    let dir = empty_store();
    let root = dir.path();

    // (sidecar, page path, summary) — one page per topic folder.
    let sidecars = [
        ("wiki/topics/index.jsonl", "wiki/topics/billing.md", "Billing"),
        (
            "wiki/people/index.jsonl",
            "wiki/people/sarah-chen.md",
            "Sarah Chen",
        ),
        ("wiki/projects/index.jsonl", "wiki/projects/atlas.md", "Atlas"),
    ];
    for (sidecar, page, summary) in sidecars {
        write(root, sidecar, &jsonl_line(page, "wiki-page", summary, ""));
    }

    let store = open(&dir);
    let mut got: BTreeSet<String> = BTreeSet::new();
    for rec in store.find_by_type("wiki-page").unwrap() {
        got.insert(rec.path.to_string_lossy().into_owned());
    }

    let expected: BTreeSet<String> = [
        "wiki/people/sarah-chen.md",
        "wiki/projects/atlas.md",
        "wiki/topics/billing.md",
    ]
    .iter()
    .map(|p| p.to_string())
    .collect();
    assert_eq!(
        got, expected,
        "a wiki-page query must return pages from every topic folder, not \
         just the canonical wiki/topics/"
    );
}
2193
2194    #[test]
2195    fn find_by_type_canonical_absent_falls_back_within_the_layer_only() {
2196        let dir = empty_store();
2197        let root = dir.path();
2198        // A custom `proposal` record filed under a non-canonical folder NAME
2199        // (the natural plural `records/proposals/`) inside the records layer.
2200        // `default_type_folder("proposal")` = `records/proposal` (bare type, no
2201        // pluralization guess), so the canonical sidecar does not exist and
2202        // `find_by_type` falls back. The fallback is bounded to the type's
2203        // layer (records), so this record — same layer, non-canonical folder —
2204        // is still found: completeness within the layer holds.
2205        write(
2206            root,
2207            "records/proposals/index.jsonl",
2208            &jsonl_line("records/proposals/p1.md", "proposal", "Q3 proposal", ""),
2209        );
2210        // A DECOY of the SAME type sitting in a DIFFERENT layer (sources/). The
2211        // old whole-store fallback read every sidecar in the store and would
2212        // have leaked this into the result; the layer-bounded fallback must not.
2213        // It also pins that the fallback is O(entities-in-layer), never O(store).
2214        write(
2215            root,
2216            "sources/proposals/index.jsonl",
2217            &jsonl_line(
2218                "sources/proposals/leak.md",
2219                "proposal",
2220                "cross-layer decoy",
2221                "",
2222            ),
2223        );
2224        let store = open(&dir);
2225        let recs = store.find_by_type("proposal").unwrap();
2226        assert_eq!(
2227            recs.len(),
2228            1,
2229            "only the records-layer proposal, not the sources decoy"
2230        );
2231        assert_eq!(recs[0].summary, "Q3 proposal");
2232        assert_eq!(recs[0].path, PathBuf::from("records/proposals/p1.md"));
2233    }
2234
2235    #[test]
2236    fn find_by_type_canonical_absent_does_not_read_other_layers() {
2237        let dir = empty_store();
2238        let root = dir.path();
2239        // `email`'s canonical folder is `sources/emails` (layer Sources). No
2240        // sidecar there yet, so `find_by_type("email")` falls back — but only
2241        // within the Sources layer. A populated sidecar in the Records layer
2242        // must never be touched: the fallback is layer-bounded, not store-wide.
2243        // Under the old `read_all_type_indexes_in(None)` fallback this records
2244        // sidecar would have been read and filtered (wasted O(store) I/O); now
2245        // it is outside the walk root entirely.
2246        write(
2247            root,
2248            "records/contacts/index.jsonl",
2249            &jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
2250        );
2251        let store = open(&dir);
2252        // No email anywhere ⇒ empty, and the records layer was not in scope.
2253        assert!(store.find_by_type("email").unwrap().is_empty());
2254    }
2255
2256    #[test]
2257    fn find_by_where_matches_typed_columns_and_flat_fields() {
2258        let dir = empty_store();
2259        let root = dir.path();
2260        write(
2261            root,
2262            "records/expenses/index.jsonl",
2263            &(jsonl_line(
2264                "records/expenses/a.md",
2265                "expense",
2266                "lunch",
2267                ",\"vendor\":\"acme\",\"tags\":[\"meals\"]",
2268            ) + &jsonl_line(
2269                "records/expenses/b.md",
2270                "expense",
2271                "taxi",
2272                ",\"vendor\":\"yellow\"",
2273            )),
2274        );
2275        write(
2276            root,
2277            "records/contacts/index.jsonl",
2278            &jsonl_line(
2279                "records/contacts/sarah.md",
2280                "contact",
2281                "Sarah",
2282                ",\"tags\":[\"customer\"]",
2283            ),
2284        );
2285        let store = open(&dir);
2286
2287        // Flat field in `fields`.
2288        let by_vendor = store.find_by_where("vendor", "acme").unwrap();
2289        assert_eq!(by_vendor.len(), 1);
2290        assert_eq!(by_vendor[0].path, PathBuf::from("records/expenses/a.md"));
2291
2292        // Typed column: type (spans both expense records).
2293        assert_eq!(store.find_by_where("type", "expense").unwrap().len(), 2);
2294
2295        // Typed list column: tags membership.
2296        let customers = store.find_by_where("tags", "customer").unwrap();
2297        assert_eq!(customers.len(), 1);
2298        assert_eq!(
2299            customers[0].path,
2300            PathBuf::from("records/contacts/sarah.md")
2301        );
2302
2303        // No match → empty.
2304        assert!(store.find_by_where("vendor", "nobody").unwrap().is_empty());
2305    }
2306
2307    #[test]
2308    fn find_by_where_matches_timestamps_across_rfc3339_spellings() {
2309        let dir = empty_store();
2310        let root = dir.path();
2311        // db.md files most commonly carry the `Z` UTC spelling. The index.jsonl
2312        // serialized from such a file preserves it verbatim.
2313        write(
2314            root,
2315            "records/meetings/index.jsonl",
2316            "{\"path\":\"records/meetings/kickoff.md\",\"type\":\"meeting\",\
2317\"summary\":\"kickoff\",\"created\":\"2026-05-01T00:00:00Z\",\
2318\"updated\":\"2026-05-02T09:30:00-07:00\"}\n",
2319        );
2320        let store = open(&dir);
2321
2322        // The exact value an agent reads out of the file (`Z` form) must match.
2323        let by_z = store
2324            .find_by_where("created", "2026-05-01T00:00:00Z")
2325            .unwrap();
2326        assert_eq!(by_z.len(), 1);
2327        assert_eq!(by_z[0].path, PathBuf::from("records/meetings/kickoff.md"));
2328
2329        // The equivalent explicit-offset spelling of the same instant matches too.
2330        assert_eq!(
2331            store
2332                .find_by_where("created", "2026-05-01T00:00:00+00:00")
2333                .unwrap()
2334                .len(),
2335            1
2336        );
2337
2338        // A non-UTC stored value matches both its own offset spelling and the
2339        // same instant expressed as `Z` (instant comparison, not string compare).
2340        assert_eq!(
2341            store
2342                .find_by_where("updated", "2026-05-02T09:30:00-07:00")
2343                .unwrap()
2344                .len(),
2345            1
2346        );
2347        assert_eq!(
2348            store
2349                .find_by_where("updated", "2026-05-02T16:30:00Z")
2350                .unwrap()
2351                .len(),
2352            1
2353        );
2354
2355        // A different instant does not match.
2356        assert!(store
2357            .find_by_where("created", "2026-05-01T00:00:01Z")
2358            .unwrap()
2359            .is_empty());
2360        // A non-RFC3339 query value never matches a real timestamp.
2361        assert!(store
2362            .find_by_where("created", "2026-05-01")
2363            .unwrap()
2364            .is_empty());
2365    }
2366
2367    #[test]
2368    fn find_by_where_in_layer_reads_only_that_layers_sidecars() {
2369        // The O(entities-in-layer) contract: a layer-scoped where read must walk
2370        // ONLY the named layer's subtree. Proven structurally — a *malformed*
2371        // sidecar in another layer would make `read_type_index` error if it were
2372        // read, so a scoped read that succeeds (and excludes that record) is
2373        // proof the other layer's I/O never happened.
2374        let dir = empty_store();
2375        let root = dir.path();
2376        write(
2377            root,
2378            "records/companies/index.jsonl",
2379            &jsonl_line(
2380                "records/companies/acme.md",
2381                "company",
2382                "Acme",
2383                ",\"domain\":\"acme.com\"",
2384            ),
2385        );
2386        // Same field/value in the sources layer — but the sidecar is corrupt.
2387        write(
2388            root,
2389            "sources/emails/index.jsonl",
2390            "{ this is not valid json and would error if read }\n",
2391        );
2392        let store = open(&dir);
2393
2394        // Scoped to records: the corrupt sources sidecar is out of scope, so the
2395        // read succeeds and returns only the records-layer match.
2396        let in_records = store
2397            .find_by_where_in("domain", "acme.com", Some(Layer::Records))
2398            .expect("a records-scoped read must not touch the sources sidecar");
2399        assert_eq!(
2400            rels(
2401                &in_records
2402                    .iter()
2403                    .map(|r| r.path.clone())
2404                    .collect::<Vec<_>>()
2405            ),
2406            vec!["records/companies/acme.md".to_string()]
2407        );
2408
2409        // The store-wide read DOES reach the corrupt sidecar and surfaces it as
2410        // a parse error — confirming the corrupt file is genuinely in the tree
2411        // and that only the layer scope spares it.
2412        let store_wide = store.find_by_where("domain", "acme.com");
2413        assert!(
2414            matches!(store_wide, Err(StoreError::BadTypeIndex { .. })),
2415            "unscoped read walks every layer and hits the corrupt sidecar"
2416        );
2417
2418        // Scoping to the layer that holds only the corrupt sidecar still errors
2419        // (the scope includes it), proving the scope is a real subtree bound and
2420        // not a silent "skip anything that fails".
2421        let in_sources = store.find_by_where_in("domain", "acme.com", Some(Layer::Sources));
2422        assert!(matches!(in_sources, Err(StoreError::BadTypeIndex { .. })));
2423    }
2424
2425    #[test]
2426    fn find_by_where_in_missing_layer_is_empty_not_an_error() {
2427        // A layer-scoped read over a layer folder that does not exist yet must
2428        // return empty (mirrors `walk_layer`'s missing-dir guard), never a walk
2429        // error from `ignore` over a nonexistent path.
2430        let dir = empty_store();
2431        let root = dir.path();
2432        write(
2433            root,
2434            "records/contacts/index.jsonl",
2435            &jsonl_line(
2436                "records/contacts/sarah.md",
2437                "contact",
2438                "Sarah",
2439                ",\"city\":\"denver\"",
2440            ),
2441        );
2442        let store = open(&dir);
2443
2444        // `wiki/` was never created.
2445        let in_wiki = store
2446            .find_by_where_in("city", "denver", Some(Layer::Wiki))
2447            .expect("missing layer subtree is empty, not an error");
2448        assert!(in_wiki.is_empty());
2449
2450        // Same query scoped to the layer that has the record still finds it.
2451        let in_records = store
2452            .find_by_where_in("city", "denver", Some(Layer::Records))
2453            .unwrap();
2454        assert_eq!(in_records.len(), 1);
2455    }
2456
2457    // ── abs_path / rel_path ──────────────────────────────────────────────────
2458
2459    #[test]
2460    fn abs_and_rel_path_roundtrip() {
2461        let dir = empty_store();
2462        let store = open(&dir);
2463        let rel = Path::new("records/contacts/sarah.md");
2464        let abs = store.abs_path(rel);
2465        assert_eq!(abs, dir.path().join(rel));
2466        assert_eq!(store.rel_path(&abs).as_deref(), Some(rel));
2467
2468        // An absolute path is passed through unchanged by abs_path.
2469        assert_eq!(store.abs_path(&abs), abs);
2470
2471        // A path outside the store has no store-relative form.
2472        assert_eq!(store.rel_path(Path::new("/somewhere/else.md")), None);
2473    }
2474
2475    // ── infer_type_from_path (inverse of default_type_folder) ────────────────
2476
2477    #[test]
2478    fn infer_type_maps_every_recognized_folder_back_to_its_type() {
2479        let cases = [
2480            ("sources/emails/x.md", "email"),
2481            ("sources/transcripts/x.md", "transcript"),
2482            ("sources/docs/x.md", "pdf-source"),
2483            ("records/contacts/x.md", "contact"),
2484            ("records/companies/x.md", "company"),
2485            ("records/expenses/x.md", "expense"),
2486            ("records/meetings/x.md", "meeting"),
2487            ("records/decisions/x.md", "decision"),
2488            ("records/invoices/x.md", "invoice"),
2489            // Any wiki sub-folder infers `wiki-page` regardless of the topic name.
2490            ("wiki/topics/x.md", "wiki-page"),
2491            ("wiki/pricing/x.md", "wiki-page"),
2492        ];
2493        for (path, expected) in cases {
2494            assert_eq!(
2495                infer_type_from_path(Path::new(path)).as_deref(),
2496                Some(expected),
2497                "path {path} should infer type {expected}"
2498            );
2499        }
2500    }
2501
2502    #[test]
2503    fn infer_type_round_trips_with_default_type_folder() {
2504        // The canonical invariant: inference is the inverse of the forward map.
2505        // Every recognized type, routed through `default_type_folder` and then
2506        // back through `infer_type_from_path`, must return the original type.
2507        // `wiki-page` is the one many-to-one case (every topic folder maps back
2508        // to `wiki-page`), so its forward folder still round-trips.
2509        let recognized = [
2510            "email",
2511            "transcript",
2512            "pdf-source",
2513            "contact",
2514            "company",
2515            "expense",
2516            "meeting",
2517            "decision",
2518            "invoice",
2519            "wiki-page",
2520        ];
2521        for type_ in recognized {
2522            let folder = default_type_folder(type_);
2523            let file = folder.join("x.md");
2524            assert_eq!(
2525                infer_type_from_path(&file).as_deref(),
2526                Some(type_),
2527                "recognized type {type_} (folder {folder:?}) must round-trip"
2528            );
2529        }
2530    }
2531
2532    #[test]
2533    fn infer_type_round_trips_custom_types_verbatim_no_singularization() {
2534        // Regression guard for the CLI/core divergence: `default_type_folder`'s
2535        // unrecognized fallback is the BARE type name (`task → records/task`,
2536        // `tasks → records/tasks`). Inference must NOT singularize, or a custom
2537        // type would not round-trip (e.g. `records/tasks` → `task` would clash
2538        // with `default_type_folder("task") → records/task`).
2539        for custom in ["task", "tasks", "playbook", "process", "okrs", "ticket"] {
2540            let folder = default_type_folder(custom);
2541            assert_eq!(folder, PathBuf::from("records").join(custom));
2542            let file = folder.join("x.md");
2543            assert_eq!(
2544                infer_type_from_path(&file).as_deref(),
2545                Some(custom),
2546                "custom type {custom} must round-trip verbatim (no singularization)"
2547            );
2548        }
2549
2550        // The specific case named in the finding: a plural custom folder keeps
2551        // its trailing `s`; it is NOT singularized to `task`.
2552        assert_eq!(
2553            infer_type_from_path(Path::new("records/tasks/x.md")).as_deref(),
2554            Some("tasks"),
2555            "records/tasks must infer `tasks`, not `task`"
2556        );
2557    }
2558
2559    #[test]
2560    fn infer_type_requires_three_component_layer_folder_file_shape() {
2561        // Fewer than 3 components: a file directly under a layer has no
2562        // type-folder, so inference yields None (matches the old CLI contract).
2563        assert_eq!(infer_type_from_path(Path::new("records/x.md")), None);
2564        assert_eq!(infer_type_from_path(Path::new("sources/x.md")), None);
2565        assert_eq!(infer_type_from_path(Path::new("wiki/x.md")), None);
2566        assert_eq!(infer_type_from_path(Path::new("x.md")), None);
2567        // Unknown leading layer is never inferred.
2568        assert_eq!(infer_type_from_path(Path::new("foo/bar/x.md")), None);
2569        // Deeper paths still infer from the first type-folder segment (e.g. a
2570        // sharded record under records/expenses/2026/05/x.md).
2571        assert_eq!(
2572            infer_type_from_path(Path::new("records/expenses/2026/05/x.md")).as_deref(),
2573            Some("expense"),
2574        );
2575    }
2576}