dbmd_core/store.rs
1//! `store` — walk, locate, and shard a db.md store.
2//!
3//! A db.md store is one directory marked by an uppercase `DB.md` at its root.
//! [`Store::open`] (lenient, validation-oriented) and [`Store::open_strict`]
//! (requires a readable, parseable `DB.md`) are the gates every store-walking
//! subcommand goes through; a missing `DB.md` is the [`NotAStore`] error
//! (`NOT_A_STORE`). The toolkit never guesses a store root.
7//!
8//! Scale discipline lives here: [`Store::walk`] and the layer/type-folder
9//! walks are **SWEEP** primitives used only by `validate --all`,
10//! `index rebuild`, and `stats`. The interactive loop instead uses
11//! [`Store::find_links_to`] / [`Store::find_links_to_any`] (embedded ripgrep,
12//! presence-only) and the `index.jsonl` sidecar readers
13//! ([`Store::find_by_type`] / [`Store::find_by_where`] /
14//! [`Store::read_type_index`]) — never a whole-store parse. The batch
15//! [`Store::find_links_to_any`] is what keeps the working-set validate's
16//! incoming-linker discovery a single store scan rather than one scan per
17//! changed object.
18
19use std::collections::BTreeMap;
20use std::path::{Path, PathBuf};
21
22use chrono::{DateTime, Datelike, FixedOffset};
23use grep::regex::RegexMatcher;
24use grep::searcher::sinks::UTF8;
25use grep::searcher::Searcher;
26use ignore::WalkBuilder;
27
28use crate::index::IndexRecord;
29use crate::parser::{parse_db_md, Config, Frontmatter};
30
/// Basenames the content walks skip because they are never content files.
///
/// Only `index.md` is listed: the content walks traverse the layer dirs
/// (`sources/`/`records/`/`wiki/`), and `index.md` is the only meta file that
/// appears INSIDE them, so skipping it keeps a SWEEP over the content layers
/// from mistaking a catalog for a record. The root `DB.md` / `log.md` (and the
/// `log/` archive) live at the store root, outside every layer, so they are
/// never reached by these walks — and a content file that merely happens to be
/// named `DB.md` or `log.md` inside a layer (e.g. `records/docs/DB.md`) is real
/// content the SPEC does NOT reserve at type-folder depth.
const NON_CONTENT_BASENAMES: [&str; 1] = ["index.md"];
43
/// Basename of the complete machine-twin sidecar (`index.jsonl`) that backs
/// every structured read. It is looked up inside a type folder and matched by
/// basename when sidecars are enumerated.
const TYPE_INDEX_FILE: &str = "index.jsonl";
46
/// Returned when a path is opened as a store but has no `DB.md` at its root.
/// Surfaced as the structured code `NOT_A_STORE` with a non-zero exit.
///
/// Produced by [`Store::open`] and [`Store::open_strict`] when
/// [`Store::is_db_md_store`] rejects the path.
#[derive(Debug, thiserror::Error)]
#[error("not a db.md store: {path} has no DB.md")]
pub struct NotAStore {
    /// The path that was inspected.
    pub path: PathBuf,
}
55
/// Errors from store-level operations (walk, locate, shard, sidecar read).
#[derive(Debug, thiserror::Error)]
pub enum StoreError {
    /// A sidecar `index.jsonl` could not be read or parsed. For a parse
    /// failure the message carries the 1-based line number of the bad line.
    #[error("failed to read type index {path}: {message}")]
    BadTypeIndex {
        /// The sidecar file.
        path: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// A required date field for sharding was absent or unparseable, and there
    /// was no usable fallback.
    #[error("cannot compute shard path for {file}: no usable date field")]
    NoShardDate {
        /// The file being placed.
        file: PathBuf,
    },

    /// An embedded-ripgrep scan failed to start or run. The directory walks in
    /// this module also report their walker errors through this variant.
    #[error("search failed under {root}: {message}")]
    Search {
        /// The root the scan ran under.
        root: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// An underlying I/O failure.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
89
/// The three canonical layers of a db.md store.
///
/// The comparison traits are derived so sibling modules can key ordered maps
/// on `Layer` (e.g. `stats::Stats::files_per_layer`); the declaration order
/// below (`Sources` < `Records` < `Wiki`) is therefore the sort order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// `sources/` — raw evidence; immutable; date-sharded at scale.
    Sources,
    /// `records/` — atomic typed data; entity types flat, event types sharded.
    Records,
    /// `wiki/` — curator-synthesized narrative; flat.
    Wiki,
}

impl Layer {
    /// Every layer, in canonical order (`Sources`, `Records`, `Wiki`).
    pub fn all() -> [Layer; 3] {
        [Self::Sources, Self::Records, Self::Wiki]
    }

    /// The on-disk folder name for this layer: `"sources"`, `"records"`, or
    /// `"wiki"`.
    pub fn dir_name(self) -> &'static str {
        match self {
            Self::Sources => "sources",
            Self::Records => "records",
            Self::Wiki => "wiki",
        }
    }

    /// Parse a layer from its folder name. The match is exact and
    /// case-sensitive; any other string yields `None`.
    pub fn from_dir_name(name: &str) -> Option<Self> {
        // Inverse of `dir_name`: pick the layer whose folder name matches.
        Self::all()
            .iter()
            .copied()
            .find(|layer| layer.dir_name() == name)
    }
}
131
/// An opened db.md store: its root path plus the parsed `DB.md` [`Config`].
///
/// Construct via [`Store::open`] (lenient: falls back to the default config
/// when `DB.md` cannot be read or parsed) or [`Store::open_strict`] (requires a
/// readable, parseable `DB.md`). Both confirm the `DB.md` marker first, so
/// downstream code can assume a real store.
#[derive(Debug, Clone)]
pub struct Store {
    /// The store root (the directory containing `DB.md`).
    pub root: PathBuf,
    /// The parsed `DB.md` config (agent instructions, policies, schemas).
    pub config: Config,
}
143
144impl Store {
145 /// True if `path` is a db.md store root: an uppercase `DB.md` file exists
146 /// at `path`. On case-sensitive filesystems a lowercase `db.md` must NOT
147 /// count (the lowercase name refers to the project/spec, not the marker).
148 pub fn is_db_md_store(path: &Path) -> bool {
149 // Read the directory and match the *stored* filename byte-for-byte.
150 // `path.join("DB.md").exists()` would lie on a case-insensitive
151 // filesystem (macOS default), where a lowercase `db.md` answers a
152 // `DB.md` probe. `read_dir` returns the real on-disk name, so the
153 // exact-match check is correct on both case-sensitive (Linux) and
154 // case-insensitive filesystems.
155 let entries = match std::fs::read_dir(path) {
156 Ok(entries) => entries,
157 Err(_) => return false,
158 };
159 for entry in entries.flatten() {
160 if entry.file_name() == "DB.md" {
161 // A directory literally named `DB.md` is not the marker.
162 match entry.file_type() {
163 Ok(ft) if ft.is_dir() => return false,
164 Ok(_) => return true,
165 Err(_) => return false,
166 }
167 }
168 }
169 false
170 }
171
172 /// Open `path` as a db.md store and require `DB.md` to be readable and
173 /// parseable. Normal commands should enter through this strict gate so a
174 /// damaged config cannot silently disable schema or policy rules.
175 pub fn open_strict(path: &Path) -> crate::Result<Store> {
176 if !Store::is_db_md_store(path) {
177 return Err(NotAStore {
178 path: path.to_path_buf(),
179 }
180 .into());
181 }
182 let db_md = path.join("DB.md");
183 let text = std::fs::read_to_string(&db_md)?;
184 let config = parse_db_md(&text, &db_md)?;
185 Ok(Store {
186 root: path.to_path_buf(),
187 config,
188 })
189 }
190
191 /// Open `path` as a db.md store: confirm the `DB.md` marker (else
192 /// [`NotAStore`]) and parse the `DB.md` config when possible. This is the
193 /// lenient validation-oriented open path: a damaged `DB.md` still marks the
194 /// directory as a store so `dbmd validate` can report the config error as an
195 /// issue. Normal CLI commands should use [`Store::open_strict`] instead.
196 pub fn open(path: &Path) -> Result<Store, NotAStore> {
197 if !Store::is_db_md_store(path) {
198 return Err(NotAStore {
199 path: path.to_path_buf(),
200 });
201 }
202 let db_md = path.join("DB.md");
203 // The marker exists; parse its config. A read or parse failure leaves
204 // the store openable with default config rather than masquerading as
205 // NOT_A_STORE — the marker is present, so this *is* a store; a damaged
206 // DB.md is `dbmd validate`'s job to report, not `open`'s.
207 let config = match std::fs::read_to_string(&db_md) {
208 Ok(text) => parse_db_md(&text, &db_md).unwrap_or_default(),
209 Err(_) => Config::default(),
210 };
211 Ok(Store {
212 root: path.to_path_buf(),
213 config,
214 })
215 }
216
217 /// **SWEEP.** Recursively iterate every `.md` content file across
218 /// `sources/`, `records/`, and `wiki/`, skipping hidden dirs and `log/`.
219 /// Used only by `validate --all`, `index rebuild`, and `stats` — never on
220 /// the interactive loop.
221 pub fn walk(&self) -> Result<Vec<PathBuf>, StoreError> {
222 // Only the three content layers — never root meta files (`DB.md`,
223 // `index.md`, `log.md`) and never `log/`, which live at root and are
224 // outside every layer dir.
225 let mut out = Vec::new();
226 for layer in Layer::all() {
227 out.extend(self.walk_layer(layer)?);
228 }
229 out.sort();
230 Ok(out)
231 }
232
233 /// **SWEEP.** Like [`Store::walk`] but scoped to a single layer.
234 pub fn walk_layer(&self, layer: Layer) -> Result<Vec<PathBuf>, StoreError> {
235 let layer_root = self.root.join(layer.dir_name());
236 if !layer_root.is_dir() {
237 return Ok(Vec::new());
238 }
239 self.walk_content_md(&layer_root)
240 }
241
242 /// Enumerate every `.md` file in a single type-folder, **recursing through
243 /// its date-shards** (`sources/emails/**/*.md`). The unit the index builder
244 /// and per-folder rebuild operate on. SWEEP-class (scoped to one folder).
245 pub fn walk_type_folder(&self, type_folder: &Path) -> Result<Vec<PathBuf>, StoreError> {
246 let abs = self.resolve_under_root(type_folder);
247 if !abs.is_dir() {
248 return Ok(Vec::new());
249 }
250 self.walk_content_md(&abs)
251 }
252
253 /// The ≤`n` most-recent files in a type-folder by frontmatter `updated`
254 /// (descending), ties broken by store-relative path (ascending) — a total
255 /// order, so write-through and rebuild never disagree on #500 vs #501.
256 ///
257 /// Reads `updated` across the folder's shards — a SWEEP cost absorbed into
258 /// `index rebuild`. The write-through path never calls this. The
259 /// cap-selection primitive for the 500-entry `index.md` browse view.
260 pub fn recent_in_type_folder(
261 &self,
262 type_folder: &Path,
263 n: usize,
264 ) -> Result<Vec<PathBuf>, StoreError> {
265 let files = self.walk_type_folder(type_folder)?;
266 // (updated, rel-path) for each file. Files missing/unparseable
267 // `updated` sort *after* dated ones (None last), then by path — so they
268 // are deterministically the lowest-priority candidates for the cap, not
269 // dropped silently. The total order (updated desc, path asc) is what
270 // keeps write-through and rebuild agreeing on #500 vs #501.
271 let mut keyed: Vec<(Option<DateTime<FixedOffset>>, PathBuf)> = files
272 .into_iter()
273 .map(|rel| {
274 let updated = self.read_updated(&self.abs_path(&rel));
275 (updated, rel)
276 })
277 .collect();
278 keyed.sort_by(|a, b| {
279 // `updated` descending: newest first. `None` is treated as the
280 // oldest possible, so dated files always win a cap slot over
281 // undated ones.
282 let by_updated = b.0.cmp(&a.0);
283 by_updated.then_with(|| a.1.cmp(&b.1))
284 });
285 keyed.truncate(n);
286 Ok(keyed.into_iter().map(|(_, rel)| rel).collect())
287 }
288
289 /// The shard/flat predicate: true if the type date-shards, false if it
290 /// stays flat. True for source types and event record types
291 /// (`expense`/`invoice`/`meeting` + custom `order`/`ticket`/`transaction`),
292 /// or when `DB.md ## Schemas` declares `shard: by-date`. False for
293 /// dedup-bounded entity types (`contact`/`company`/`decision`) and `wiki/`.
294 pub fn type_shards(&self, type_: &str) -> bool {
295 // A `DB.md ## Schemas` `### <type>` block with a `shard:` directive is
296 // authoritative — it is the v0.2 generic-model way to declare sharding,
297 // so it overrides the built-in default below (in either direction).
298 if let Some(shard) = self.config.schemas.get(type_).and_then(|s| s.shard) {
299 return shard;
300 }
301 // Built-in default for the example types. Sharding is a property of the
302 // *type*:
303 // - source types carry a primary date field and shard;
304 // - event record types track business volume and shard;
305 // - dedup-bounded entity types and curation-bounded wiki stay flat.
306 // Any type can override this via a `shard:` directive (above).
307 matches!(
308 type_,
309 // source types
310 "email" | "transcript" | "pdf-source"
311 // event record types (canonical)
312 | "expense" | "invoice" | "meeting"
313 // event record types (recognized custom, per the plan)
314 | "order" | "ticket" | "transaction"
315 )
316 }
317
318 /// Compute the canonical write path for a new file. For a sharding type
319 /// (per [`Store::type_shards`]) insert `<YYYY>/<MM>/` from the type's
320 /// primary date field (`email.date`, `expense.date`, … fallback `created`)
321 /// under the type folder; flat types and `wiki/` get no shard segment.
322 /// Deterministic + stable: same input → same path, so a record never moves
323 /// once written.
324 pub fn shard_path_for(
325 &self,
326 type_: &str,
327 frontmatter: &Frontmatter,
328 name: &str,
329 ) -> Result<PathBuf, StoreError> {
330 self.shard_path_in(&default_type_folder(type_), type_, frontmatter, name)
331 }
332
333 /// Like [`Store::shard_path_for`], but compute the path under an explicit,
334 /// caller-resolved type-folder rather than the canonical default. This lets a
335 /// write surface honour an agent-supplied conforming sub-folder — e.g.
336 /// `wiki/projects/`, `wiki/people/`, `wiki/synthesis/` (the SPEC files a
337 /// `wiki-page` under `wiki/<topic>/`, i.e. ANY topic sub-folder, not only the
338 /// `wiki/topics` default) — while still applying date-sharding for sharding
339 /// types. The folder must be a conforming `<layer>/<type-folder>` (2
340 /// components, recognized layer); the caller is responsible for that (see the
341 /// CLI's `resolve_write_path`), so it is taken as given here.
342 ///
343 /// Sharding is still a property of the *type*: a sharding type gets the
344 /// `<YYYY>/<MM>` segment under `folder`; a flat type lands directly in it.
345 pub fn shard_path_in(
346 &self,
347 folder: &Path,
348 type_: &str,
349 frontmatter: &Frontmatter,
350 name: &str,
351 ) -> Result<PathBuf, StoreError> {
352 let folder = folder.to_path_buf();
353 let filename = ensure_md_extension(name);
354
355 if !self.type_shards(type_) {
356 // Flat type (entity records, wiki, decisions): no shard segment.
357 return Ok(folder.join(filename));
358 }
359
360 // Sharding type: derive <YYYY>/<MM> from the primary date field, with
361 // `created` as the universal fallback. Reading the public `Frontmatter`
362 // fields directly (typed `created`/`updated` + raw `extra`) avoids the
363 // not-yet-implemented `Frontmatter::get`/`parse` and keeps this pure.
364 let (year, month) = self
365 .primary_shard_segment(type_, frontmatter)
366 .ok_or_else(|| StoreError::NoShardDate {
367 file: folder.join(&filename),
368 })?;
369
370 Ok(folder.join(year).join(month).join(filename))
371 }
372
373 /// Find files with an incoming wiki-link to `target`, via **embedded
374 /// ripgrep** for `[[target]]` across all layers. Loop-fast; no whole-graph
375 /// build. Returns store-relative paths.
376 pub fn find_links_to(&self, target: &Path) -> Result<Vec<PathBuf>, StoreError> {
377 // A single target is just the degenerate batch case — one alternation
378 // arm, one store scan. Routing through `find_links_to_any` keeps the
379 // pattern construction and the scan loop in exactly one place. The
380 // batch API takes `&[PathBuf]`, so the one-element slice is owned (a
381 // single alloc on this single-target convenience path; the batch path
382 // validate.rs rides is untouched).
383 self.find_links_to_any(&[target.to_path_buf()])
384 }
385
386 /// Find every file with an incoming wiki-link to **any** of `targets`, in a
387 /// **single embedded-ripgrep pass** over the store (one `.md` walk, one
388 /// presence-only scan per file). This is the batch incoming-linker finder the
389 /// working-set [`crate::validate::validate_working_set`] sits on: it must find
390 /// the linkers for the *whole* changed set without paying a full store read
391 /// per changed object. Cost is therefore one store scan (O(store)), NOT
392 /// `targets.len() × store` — calling [`find_links_to`](Self::find_links_to)
393 /// in a loop would reread every `.md` once per target and is the exact
394 /// `O(changed × store)` blow-up this method exists to prevent. Returns
395 /// store-relative paths (deduped, sorted).
396 ///
397 /// Why content scan and not the sidecar `links` field: the sidecar projects
398 /// only the frontmatter `links:` array, so it misses edges written in the
399 /// body or in typed fields (`company: [[…]]`). Finding an incoming link to an
400 /// arbitrary path therefore requires reading file content — the same reason
401 /// the single-target finder uses ripgrep.
402 pub fn find_links_to_any(&self, targets: &[PathBuf]) -> Result<Vec<PathBuf>, StoreError> {
403 // The wiki-link doctrine: a link is the full store-relative path, no
404 // `.md` extension. A reference to a target therefore appears literally
405 // as `[[<target>]]`, optionally with a `|display` suffix and (warned
406 // but accepted) a trailing `.md`. Build ONE regex that matches all
407 // accepted spellings of an incoming link to ANY target, escaping each
408 // target so path separators / dots stay literal and the alternation
409 // arms keep their boundaries (a link to `sarah` never matches
410 // `sarah-chen`).
411 let mut arms: Vec<String> = Vec::new();
412 for target in targets {
413 let target_str = path_to_link_str(target);
414 if target_str.is_empty() {
415 continue;
416 }
417 // [[ <target> (.md)? ( | display )? ]]
418 arms.push(format!(
419 r"\[\[{}(\.md)?(\|[^\]]*)?\]\]",
420 regex::escape(&target_str)
421 ));
422 }
423 // No usable targets → no possible incoming links, and an empty pattern
424 // would compile to a match-everything regex. Short-circuit instead.
425 if arms.is_empty() {
426 return Ok(Vec::new());
427 }
428 let pattern = arms.join("|");
429
430 let matcher = RegexMatcher::new(&pattern).map_err(|e| StoreError::Search {
431 root: self.root.clone(),
432 message: format!("invalid backlink pattern: {e}"),
433 })?;
434
435 let mut hits = std::collections::BTreeSet::new();
436 // Scan every `.md` file in the store (skip hidden + `log/`), including
437 // `index.md` catalogs — an incoming reference is wherever the literal
438 // link text lives; the caller decides relevance. ONE walk for the whole
439 // target set; per file we stop at the first hit (presence is all we
440 // need), so a file that links to several targets is read once, not once
441 // per target.
442 for rel in self.walk_all_md()? {
443 let abs = self.abs_path(&rel);
444 let mut matched_here = false;
445 let mut searcher = Searcher::new();
446 let res = searcher.search_path(
447 &matcher,
448 &abs,
449 UTF8(|_lnum, _line| {
450 matched_here = true;
451 // Stop at the first hit: presence is all we need.
452 Ok(false)
453 }),
454 );
455 if let Err(e) = res {
456 return Err(StoreError::Search {
457 root: self.root.clone(),
458 message: format!("search failed in {}: {e}", abs.display()),
459 });
460 }
461 if matched_here {
462 hits.insert(rel);
463 }
464 }
465 Ok(hits.into_iter().collect())
466 }
467
468 /// Candidate set for a `type` query: read the relevant type-folder
469 /// `index.jsonl` sidecar(s) and return their records. Complete and
470 /// cold-cache-proof — NOT a walk-and-parse or a frontmatter ripgrep scan,
471 /// and **never a store-wide read**. The common path is one sequential read
472 /// of the canonical type-folder sidecar (O(entities)); when that sidecar is
473 /// absent the read is bounded to the type's single layer subtree
474 /// (O(entities-in-layer)), so a `--type proposal` query before that folder
475 /// has been indexed still stays inside the interactive loop's O(entities)
476 /// contract instead of fanning out across every sidecar in the store.
477 pub fn find_by_type(&self, type_: &str) -> Result<Vec<IndexRecord>, StoreError> {
478 // Read the type's canonical-folder sidecar when it exists (the common,
479 // O(entities) path). Otherwise fall back to the sidecars of the *one
480 // layer* the type belongs to and filter by `type` — complete for records
481 // filed under a non-canonical folder name within that layer (e.g. a
482 // custom `proposal` filed in `records/proposals/` when the canonical
483 // guess is the bare `records/proposal/`), without the whole-store
484 // sidecar fan-out that would break the interactive loop's O(entities)
485 // contract. A type lives in exactly one layer, and `default_type_folder`
486 // always encodes it (recognized → its SPEC layer; unrecognized →
487 // `records/`), so the fallback walk is bounded to that layer's subtree —
488 // O(entities-in-layer), never O(store). Either way: sequential, complete
489 // sidecar reads, never a walk-and-parse of the tree.
490 let canonical_folder = default_type_folder(type_);
491 let canonical = self.root.join(&canonical_folder).join(TYPE_INDEX_FILE);
492 let records = if canonical.is_file() {
493 self.read_type_index(&canonical)?
494 } else {
495 self.read_all_type_indexes_in(layer_of_folder(&canonical_folder))?
496 };
497 Ok(records.into_iter().filter(|r| r.type_ == type_).collect())
498 }
499
500 /// Candidate set for a `key=value` frontmatter query, **store-wide**: read
501 /// every type-folder `index.jsonl` sidecar and filter their records. The
502 /// unscoped pre-write dedup primitive; prefer [`Store::find_by_where_in`]
503 /// with a layer scope to stay O(entities-in-layer) on the interactive loop.
504 pub fn find_by_where(&self, key: &str, value: &str) -> Result<Vec<IndexRecord>, StoreError> {
505 self.find_by_where_in(key, value, None)
506 }
507
508 /// Candidate set for a `key=value` frontmatter query, **scoped to one
509 /// layer** when `layer` is `Some`: the sidecar walk is confined to that
510 /// layer's subtree (`<root>/<layer>/`), so the I/O is O(entities-in-layer),
511 /// not O(store records). `None` keeps the store-wide read.
512 ///
513 /// This is what makes `--in <layer>` an I/O scope, not just a result
514 /// filter: a `--where`-only query (no `--type`) used to read every sidecar
515 /// in the store and narrow by layer in memory, breaking the O(entities)
516 /// contract the interactive loop depends on. With a layer in hand we walk
517 /// only that layer's sidecars.
518 pub fn find_by_where_in(
519 &self,
520 key: &str,
521 value: &str,
522 layer: Option<Layer>,
523 ) -> Result<Vec<IndexRecord>, StoreError> {
524 // A `key=value` query can target any frontmatter field across any type,
525 // so within the chosen subtree we still read every type-folder sidecar
526 // and filter. The layer (when given) bounds *which* subtree, turning a
527 // whole-store walk into a single-layer walk.
528 let records = self.read_all_type_indexes_in(layer)?;
529 Ok(records
530 .into_iter()
531 .filter(|r| record_matches_field(r, key, value))
532 .collect())
533 }
534
535 /// Every record across the type-folder `index.jsonl` sidecars, scoped to one
536 /// layer when `layer` is `Some` (the walk is confined to `<root>/<layer>/`)
537 /// else store-wide. Sequential, complete sidecar reads — never a
538 /// walk-and-parse of the content tree.
539 ///
540 /// This is the unfiltered sidecar-enumeration primitive the relationship
541 /// loop sits on: [`crate::graph::backlinks_filtered`] uses it to bound its
542 /// candidate set to the relevant layer (or the whole store) without opening
543 /// the content tree, then confirms each candidate's edge by parsing the file.
544 pub fn sidecar_records(&self, layer: Option<Layer>) -> Result<Vec<IndexRecord>, StoreError> {
545 self.read_all_type_indexes_in(layer)
546 }
547
548 /// Parse a type-folder's `index.jsonl` into [`IndexRecord`]s, applying
549 /// last-write-wins by `path` over any un-compacted lines. The sidecar-read
550 /// primitive every structured query sits on.
551 pub fn read_type_index(&self, index_jsonl: &Path) -> Result<Vec<IndexRecord>, StoreError> {
552 let text = std::fs::read_to_string(index_jsonl).map_err(|e| StoreError::BadTypeIndex {
553 path: index_jsonl.to_path_buf(),
554 message: e.to_string(),
555 })?;
556
557 // Last-write-wins by `path` over un-compacted lines: a later line for
558 // the same path supersedes an earlier one (the jsonl is append-mostly
559 // and only compacted on rebuild). Blank lines are skipped; a non-blank
560 // line that is not a valid IndexRecord is a hard parse error.
561 let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
562 for (i, line) in text.lines().enumerate() {
563 let trimmed = line.trim();
564 if trimmed.is_empty() {
565 continue;
566 }
567 let record: IndexRecord =
568 serde_json::from_str(trimmed).map_err(|e| StoreError::BadTypeIndex {
569 path: index_jsonl.to_path_buf(),
570 message: format!("line {}: {e}", i + 1),
571 })?;
572 by_path.insert(record.path.clone(), record);
573 }
574 // BTreeMap keyed by path → records emerge sorted by path ascending,
575 // a deterministic order independent of line order in the file.
576 Ok(by_path.into_values().collect())
577 }
578
579 /// Resolve a store-relative path to its absolute on-disk path under
580 /// [`root`](Store::root).
581 pub fn abs_path(&self, store_relative: &Path) -> PathBuf {
582 // `Path::join` returns `store_relative` unchanged if it is already
583 // absolute, so passing an absolute path through is a no-op.
584 self.root.join(store_relative)
585 }
586
587 /// Convert an absolute path under the store into its store-relative form.
588 pub fn rel_path(&self, abs: &Path) -> Option<PathBuf> {
589 abs.strip_prefix(&self.root).ok().map(|p| p.to_path_buf())
590 }
591
592 // ── Private helpers ─────────────────────────────────────────────────────
593
594 /// Resolve a caller-supplied folder path (store-relative or absolute) to an
595 /// absolute path under the store root.
596 fn resolve_under_root(&self, folder: &Path) -> PathBuf {
597 if folder.is_absolute() {
598 folder.to_path_buf()
599 } else {
600 self.root.join(folder)
601 }
602 }
603
604 /// Walk a subtree for content `.md` files (skip hidden dirs, skip `index.md`
605 /// / `DB.md` / `log.md`), returning store-relative paths. Used by the layer
606 /// and type-folder walks.
607 fn walk_content_md(&self, root: &Path) -> Result<Vec<PathBuf>, StoreError> {
608 let mut out = Vec::new();
609 for entry in self.md_walker(root).build() {
610 let entry = entry.map_err(|e| StoreError::Search {
611 root: root.to_path_buf(),
612 message: e.to_string(),
613 })?;
614 if !is_file_entry(&entry) {
615 continue;
616 }
617 let path = entry.path();
618 if !has_md_extension(path) {
619 continue;
620 }
621 if is_non_content_basename(path) {
622 continue;
623 }
624 if let Some(rel) = self.rel_path(path) {
625 out.push(rel);
626 }
627 }
628 out.sort();
629 Ok(out)
630 }
631
632 /// Walk the whole store for **every** `.md` file (including `index.md`),
633 /// skipping hidden dirs and the `log/` archive tree. Used by the backlink
634 /// scan, where the literal link text can live in any markdown file.
635 fn walk_all_md(&self) -> Result<Vec<PathBuf>, StoreError> {
636 let mut out = Vec::new();
637 for entry in self.md_walker(&self.root).build() {
638 let entry = entry.map_err(|e| StoreError::Search {
639 root: self.root.clone(),
640 message: e.to_string(),
641 })?;
642 if !is_file_entry(&entry) {
643 continue;
644 }
645 let path = entry.path();
646 if !has_md_extension(path) {
647 continue;
648 }
649 if self.is_in_log_dir(path) {
650 continue;
651 }
652 if let Some(rel) = self.rel_path(path) {
653 out.push(rel);
654 }
655 }
656 out.sort();
657 Ok(out)
658 }
659
660 /// Read and merge every type-folder `index.jsonl` sidecar under `layer`
661 /// when given, else the whole store (skip hidden + `log/`). Each sidecar is
662 /// read with last-write-wins by path; across sidecars, paths are disjoint by
663 /// construction (one sidecar per folder), so a plain concatenation preserves
664 /// completeness. A layer scope confines the walk to `<root>/<layer>/`, which
665 /// is what keeps `find_by_where_in` O(entities-in-layer).
666 fn read_all_type_indexes_in(
667 &self,
668 layer: Option<Layer>,
669 ) -> Result<Vec<IndexRecord>, StoreError> {
670 let mut out = Vec::new();
671 for sidecar in self.find_type_index_files_in(layer)? {
672 out.extend(self.read_type_index(&self.abs_path(&sidecar))?);
673 }
674 Ok(out)
675 }
676
677 /// Locate every `index.jsonl` sidecar under `layer` (when given) else the
678 /// whole store (skip hidden + `log/`), returning store-relative paths. The
679 /// walk root is `<root>/<layer>/` for a scoped read and `self.root` for the
680 /// store-wide read; a non-existent layer subtree yields no sidecars rather
681 /// than walking a missing path.
682 fn find_type_index_files_in(&self, layer: Option<Layer>) -> Result<Vec<PathBuf>, StoreError> {
683 let walk_root = match layer {
684 Some(l) => self.root.join(l.dir_name()),
685 None => self.root.clone(),
686 };
687 // A scoped walk over a layer folder that does not exist yet must be an
688 // empty result, mirroring `walk_layer`'s missing-dir guard — not a walk
689 // error from `ignore` over a nonexistent path.
690 if !walk_root.is_dir() {
691 return Ok(Vec::new());
692 }
693 let mut out = Vec::new();
694 let mut builder = WalkBuilder::new(&walk_root);
695 builder.standard_filters(false).hidden(true);
696 for entry in builder.build() {
697 let entry = entry.map_err(|e| StoreError::Search {
698 root: walk_root.clone(),
699 message: e.to_string(),
700 })?;
701 if !is_file_entry(&entry) {
702 continue;
703 }
704 let path = entry.path();
705 if path.file_name().and_then(|n| n.to_str()) != Some(TYPE_INDEX_FILE) {
706 continue;
707 }
708 if self.is_in_log_dir(path) {
709 continue;
710 }
711 if let Some(rel) = self.rel_path(path) {
712 out.push(rel);
713 }
714 }
715 out.sort();
716 Ok(out)
717 }
718
719 /// A `WalkBuilder` configured for db.md SWEEPs: gitignore/global-ignore are
720 /// OFF (a SWEEP must see every file even if the store is a git repo with a
721 /// `.gitignore`), but hidden files/dirs are skipped.
722 fn md_walker(&self, root: &Path) -> WalkBuilder {
723 let mut builder = WalkBuilder::new(root);
724 builder.standard_filters(false).hidden(true);
725 builder
726 }
727
728 /// True if an absolute path lives under the store's root-level `log/`
729 /// rotation-archive directory.
730 fn is_in_log_dir(&self, abs: &Path) -> bool {
731 match self.rel_path(abs) {
732 Some(rel) => rel.components().next().map(|c| c.as_os_str()) == Some("log".as_ref()),
733 None => false,
734 }
735 }
736
737 /// Read a file's frontmatter `updated` field as an RFC3339 timestamp,
738 /// returning `None` when absent/unparseable. A self-contained reader (does
739 /// not depend on the not-yet-implemented `parser::read_file`); parses the
740 /// leading `---`-fenced YAML block with the same engine the parser uses.
741 fn read_updated(&self, abs: &Path) -> Option<DateTime<FixedOffset>> {
742 let text = std::fs::read_to_string(abs).ok()?;
743 let yaml = frontmatter_block(&text)?;
744 let value: serde_norway::Value = serde_norway::from_str(yaml).ok()?;
745 let raw = value.get("updated")?;
746 value_to_datetime(raw)
747 }
748
749 /// The `<YYYY>/<MM>` shard segment for a sharding type, from its primary
750 /// date field with a `created` fallback. Reads the public `Frontmatter`
751 /// fields directly. `None` when no usable date is present.
752 fn primary_shard_segment(&self, type_: &str, fm: &Frontmatter) -> Option<(String, String)> {
753 // Try the type's primary date field first.
754 if let Some(field) = primary_date_field(type_) {
755 if let Some(v) = fm.extra.get(field) {
756 if let Some(seg) = value_to_year_month(v) {
757 return Some(seg);
758 }
759 }
760 }
761 // Universal fallback: the typed `created` timestamp.
762 fm.created
763 .map(|dt| (format!("{:04}", dt.year()), format!("{:02}", dt.month())))
764 }
765}
766
767// ── Free helpers (no `self`) ────────────────────────────────────────────────
768
769/// True if a walk entry is a regular file (not a dir / symlink-to-dir).
770fn is_file_entry(entry: &ignore::DirEntry) -> bool {
771 entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
772}
773
/// Whether `path` carries exactly the lowercase `md` extension. The
/// comparison is case-sensitive, so `.MD` does not qualify, and a path with
/// no extension is rejected.
fn has_md_extension(path: &Path) -> bool {
    path.extension().map_or(false, |ext| ext == "md")
}
779
780/// True if the basename is a non-content meta file (`DB.md`, `index.md`,
781/// `log.md`) that the content walks must skip.
782fn is_non_content_basename(path: &Path) -> bool {
783 match path.file_name().and_then(|n| n.to_str()) {
784 Some(name) => NON_CONTENT_BASENAMES.contains(&name),
785 None => false,
786 }
787}
788
/// Return `name` as an owned file name that is guaranteed to end in `.md`:
/// a name already ending in `.md` is copied unchanged, anything else gets the
/// suffix appended. The check is case-sensitive.
fn ensure_md_extension(name: &str) -> String {
    let mut file_name = String::from(name);
    if !file_name.ends_with(".md") {
        file_name.push_str(".md");
    }
    file_name
}
797
/// Turn a store-relative path into the text that goes inside a wiki link.
///
/// Only `Normal` path components that are valid UTF-8 contribute; root,
/// prefix, `.` and `..` components are dropped. The surviving segments are
/// joined with `/` (never `\`) and a single trailing `.md` is removed.
fn path_to_link_str(target: &Path) -> String {
    let segments: Vec<&str> = target
        .components()
        .filter_map(|part| match part {
            std::path::Component::Normal(segment) => segment.to_str(),
            _ => None,
        })
        .collect();

    let mut link = segments.join("/");
    // Drop the extension in place rather than allocating a second string.
    if let Some(bare_len) = link.strip_suffix(".md").map(str::len) {
        link.truncate(bare_len);
    }
    link
}
815
/// Look up the canonical default folder for `type_`, following the SPEC type
/// table (`email → sources/emails`, `expense → records/expenses`, …).
///
/// A type that is not in the table is filed under `records/<type>` using the
/// bare type name, with no pluralization guess.
fn default_type_folder(type_: &str) -> PathBuf {
    const CANONICAL_FOLDERS: &[(&str, &str)] = &[
        // sources
        ("email", "sources/emails"),
        ("transcript", "sources/transcripts"),
        ("pdf-source", "sources/docs"),
        // records — entities
        ("contact", "records/contacts"),
        ("company", "records/companies"),
        // records — events
        ("expense", "records/expenses"),
        ("meeting", "records/meetings"),
        ("decision", "records/decisions"),
        ("invoice", "records/invoices"),
        // wiki — a wiki page always lives in a sub-folder (`wiki/<topic>/`),
        // never directly under `wiki/`: the index/validate type-folder model
        // needs a 3-component `<layer>/<type-folder>/<file>` path. With only
        // the bare type available here, `wiki/topics` is the deterministic
        // default bucket.
        ("wiki-page", "wiki/topics"),
    ];

    CANONICAL_FOLDERS
        .iter()
        .find(|entry| entry.0 == type_)
        .map(|entry| PathBuf::from(entry.1))
        // unrecognized: bare type name under records/
        .unwrap_or_else(|| PathBuf::from("records").join(type_))
}
849
850/// The canonical [`Layer`] a `type_` belongs to, derived from its default
851/// type-folder (`email` → `Sources`, `contact` → `Records`, `wiki-page` →
852/// `Wiki`, unrecognized → `Records`). The write path uses this to decide whether
853/// an agent-supplied folder is in the *right* layer for the type before honouring
854/// its sub-folder choice.
855pub fn layer_for_type(type_: &str) -> Layer {
856 layer_of_folder(&default_type_folder(type_)).unwrap_or(Layer::Records)
857}
858
859/// The [`Layer`] a type-folder path lives in, read from its first component
860/// (`sources/` → `Sources`, `records/` → `Records`, `wiki/` → `Wiki`). Used to
861/// bound [`Store::find_by_type`]'s canonical-folder-absent fallback to a single
862/// layer subtree. Returns `None` for a path with no recognized layer prefix;
863/// every value [`default_type_folder`] produces has one, so in practice this is
864/// always `Some` on the call path — `None` degrades to a store-wide read.
865fn layer_of_folder(folder: &Path) -> Option<Layer> {
866 let first = folder.components().next()?.as_os_str().to_str()?;
867 Layer::from_dir_name(first)
868}
869
/// Derive a content file's canonical `type` from its store-relative path.
/// This is the inverse of [`default_type_folder`] and the single place
/// path→type inference lives.
///
/// The path must have the `<layer>/<type-folder>/<file>` shape: at least
/// three UTF-8 components, the first being `sources`, `records` or `wiki`.
/// Anything shorter, or with another leading component, yields `None`.
///
/// Known `(layer, folder)` pairs map back to their canonical type. Every
/// folder under `wiki/` infers `wiki-page`. Any other folder returns the
/// **bare folder name verbatim**, with no singularization, so it round-trips
/// with `default_type_folder`'s `records/<type>` fallback.
pub fn infer_type_from_path(rel: &Path) -> Option<String> {
    let mut segments = rel
        .components()
        .filter_map(|part| part.as_os_str().to_str());
    // The third segment (the file, or a shard dir) is required but unused:
    // it proves the folder is a real type-folder, not a file under the layer.
    let (layer, folder, _below_folder) = (segments.next()?, segments.next()?, segments.next()?);

    let inferred = match layer {
        "wiki" => "wiki-page",
        "sources" => match folder {
            "emails" => "email",
            "transcripts" => "transcript",
            "docs" => "pdf-source",
            unrecognized => unrecognized,
        },
        "records" => match folder {
            "contacts" => "contact",
            "companies" => "company",
            "expenses" => "expense",
            "meetings" => "meeting",
            "decisions" => "decision",
            "invoices" => "invoice",
            unrecognized => unrecognized,
        },
        _ => return None,
    };
    Some(inferred.to_owned())
}
916
/// Name of the frontmatter field whose value drives the `<YYYY>/<MM>` shard
/// for `type_`. `None` means the type has no canonical date field and only
/// the `created` fallback applies (this includes custom event types).
fn primary_date_field(type_: &str) -> Option<&'static str> {
    match type_ {
        "email" | "expense" | "invoice" | "meeting" => Some("date"),
        "transcript" => Some("recorded_at"),
        "pdf-source" => Some("received_at"),
        _ => None,
    }
}
930
931/// Parse a YAML value into an RFC3339 [`DateTime`], accepting both an explicit
932/// string and a YAML-native scalar rendered to string.
933fn value_to_datetime(value: &serde_norway::Value) -> Option<DateTime<FixedOffset>> {
934 let s = yaml_scalar_string(value)?;
935 DateTime::parse_from_rfc3339(s.trim()).ok()
936}
937
938/// Extract `(YYYY, MM)` from a YAML date/timestamp value. Lenient: matches a
939/// leading `YYYY-MM` so a bare `2026-05-22` date and a full
940/// `2026-05-22T10:00:00-07:00` timestamp both work.
941fn value_to_year_month(value: &serde_norway::Value) -> Option<(String, String)> {
942 let s = yaml_scalar_string(value)?;
943 year_month_from_str(s.trim())
944}
945
/// Split the leading `YYYY-MM` of a date string into `(YYYY, MM)`.
///
/// The first seven bytes must be four ASCII digits, a `-`, and two ASCII
/// digits forming a month in `01..=12`; whatever follows is ignored. Anything
/// else (too short, wrong shape, month out of range) returns `None`.
fn year_month_from_str(s: &str) -> Option<(String, String)> {
    // Parsed by hand so the write path never pays for a regex compile.
    let head = s.as_bytes().get(..7)?;
    let well_formed = head.iter().enumerate().all(|(pos, byte)| {
        if pos == 4 {
            *byte == b'-'
        } else {
            byte.is_ascii_digit()
        }
    });
    if !well_formed {
        return None;
    }

    let month = (head[5] - b'0') * 10 + (head[6] - b'0');
    if month == 0 || month > 12 {
        return None;
    }
    // The seven leading bytes are all ASCII, so these byte offsets are
    // guaranteed to fall on char boundaries.
    Some((s[..4].to_owned(), s[5..7].to_owned()))
}
971
972/// Render a YAML scalar as a string: a real `String` verbatim, otherwise the
973/// value's compact YAML serialization (covers timestamps that the YAML engine
974/// may surface as a non-string scalar).
975fn yaml_scalar_string(value: &serde_norway::Value) -> Option<String> {
976 if let Some(s) = value.as_str() {
977 return Some(s.to_string());
978 }
979 match value {
980 serde_norway::Value::Null => None,
981 serde_norway::Value::Mapping(_) | serde_norway::Value::Sequence(_) => None,
982 other => serde_norway::to_string(other)
983 .ok()
984 .map(|s| s.trim().to_string()),
985 }
986}
987
988/// The YAML frontmatter block of a file: the text between a leading `---` fence
989/// and the next `---` fence, exclusive. `None` if the file does not open with a
990/// `---` fence on its first line.
991fn frontmatter_block(text: &str) -> Option<&str> {
992 // Tolerate a UTF-8 BOM and CRLF, but the fence must be the very first line.
993 let body = text.strip_prefix('\u{feff}').unwrap_or(text);
994 let mut rest = body;
995 // First line must be exactly `---` (allowing trailing CR).
996 let (first, after_first) = split_first_line(rest);
997 if first.trim_end_matches('\r') != "---" {
998 return None;
999 }
1000 rest = after_first;
1001 let block_start = rest;
1002 let mut scanned = 0usize;
1003 loop {
1004 let (line, after) = split_first_line(rest);
1005 if line.trim_end_matches('\r') == "---" {
1006 return Some(&block_start[..scanned]);
1007 }
1008 if after.is_empty() && line.is_empty() {
1009 // Reached end of input without a closing fence.
1010 return None;
1011 }
1012 scanned += line.len() + 1; // +1 for the consumed '\n'
1013 if after.is_empty() {
1014 return None;
1015 }
1016 rest = after;
1017 }
1018}
1019
/// Cut `s` at its first `\n`, returning the line before it (newline
/// excluded, any `\r` kept) and everything after it. With no newline present
/// the whole input is the line and the remainder is empty.
fn split_first_line(s: &str) -> (&str, &str) {
    s.split_once('\n').unwrap_or((s, ""))
}
1029
1030/// True if an [`IndexRecord`] has a field `key` equal to `value`, checking the
1031/// typed columns first and then the flattened `fields` map.
1032fn record_matches_field(record: &IndexRecord, key: &str, value: &str) -> bool {
1033 match key {
1034 "type" => record.type_ == value,
1035 "summary" => record.summary == value,
1036 "path" => record.path.to_string_lossy() == value,
1037 "created" => timestamp_matches(record.created, value),
1038 "updated" => timestamp_matches(record.updated, value),
1039 "tags" => record.tags.iter().any(|t| t == value),
1040 "links" => record.links.iter().any(|l| l == value),
1041 other => record
1042 .fields
1043 .get(other)
1044 .map(|v| json_value_matches(v, value))
1045 .unwrap_or(false),
1046 }
1047}
1048
1049/// Compare a record's `created`/`updated` instant against a query `value`.
1050///
1051/// db.md files write timestamps in several equivalent RFC3339 spellings — most
1052/// commonly the `Z` UTC designator (`2026-05-01T00:00:00Z`) but also an explicit
1053/// offset (`...+00:00`, `...-07:00`). A naive `record.created.to_rfc3339() ==
1054/// value` reformats only one side: chrono renders a UTC instant as `+00:00`, so
1055/// the `Z` form an agent reads straight out of the file would never match. We
1056/// instead parse `value` as RFC3339 and compare instants, where `Z` and `+00:00`
1057/// (and any same-instant offset) are equal. A `value` that is not valid RFC3339
1058/// can never equal a real timestamp, so it falls through to `false`.
1059fn timestamp_matches(stored: Option<DateTime<FixedOffset>>, value: &str) -> bool {
1060 match (stored, DateTime::parse_from_rfc3339(value)) {
1061 (Some(stored), Ok(queried)) => stored == queried,
1062 _ => false,
1063 }
1064}
1065
1066/// Compare a JSON field value against a query string. A string matches
1067/// verbatim; scalars match their textual form; an array matches if any element
1068/// matches (so a list-valued frontmatter field is membership-queried).
1069fn json_value_matches(v: &serde_json::Value, value: &str) -> bool {
1070 match v {
1071 serde_json::Value::String(s) => s == value,
1072 serde_json::Value::Bool(b) => b.to_string() == value,
1073 serde_json::Value::Number(n) => n.to_string() == value,
1074 serde_json::Value::Array(items) => items.iter().any(|i| json_value_matches(i, value)),
1075 // A present-but-null field never matches — consistent with the in-memory
1076 // post-filter (`query::json_value_matches`, which the first `where`
1077 // clause is NOT re-checked against, so the two must agree here or a
1078 // `--where field=` query would return different rows than `--type X
1079 // --where field=`).
1080 serde_json::Value::Null => false,
1081 serde_json::Value::Object(_) => false,
1082 }
1083}
1084
1085#[cfg(test)]
1086mod tests {
1087 use super::*;
1088 use std::fs;
1089 use tempfile::{tempdir, TempDir};
1090
1091 // ── Fixtures ────────────────────────────────────────────────────────────
1092
/// Fixture helper: create `<root>/<rel>` (parent directories included) with
/// `contents`, and hand `rel` back as a `PathBuf` for use in assertions.
/// Panics on any I/O failure.
fn write(root: &Path, rel: &str, contents: &str) -> PathBuf {
    let target = root.join(rel);
    let parent = target.parent().unwrap();
    fs::create_dir_all(parent).unwrap();
    fs::write(&target, contents).unwrap();
    PathBuf::from(rel)
}
1101
/// Fixture helper: a minimal `note` content file whose `created` and
/// `updated` frontmatter fields both carry the given timestamp text.
fn content_md(updated: &str) -> String {
    format!(
        "---\ntype: note\ncreated: {0}\nupdated: {0}\nsummary: a note\n---\n\nbody\n",
        updated
    )
}
1108
1109 /// A bare directory with a `DB.md` marker (valid `db-md` frontmatter so the
1110 /// real parser is exercised).
1111 fn empty_store() -> TempDir {
1112 let dir = tempdir().unwrap();
1113 fs::write(
1114 dir.path().join("DB.md"),
1115 "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n",
1116 )
1117 .unwrap();
1118 dir
1119 }
1120
1121 /// Open a store rooted at a TempDir; panics if `open` rejects it.
1122 fn open(dir: &TempDir) -> Store {
1123 Store::open(dir.path()).expect("fixture should be a valid store")
1124 }
1125
/// Fixture helper: render paths as strings with every `\` turned into `/`,
/// so assertions read the same on every platform.
fn rels(paths: &[PathBuf]) -> Vec<String> {
    let mut rendered = Vec::with_capacity(paths.len());
    for path in paths {
        rendered.push(path.to_string_lossy().replace('\\', "/"));
    }
    rendered
}
1132
1133 // ── Layer ───────────────────────────────────────────────────────────────
1134
/// `Layer::dir_name` and `Layer::from_dir_name` round-trip for every layer,
/// the directory names are the expected lowercase words, and parsing is
/// case-sensitive and rejects non-layer names.
#[test]
fn layer_dir_name_and_parse_are_inverse() {
    for layer in Layer::all() {
        let name = layer.dir_name();
        assert_eq!(Layer::from_dir_name(name), Some(layer));
    }

    let expected_names = [
        (Layer::Sources, "sources"),
        (Layer::Records, "records"),
        (Layer::Wiki, "wiki"),
    ];
    for (layer, name) in expected_names {
        assert_eq!(layer.dir_name(), name);
    }

    // `log` is not a layer, and `Sources` fails because parsing is case-sensitive.
    for not_a_layer in ["log", "Sources"] {
        assert_eq!(Layer::from_dir_name(not_a_layer), None);
    }
}
1146
/// `stats` keys a `BTreeMap` on `Layer`, so its `Ord` must sort
/// sources < records < wiki.
#[test]
fn layer_order_is_canonical() {
    let mut shuffled = vec![Layer::Wiki, Layer::Sources, Layer::Records];
    shuffled.sort();
    let canonical = vec![Layer::Sources, Layer::Records, Layer::Wiki];
    assert_eq!(shuffled, canonical);
}
1154
1155 // ── is_db_md_store / open ────────────────────────────────────────────────
1156
/// A directory becomes a store only once an uppercase `DB.md` file exists.
#[test]
fn is_store_true_only_with_uppercase_marker() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    let before_marker = Store::is_db_md_store(root);
    assert!(!before_marker, "no marker → not a store");

    fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n").unwrap();
    let after_marker = Store::is_db_md_store(root);
    assert!(after_marker, "uppercase DB.md → store");
}
1168
/// Case-sensitivity contract: lowercase `db.md` is the spec's name, not a
/// store marker. This has to hold even on a case-insensitive filesystem
/// (e.g. macOS), where `Path::exists` alone would report the wrong answer.
#[test]
fn is_store_false_for_lowercase_db_md() {
    let dir = tempdir().unwrap();
    let root = dir.path();
    fs::write(root.join("db.md"), "---\ntype: db-md\n---\n").unwrap();

    let detected = Store::is_db_md_store(root);
    assert!(
        !detected,
        "lowercase db.md must NOT be treated as a store marker"
    );
    let opened = Store::open(root);
    assert!(opened.is_err());
}
1182
/// The marker must be a file: a directory that happens to be called `DB.md`
/// does not make its parent a store.
#[test]
fn is_store_false_when_db_md_is_a_directory() {
    let dir = tempdir().unwrap();
    let root = dir.path();
    fs::create_dir(root.join("DB.md")).unwrap();

    let detected = Store::is_db_md_store(root);
    assert!(!detected, "a directory named DB.md is not the file marker");
}
1192
/// Opening a directory with no marker fails, and the error carries the path
/// that was rejected.
#[test]
fn open_rejects_non_store_with_path() {
    let dir = tempdir().unwrap();
    let root = dir.path();
    let rejection = Store::open(root).unwrap_err();
    assert_eq!(rejection.path, root);
}
1199
/// `Store::open` records the root and really parses `DB.md`: a frozen page
/// declared under `## Policies` must show up in the loaded config, which a
/// substituted default config could not produce.
#[test]
fn open_succeeds_and_parses_config() {
    let dir = tempdir().unwrap();
    let root = dir.path();
    let marker = concat!(
        "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n\n",
        "## Policies\n\n### Frozen pages\n- records/decisions/q1.md\n",
    );
    fs::write(root.join("DB.md"), marker).unwrap();

    let store = Store::open(root).unwrap();
    assert_eq!(store.root, root);

    let declared_page = Path::new("records/decisions/q1.md");
    let surfaced = store
        .config
        .frozen_pages
        .iter()
        .any(|page| page == declared_page);
    assert!(
        surfaced,
        "open() must surface DB.md ## Policies, got {:?}",
        store.config.frozen_pages
    );
}
1223
1224 // ── walk / walk_layer / walk_type_folder ─────────────────────────────────
1225
/// `Store::walk` returns content files from every layer, sorted, and skips
/// catalogs, the root log and its archive, hidden directories and non-`.md`
/// files.
#[test]
fn walk_collects_content_across_layers_skipping_meta_and_log() {
    let dir = empty_store();
    let root = dir.path();

    // Real content, one file per layer.
    let content = [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("records/contacts/sarah.md", "2026-05-02T00:00:00Z"),
        ("wiki/people/sarah.md", "2026-05-03T00:00:00Z"),
    ];
    for (rel, stamp) in content {
        write(root, rel, &content_md(stamp));
    }

    // Things walk() must SKIP: a layer catalog, the root catalog, the log,
    // and a rotated log archive file.
    let meta = [
        ("sources/emails/index.md", "---\ntype: index\n---\n"),
        ("index.md", "---\ntype: index\n---\n"),
        ("log.md", "---\ntype: log\n---\n"),
        ("log/2026-04.md", "---\ntype: log\n---\n"),
    ];
    for (rel, body) in meta {
        write(root, rel, body);
    }
    // ...plus a file in a hidden dir and a non-markdown file.
    write(
        root,
        "sources/.hidden/secret.md",
        &content_md("2026-05-09T00:00:00Z"),
    );
    write(root, "records/contacts/notes.txt", "not markdown");

    let store = open(&dir);
    let got = rels(&store.walk().unwrap());
    let expected: Vec<String> = [
        "records/contacts/sarah.md",
        "sources/emails/2026/05/a.md",
        "wiki/people/sarah.md",
    ]
    .iter()
    .map(|rel| rel.to_string())
    .collect();
    assert_eq!(got, expected);
}
1268
/// `log.md` and `DB.md` are reserved only at the store root: inside a layer
/// they are real content and must be walked. `index.md`, the derived
/// catalog, is still skipped at any depth.
#[test]
fn walk_includes_content_named_log_md_or_db_md_inside_a_layer() {
    let dir = empty_store();
    let root = dir.path();
    let layer_log = "records/configs/log.md";
    let layer_marker = "sources/docs/DB.md";
    write(root, layer_log, &content_md("2026-05-01T00:00:00Z"));
    write(root, layer_marker, &content_md("2026-05-02T00:00:00Z"));
    write(root, "records/configs/index.md", "---\ntype: index\n---\n");

    let store = open(&dir);
    let got = rels(&store.walk().unwrap());

    let has = |rel: &str| got.iter().any(|walked| walked == rel);
    assert!(has(layer_log), "layer-internal log.md is content: {got:?}");
    assert!(
        has(layer_marker),
        "layer-internal DB.md is content: {got:?}"
    );
    let any_catalog = got.iter().any(|walked| walked.ends_with("index.md"));
    assert!(!any_catalog, "index.md is still skipped: {got:?}");
}
1302
/// `walk_layer` returns only the files of the requested layer, and a layer
/// whose directory does not exist yields an empty list rather than an error.
#[test]
fn walk_layer_is_scoped() {
    let dir = empty_store();
    let root = dir.path();
    let email = "sources/emails/2026/05/a.md";
    let contact = "records/contacts/sarah.md";
    write(root, email, &content_md("2026-05-01T00:00:00Z"));
    write(root, contact, &content_md("2026-05-02T00:00:00Z"));
    let store = open(&dir);

    let in_sources = rels(&store.walk_layer(Layer::Sources).unwrap());
    assert_eq!(in_sources, vec![email.to_string()]);

    let in_records = rels(&store.walk_layer(Layer::Records).unwrap());
    assert_eq!(in_records, vec![contact.to_string()]);

    // No `wiki/` directory was created: empty, not an error.
    let in_wiki = store.walk_layer(Layer::Wiki).unwrap();
    assert!(in_wiki.is_empty());
}
1330
/// `walk_type_folder` descends into date shards, skips the folder's
/// catalog, does not leak files from a sibling type folder, and gives the
/// same answer for a store-relative and an absolute folder argument.
#[test]
fn walk_type_folder_recurses_shards_and_accepts_abs_or_rel() {
    let dir = empty_store();
    let root = dir.path();
    let fixtures = [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("sources/emails/2026/06/b.md", "2026-06-01T00:00:00Z"),
        // A different type folder must not leak in.
        ("sources/docs/2026/05/c.md", "2026-05-04T00:00:00Z"),
    ];
    for (rel, stamp) in fixtures {
        write(root, rel, &content_md(stamp));
    }
    // The catalog inside the walked folder is skipped.
    write(root, "sources/emails/index.md", "---\ntype: index\n---\n");
    let store = open(&dir);

    let expected = vec![
        "sources/emails/2026/05/a.md".to_string(),
        "sources/emails/2026/06/b.md".to_string(),
    ];
    // First the relative spelling, then the absolute one under the store.
    let absolute = root.join("sources/emails");
    for folder in [Path::new("sources/emails"), absolute.as_path()] {
        let walked = rels(&store.walk_type_folder(folder).unwrap());
        assert_eq!(walked, expected);
    }
}
1373
1374 // ── recent_in_type_folder ────────────────────────────────────────────────
1375
/// `recent_in_type_folder` orders by `updated` descending, breaks ties by
/// path ascending, and truncates to the requested number of entries.
#[test]
fn recent_orders_by_updated_desc_then_path_and_caps() {
    let dir = empty_store();
    let root = dir.path();
    let fixtures = [
        // newest
        ("records/meetings/2026/05/c.md", "2026-05-03T00:00:00Z"),
        // tie on updated: path asc decides (a before b)
        ("records/meetings/2026/05/a.md", "2026-05-02T00:00:00Z"),
        ("records/meetings/2026/05/b.md", "2026-05-02T00:00:00Z"),
        // oldest
        ("records/meetings/2026/04/z.md", "2026-04-01T00:00:00Z"),
    ];
    for (rel, stamp) in fixtures {
        write(root, rel, &content_md(stamp));
    }
    let store = open(&dir);
    let folder = Path::new("records/meetings");

    let all = rels(&store.recent_in_type_folder(folder, 10).unwrap());
    let full_order = vec![
        "records/meetings/2026/05/c.md".to_string(),
        "records/meetings/2026/05/a.md".to_string(),
        "records/meetings/2026/05/b.md".to_string(),
        "records/meetings/2026/04/z.md".to_string(),
    ];
    assert_eq!(all, full_order);

    // A cap keeps only the n most recent entries.
    let top2 = rels(&store.recent_in_type_folder(folder, 2).unwrap());
    assert_eq!(top2, full_order[..2].to_vec());
}
1434
/// A file with no `updated` field at all sorts after every file that has
/// one.
#[test]
fn recent_sorts_undated_files_last() {
    let dir = empty_store();
    let root = dir.path();
    let dated = "records/contacts/dated.md";
    let undated = "records/contacts/undated.md";
    write(root, dated, &content_md("2026-05-01T00:00:00Z"));
    // Frontmatter without any `updated` key.
    write(root, undated, "---\ntype: contact\nsummary: x\n---\nbody\n");

    let store = open(&dir);
    let folder = Path::new("records/contacts");
    let got = rels(&store.recent_in_type_folder(folder, 10).unwrap());
    assert_eq!(
        got,
        vec![dated.to_string(), undated.to_string()],
        "a file with a real `updated` must outrank one with none"
    );
}
1465
1466 // ── type_shards ──────────────────────────────────────────────────────────
1467
/// Built-in classification with no schema overrides: source and event
/// types shard by date, entity and meta types stay flat.
#[test]
fn type_shards_classification() {
    let dir = empty_store();
    let store = open(&dir);

    let sharding = [
        "email",
        "transcript",
        "pdf-source",
        "expense",
        "invoice",
        "meeting",
        "order",
        "ticket",
        "transaction",
    ];
    let flat = [
        "contact",
        "company",
        "decision",
        "wiki-page",
        "index",
        "log",
        "db-md",
        "proposal",
    ];

    sharding
        .iter()
        .for_each(|t| assert!(store.type_shards(t), "{t} should shard"));
    flat.iter()
        .for_each(|t| assert!(!store.type_shards(t), "{t} should stay flat"));
}
1498
/// A schema's `shard` directive overrides the built-in classification in
/// both directions, while a schema without the directive leaves the built-in
/// default in place.
#[test]
fn type_shards_respects_schema_directive_both_directions() {
    use crate::parser::{Config, Schema};

    let dir = empty_store();
    let mut store = open(&dir);

    let with_directive = |shard: bool| Schema {
        shard: Some(shard),
        ..Schema::default()
    };
    let mut overrides = Config::default();
    // A CUSTOM type (absent from the built-in list) opts into date-sharding;
    // without the override `type_shards` would say false.
    overrides
        .schemas
        .insert("shipment".to_string(), with_directive(true));
    // A BUILT-IN event type opts OUT; the override beats the default.
    overrides
        .schemas
        .insert("expense".to_string(), with_directive(false));
    // A schema carrying no `shard:` directive keeps the built-in default.
    overrides
        .schemas
        .insert("meeting".to_string(), Schema::default());
    store.config = overrides;

    let shipment_shards = store.type_shards("shipment");
    assert!(
        shipment_shards,
        "custom type with `shard: by-date` must shard"
    );
    let expense_shards = store.type_shards("expense");
    assert!(
        !expense_shards,
        "built-in event type with `shard: flat` must go flat"
    );
    let meeting_shards = store.type_shards("meeting");
    assert!(
        meeting_shards,
        "schema without a `shard:` directive keeps the built-in default"
    );
    let contact_shards = store.type_shards("contact");
    assert!(!contact_shards, "unconfigured entity type stays flat");
}
1546
1547 // ── shard_path_for ───────────────────────────────────────────────────────
1548
1549 fn fm_with_extra(key: &str, value: &str) -> Frontmatter {
1550 let mut fm = Frontmatter::default();
1551 fm.extra.insert(
1552 key.to_string(),
1553 serde_norway::Value::String(value.to_string()),
1554 );
1555 fm
1556 }
1557
1558 fn fm_with_created(rfc3339: &str) -> Frontmatter {
1559 Frontmatter {
1560 created: Some(DateTime::parse_from_rfc3339(rfc3339).unwrap()),
1561 ..Default::default()
1562 }
1563 }
1564
/// Each sharding type reads its own primary date field, and both a bare
/// date and a full timestamp supply the `<YYYY>/<MM>` segment.
#[test]
fn shard_path_uses_primary_date_field_per_type() {
    let dir = empty_store();
    let store = open(&dir);

    // (type, primary field, field value, file name, expected path)
    let cases = [
        // expense.date → records/expenses/<YYYY>/<MM>/
        (
            "expense",
            "date",
            "2026-05-22",
            "lunch",
            "records/expenses/2026/05/lunch.md",
        ),
        // email.date → sources/emails/<YYYY>/<MM>/
        (
            "email",
            "date",
            "2026-11-02T09:00:00-07:00",
            "e1",
            "sources/emails/2026/11/e1.md",
        ),
        // transcript.recorded_at → sources/transcripts/<YYYY>/<MM>/
        (
            "transcript",
            "recorded_at",
            "2025-01-15T12:00:00Z",
            "t1",
            "sources/transcripts/2025/01/t1.md",
        ),
    ];
    for (type_, field, raw_date, name, expected) in cases {
        let frontmatter = fm_with_extra(field, raw_date);
        let computed = store.shard_path_for(type_, &frontmatter, name).unwrap();
        assert_eq!(computed, PathBuf::from(expected));
    }
}
1596
/// With no primary `date` field present, a meeting is sharded by its
/// `created` timestamp.
#[test]
fn shard_path_falls_back_to_created() {
    let dir = empty_store();
    let store = open(&dir);

    let only_created = fm_with_created("2024-07-09T08:30:00-04:00");
    let computed = store
        .shard_path_for("meeting", &only_created, "sync")
        .unwrap();
    assert_eq!(computed, PathBuf::from("records/meetings/2024/07/sync.md"));
}
1611
/// When both are present, the primary `date` field drives the shard, not
/// the `created` fallback.
#[test]
fn shard_path_primary_field_wins_over_created() {
    let dir = empty_store();
    let store = open(&dir);

    let mut both = fm_with_created("2020-01-01T00:00:00Z");
    let primary = serde_norway::Value::String(String::from("2026-05-22"));
    both.extra.insert(String::from("date"), primary);

    let computed = store.shard_path_for("expense", &both, "x").unwrap();
    // Shard is 2026/05 (from `date`), not 2020/01 (from `created`).
    assert_eq!(computed, PathBuf::from("records/expenses/2026/05/x.md"));
}
1625
/// Flat types get no `<YYYY>/<MM>` segment, even when a date is available.
#[test]
fn shard_path_flat_types_have_no_shard_segment() {
    let dir = empty_store();
    let store = open(&dir);

    // A contact carries a `created` date, yet contacts stay flat.
    let dated_contact = fm_with_created("2026-05-22T00:00:00Z");
    let contact_path = store
        .shard_path_for("contact", &dated_contact, "sarah-chen")
        .unwrap();
    assert_eq!(
        contact_path,
        PathBuf::from("records/contacts/sarah-chen.md")
    );

    // A wiki-page has no date shard either, but still files under a
    // type-folder: `wiki/topics/<name>.md`, never `wiki/<name>.md`. A
    // 2-component path is invisible to the index/validate type-folder model.
    let undated = Frontmatter::default();
    let wiki_path = store
        .shard_path_for("wiki-page", &undated, "renewal-theme")
        .unwrap();
    assert_eq!(wiki_path, PathBuf::from("wiki/topics/renewal-theme.md"));
}
1648
/// Regression: the path the toolkit computes for a wiki-page must be one the
/// index + validate type-folder model accepts. `shard_path_for("wiki-page", …)`
/// used to return a 2-component `wiki/<file>` path, which `type_folder_of`
/// (in both `index` and `validate`) treats as having no type-folder. Such a
/// page either crashed `Index::on_write` (it tried to create `index.md`
/// inside a file) or was silently dropped from every catalog by
/// `Index::rebuild_all`. The path must therefore have exactly the three
/// components `<layer>/<type-folder>/<file>`.
#[test]
fn shard_path_wiki_page_is_indexable_three_component_path() {
    let dir = empty_store();
    let store = open(&dir);
    let p = store
        .shard_path_for("wiki-page", &Frontmatter::default(), "renewal-theme")
        .unwrap();

    // `type_folder_of` needs a known layer first, then a type-folder
    // segment, then the file.
    let parts: Vec<&str> = p.iter().filter_map(|part| part.to_str()).collect();
    assert_eq!(
        parts.len(),
        3,
        "wiki-page path must be <layer>/<type-folder>/<file>, got {p:?}"
    );
    let (layer, type_folder, file) = (parts[0], parts[1], parts[2]);
    assert_eq!(layer, "wiki", "first component must be the wiki layer");
    let folder_is_real = !type_folder.is_empty() && type_folder != "renewal-theme.md";
    assert!(
        folder_is_real,
        "second component must be a real type-folder, not the file: {p:?}"
    );
    assert!(
        file.ends_with(".md"),
        "third component must be the .md file: {p:?}"
    );
}
1683
/// The file name ends in `.md` exactly once, whether or not the caller
/// supplied the extension.
#[test]
fn shard_path_preserves_and_adds_md_extension() {
    let dir = empty_store();
    let store = open(&dir);
    let expected = PathBuf::from("records/contacts/sarah.md");

    for supplied_name in ["sarah.md", "sarah"] {
        let computed = store
            .shard_path_for("contact", &Frontmatter::default(), supplied_name)
            .unwrap();
        assert_eq!(computed, expected);
    }
}
1697
/// `expense` is a sharding type; with neither `date` nor `created` in the
/// frontmatter the path computation must fail with `NoShardDate`, naming the
/// unsharded file it could not place.
#[test]
fn shard_path_errors_when_sharding_type_has_no_date() {
    let dir = empty_store();
    let store = open(&dir);

    let outcome = store.shard_path_for("expense", &Frontmatter::default(), "mystery");
    // Any other error variant is a test failure, reported with its Debug form.
    let file = match outcome.unwrap_err() {
        StoreError::NoShardDate { file } => file,
        other => panic!("expected NoShardDate, got {other:?}"),
    };
    assert_eq!(file, PathBuf::from("records/expenses/mystery.md"));
}
1713
1714 // ── find_links_to ────────────────────────────────────────────────────────
1715
/// `find_links_to` must report every file that links to the full-path target in
/// any accepted spelling, and must ignore short-form links and longer paths
/// that merely begin with the target.
#[test]
fn find_links_to_matches_all_accepted_spellings() {
    let dir = empty_store();
    let root = dir.path();
    let target = "records/contacts/sarah-chen";

    // (store-relative path, file content), written in this order.
    let fixtures: Vec<(&str, String)> = vec![
        // Plain link.
        (
            "wiki/people/sarah.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\nSee [[{target}]].\n"),
        ),
        // Link with display text.
        (
            "records/meetings/2026/05/m.md",
            format!("---\ntype: meeting\nsummary: s\n---\nWith [[{target}|Sarah]].\n"),
        ),
        // Link with .md extension (accepted, warned by validate).
        (
            "wiki/themes/t.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}.md]]\n"),
        ),
        // A catalog/index file that contains the link literally is included.
        (
            "records/contacts/index.md",
            format!("---\ntype: index\n---\n- [[{target}]] — Sarah\n"),
        ),
        // No link to the target.
        (
            "wiki/people/elena.md",
            String::from("---\ntype: wiki-page\nsummary: s\n---\nNo links here.\n"),
        ),
        // Short-form link must NOT match the full-path target.
        (
            "wiki/people/bob.md",
            String::from("---\ntype: wiki-page\nsummary: s\n---\n[[sarah-chen]]\n"),
        ),
        // Boundary correctness: `sarah-chen-jr` starts with the target but is a
        // different path, so it must NOT match.
        (
            "wiki/people/jr.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}-jr]]\n"),
        ),
    ];
    for (rel, body) in fixtures {
        write(root, rel, &body);
    }

    let store = open(&dir);
    let got = rels(&store.find_links_to(Path::new(target)).unwrap());

    let expected: Vec<String> = [
        "records/contacts/index.md",
        "records/meetings/2026/05/m.md",
        "wiki/people/sarah.md",
        "wiki/themes/t.md",
    ]
    .iter()
    .map(|linker| linker.to_string())
    .collect();
    assert_eq!(got, expected);
}
1778
/// `records/contacts/sarah` is a strict prefix of `records/contacts/sarah-chen`.
/// A link to either one must be attributed to that contact only, never to its
/// prefix-sharing sibling.
#[test]
fn find_links_to_distinguishes_sibling_paths() {
    let dir = empty_store();
    let root = dir.path();
    let fixtures = [
        (
            "wiki/a.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah]]\n",
        ),
        (
            "wiki/b.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
    ];
    for (rel, body) in fixtures {
        write(root, rel, body);
    }
    let store = open(&dir);

    // Store-relative linkers of one target, rendered through `rels`.
    let linkers_of = |target: &str| rels(&store.find_links_to(Path::new(target)).unwrap());

    assert_eq!(
        linkers_of("records/contacts/sarah"),
        vec!["wiki/a.md".to_string()]
    );
    assert_eq!(
        linkers_of("records/contacts/sarah-chen"),
        vec!["wiki/b.md".to_string()]
    );
}
1814
1815 // ── find_links_to_any (batch — the O(changed × store) fix) ─────────────────
1816
/// Pins the batch contract behind the working-set validate's incoming-linker
/// discovery, which passes the WHOLE changed set to `find_links_to_any` in one
/// pass. The result has to be the union of incoming linkers over every target,
/// with per-target boundary correctness intact (no alternation arm may bleed
/// into a prefix-sharing sibling). A regression back to a per-object loop would
/// still satisfy the union; the boundary and union-equivalence assertions are
/// what protect the *correctness* of folding N scans into one regex.
#[test]
fn find_links_to_any_returns_the_union_with_boundary_correctness() {
    let dir = empty_store();
    let root = dir.path();

    // (store-relative path, file content), written in this order.
    let fixtures = [
        // Two distinct targets, each with its own linker.
        (
            "wiki/links-sarah.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
        (
            "wiki/links-acme.md",
            "---\ntype: wiki-page\nsummary: s\n---\nDeal with [[records/companies/acme|Acme]].\n",
        ),
        // One file links to BOTH targets. It must show up exactly once: hits on
        // several targets fold into a single result row for the file.
        (
            "records/meetings/2026/05/m.md",
            "---\ntype: meeting\nsummary: s\n---\n[[records/contacts/sarah-chen]] re [[records/companies/acme]]\n",
        ),
        // Prefix-sharing sibling: a link to `sarah-chen-jr` must NOT count as a
        // link to `sarah-chen`, even with `sarah-chen` as an alternation arm.
        (
            "wiki/links-jr.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen-jr]]\n",
        ),
        // Links to neither requested target.
        (
            "wiki/unrelated.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[wiki/themes/spend]]\n",
        ),
    ];
    for (rel, body) in fixtures {
        write(root, rel, body);
    }

    let store = open(&dir);
    let targets = vec![
        PathBuf::from("records/contacts/sarah-chen"),
        PathBuf::from("records/companies/acme"),
    ];

    let got = rels(&store.find_links_to_any(&targets).unwrap());
    let expected: Vec<String> = [
        "records/meetings/2026/05/m.md",
        "wiki/links-acme.md",
        "wiki/links-sarah.md",
    ]
    .iter()
    .map(|linker| linker.to_string())
    .collect();
    assert_eq!(
        got, expected,
        "batch finder must return the deduped union of linkers across all \
         targets, excluding the prefix-sibling and the unrelated file"
    );

    // Equivalence: the batch answer must equal the union of the single-target
    // finder run once per target. The working-set path relies on this when it
    // folds one-scan-per-object into one scan for the whole set.
    let union: std::collections::BTreeSet<PathBuf> = targets
        .iter()
        .flat_map(|t| store.find_links_to(t).unwrap())
        .collect();
    let per_target: Vec<PathBuf> = union.into_iter().collect();
    assert_eq!(
        rels(&per_target),
        got,
        "find_links_to_any must equal the union of per-target find_links_to"
    );
}
1899
/// With no targets there is nothing to look for, so the result must be empty.
/// In particular the finder must NOT end up with a match-everything empty
/// regex, which would report every `.md` file as a linker. This is the
/// empty-working-set fast path the `validate` loop takes when nothing changed.
#[test]
fn find_links_to_any_empty_targets_matches_nothing() {
    let dir = empty_store();
    let root = dir.path();
    // One real linker in the store, so a match-everything bug would be visible.
    write(
        root,
        "wiki/a.md",
        "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
    );
    let store = open(&dir);

    let for_no_targets = store.find_links_to_any(&[]).unwrap();
    assert!(
        for_no_targets.is_empty(),
        "no targets ⇒ no linkers (an empty pattern must not match every file)"
    );

    // Targets that are all empty/non-link are likewise a no-op rather than a
    // match-everything.
    let blank_targets = [PathBuf::from(""), PathBuf::from("./")];
    let for_blank_targets = store.find_links_to_any(&blank_targets).unwrap();
    assert!(
        for_blank_targets.is_empty(),
        "targets that render to empty link text contribute no alternation arm"
    );
}
1929
1930 // ── read_type_index ──────────────────────────────────────────────────────
1931
/// A two-line sidecar must parse into two records: the typed columns land in
/// their own fields, any other key is flattened into `fields`, and absent
/// `tags`/`links` default to empty.
#[test]
fn read_type_index_parses_records_and_flattens_fields() {
    let dir = empty_store();
    let root = dir.path();
    // First line carries every typed column plus two extra keys; the second
    // omits `tags`/`links` and has null timestamps.
    let jsonl = concat!(
        r#"{"path":"records/expenses/2026/05/a.md","type":"expense","summary":"lunch","tags":["meals"],"links":["records/companies/acme"],"created":"2026-05-01T00:00:00Z","updated":"2026-05-01T00:00:00Z","vendor":"acme","amount":42}"#,
        "\n",
        r#"{"path":"records/expenses/2026/05/b.md","type":"expense","summary":"taxi","created":null,"updated":null,"vendor":"yellow"}"#,
        "\n",
    );
    let sidecar = write(root, "records/expenses/index.jsonl", jsonl);
    let store = open(&dir);
    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();

    assert_eq!(recs.len(), 2);
    let (first, second) = (&recs[0], &recs[1]);

    // Sorted by path asc, so the `a.md` record comes first.
    assert_eq!(first.path, PathBuf::from("records/expenses/2026/05/a.md"));
    assert_eq!(first.type_, "expense");
    assert_eq!(first.summary, "lunch");
    assert_eq!(first.tags, vec!["meals".to_string()]);
    assert_eq!(first.links, vec!["records/companies/acme".to_string()]);
    assert!(first.created.is_some());

    // Extra (non-typed) frontmatter flattens into `fields`.
    let vendor = first.fields.get("vendor");
    assert_eq!(vendor, Some(&serde_json::json!("acme")));
    let amount = first.fields.get("amount");
    assert_eq!(amount, Some(&serde_json::json!(42)));

    // Defaults: missing tags/links → empty.
    assert!(second.tags.is_empty());
    assert!(second.links.is_empty());
}
1962
/// When the same path appears on two sidecar lines, the later line replaces the
/// earlier one. A blank line between them is skipped rather than treated as a
/// parse error.
#[test]
fn read_type_index_last_write_wins_and_skips_blanks() {
    let dir = empty_store();
    let root = dir.path();
    // "old" record, an empty line, then the superseding "new" record.
    let jsonl = concat!(
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"old","created":null,"updated":null}"#,
        "\n\n",
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"new","created":null,"updated":null}"#,
        "\n",
    );
    let sidecar = write(root, "records/contacts/index.jsonl", jsonl);
    let store = open(&dir);

    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();
    assert_eq!(recs.len(), 1, "duplicate path collapses to one record");
    let survivor = &recs[0];
    assert_eq!(survivor.summary, "new", "later line must win");
}
1980
/// A sidecar line that is not valid JSON must surface as
/// `StoreError::BadTypeIndex` instead of being skipped.
#[test]
fn read_type_index_errors_on_malformed_line() {
    let dir = empty_store();
    let sidecar = write(
        dir.path(),
        "records/contacts/index.jsonl",
        "{not valid json}\n",
    );
    let store = open(&dir);

    let sidecar_abs = store.abs_path(&sidecar);
    let is_bad_index = matches!(
        store.read_type_index(&sidecar_abs).unwrap_err(),
        StoreError::BadTypeIndex { .. }
    );
    assert!(is_bad_index);
}
1990
1991 // ── find_by_type / find_by_where ─────────────────────────────────────────
1992
/// Builds one newline-terminated `index.jsonl` line for the tests below.
///
/// `path`, `type_` and `summary` are spliced in as JSON string values without
/// any escaping, and `created`/`updated` are always `null`. `extra` is inserted
/// verbatim just before the closing brace, so a non-empty `extra` must start
/// with a comma (e.g. `,"vendor":"acme"`).
fn jsonl_line(path: &str, type_: &str, summary: &str, extra: &str) -> String {
    [
        "{\"path\":\"",
        path,
        "\",\"type\":\"",
        type_,
        "\",\"summary\":\"",
        summary,
        "\",\"created\":null,\"updated\":null",
        extra,
        "}\n",
    ]
    .concat()
}
1998
/// `find_by_type("contact")` must answer from the canonical `records/contacts`
/// sidecar, return its records path-sorted, and include nothing from another
/// type's sidecar.
#[test]
fn find_by_type_reads_canonical_folder_sidecar() {
    let dir = empty_store();
    let root = dir.path();

    // Canonical folder for `contact` is records/contacts. Sarah is written
    // before Elena, so the asserted order below cannot be the file order.
    let contacts = [
        jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
        jsonl_line("records/contacts/elena.md", "contact", "Elena", ""),
    ]
    .concat();
    write(root, "records/contacts/index.jsonl", &contacts);

    // A different type's sidecar must not leak into a contact query.
    let companies = jsonl_line("records/companies/acme.md", "company", "Acme", "");
    write(root, "records/companies/index.jsonl", &companies);

    let store = open(&dir);
    let recs = store.find_by_type("contact").unwrap();

    let summaries: Vec<&str> = recs.iter().map(|rec| rec.summary.as_str()).collect();
    assert_eq!(summaries, ["Elena", "Sarah"]); // path-sorted
    assert!(!recs.iter().any(|rec| rec.type_ != "contact"));
}
2022
/// When the canonical sidecar for a type does not exist, `find_by_type` falls
/// back, and that fallback stays inside the type's own layer. A same-layer
/// record under a non-canonical folder is still found; a same-type record in
/// another layer is not.
#[test]
fn find_by_type_canonical_absent_falls_back_within_the_layer_only() {
    let dir = empty_store();
    let root = dir.path();

    // A custom `proposal` record filed under the natural plural folder
    // `records/proposals/`. `default_type_folder("proposal")` is
    // `records/proposal` (bare type, no pluralization guess), so the canonical
    // sidecar is absent and the fallback runs. The fallback is bounded to the
    // records layer, which still contains this record.
    let in_layer = jsonl_line("records/proposals/p1.md", "proposal", "Q3 proposal", "");
    write(root, "records/proposals/index.jsonl", &in_layer);

    // A DECOY of the SAME type in a DIFFERENT layer (sources/). The old
    // whole-store fallback read every sidecar and would have leaked it into
    // the result; the layer-bounded fallback must not.
    let decoy = jsonl_line(
        "sources/proposals/leak.md",
        "proposal",
        "cross-layer decoy",
        "",
    );
    write(root, "sources/proposals/index.jsonl", &decoy);

    let store = open(&dir);
    let recs = store.find_by_type("proposal").unwrap();

    assert_eq!(
        recs.len(),
        1,
        "only the records-layer proposal, not the sources decoy"
    );
    let found = &recs[0];
    assert_eq!(found.summary, "Q3 proposal");
    assert_eq!(found.path, PathBuf::from("records/proposals/p1.md"));
}
2063
/// The `find_by_type` fallback is layer-bounded, not store-wide.
///
/// `email`'s canonical folder is `sources/emails` (layer Sources). No sidecar
/// exists there, so `find_by_type("email")` falls back, and that fallback may
/// only walk the Sources layer.
///
/// The proof is structural, the same technique used by
/// `find_by_where_in_layer_reads_only_that_layers_sidecars`. Alongside a valid
/// Records sidecar, a *malformed* one is planted in the Records layer.
/// `read_type_index` rejects a malformed line with `BadTypeIndex`, so a lookup
/// that succeeds shows the Records layer's sidecars were never read. A valid
/// `contact` sidecar alone cannot show this: a store-wide fallback would read
/// it, filter it out by type, and still return an empty result.
#[test]
fn find_by_type_canonical_absent_does_not_read_other_layers() {
    let dir = empty_store();
    let root = dir.path();

    // A populated, well-formed sidecar in the Records layer.
    write(
        root,
        "records/contacts/index.jsonl",
        &jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
    );
    // Tripwire: a corrupt Records sidecar that turns any out-of-layer read
    // into an error instead of a silently filtered (and undetectable) no-op.
    write(
        root,
        "records/companies/index.jsonl",
        "{ this is not valid json and would error if read }\n",
    );

    let store = open(&dir);

    // No email anywhere ⇒ empty, and the records layer was not in scope.
    let emails = store
        .find_by_type("email")
        .expect("a sources-scoped fallback must not touch the records sidecars");
    assert!(emails.is_empty());
}
2084
/// `find_by_where` must match on a flattened extra field (`vendor`), on a typed
/// scalar column (`type`), and on membership in a typed list column (`tags`);
/// a value nobody carries yields an empty result.
#[test]
fn find_by_where_matches_typed_columns_and_flat_fields() {
    let dir = empty_store();
    let root = dir.path();

    // Two expenses with a `vendor` field; only the first also has tags.
    let expenses = [
        jsonl_line(
            "records/expenses/a.md",
            "expense",
            "lunch",
            ",\"vendor\":\"acme\",\"tags\":[\"meals\"]",
        ),
        jsonl_line(
            "records/expenses/b.md",
            "expense",
            "taxi",
            ",\"vendor\":\"yellow\"",
        ),
    ]
    .concat();
    write(root, "records/expenses/index.jsonl", &expenses);

    // One contact tagged `customer`.
    let contacts = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        ",\"tags\":[\"customer\"]",
    );
    write(root, "records/contacts/index.jsonl", &contacts);

    let store = open(&dir);

    // Flat field in `fields`.
    let by_vendor = store.find_by_where("vendor", "acme").unwrap();
    assert_eq!(by_vendor.len(), 1);
    assert_eq!(by_vendor[0].path, PathBuf::from("records/expenses/a.md"));

    // Typed column: type (spans both expense records).
    let expenses_by_type = store.find_by_where("type", "expense").unwrap();
    assert_eq!(expenses_by_type.len(), 2);

    // Typed list column: tags membership.
    let customers = store.find_by_where("tags", "customer").unwrap();
    assert_eq!(customers.len(), 1);
    assert_eq!(
        customers[0].path,
        PathBuf::from("records/contacts/sarah.md")
    );

    // No match → empty.
    let by_unknown_vendor = store.find_by_where("vendor", "nobody").unwrap();
    assert!(by_unknown_vendor.is_empty());
}
2135
/// Timestamp columns are compared as instants, not as strings: any RFC 3339
/// spelling of the stored instant matches, a different instant does not, and a
/// query value that is not RFC 3339 never matches.
#[test]
fn find_by_where_matches_timestamps_across_rfc3339_spellings() {
    let dir = empty_store();
    let root = dir.path();
    // `created` is stored in the `Z` UTC spelling, `updated` with an explicit
    // -07:00 offset.
    let sidecar = concat!(
        r#"{"path":"records/meetings/kickoff.md","type":"meeting","summary":"kickoff","created":"2026-05-01T00:00:00Z","updated":"2026-05-02T09:30:00-07:00"}"#,
        "\n",
    );
    write(root, "records/meetings/index.jsonl", sidecar);
    let store = open(&dir);

    // The exact value an agent reads out of the file (`Z` form) must match.
    let by_z = store
        .find_by_where("created", "2026-05-01T00:00:00Z")
        .unwrap();
    assert_eq!(by_z.len(), 1);
    assert_eq!(by_z[0].path, PathBuf::from("records/meetings/kickoff.md"));

    // Other spellings that must each hit exactly the one record.
    let single_hit_queries = [
        // The explicit-offset spelling of the stored `created` instant.
        ("created", "2026-05-01T00:00:00+00:00"),
        // A non-UTC stored value matches its own offset spelling…
        ("updated", "2026-05-02T09:30:00-07:00"),
        // …and the same instant expressed as `Z`.
        ("updated", "2026-05-02T16:30:00Z"),
    ];
    for (field, value) in single_hit_queries {
        assert_eq!(store.find_by_where(field, value).unwrap().len(), 1);
    }

    // Queries on `created` that must match nothing: a different instant, then
    // a value that is not RFC 3339 at all.
    for value in ["2026-05-01T00:00:01Z", "2026-05-01"] {
        assert!(store.find_by_where("created", value).unwrap().is_empty());
    }
}
2195
/// The O(entities-in-layer) contract: a layer-scoped where read walks ONLY the
/// named layer's subtree. The proof is structural. A *malformed* sidecar in
/// another layer makes `read_type_index` fail if it is ever read, so a scoped
/// read that succeeds shows the other layer's I/O never happened.
#[test]
fn find_by_where_in_layer_reads_only_that_layers_sidecars() {
    let dir = empty_store();
    let root = dir.path();

    // A healthy records-layer sidecar carrying the field being queried.
    let companies = jsonl_line(
        "records/companies/acme.md",
        "company",
        "Acme",
        ",\"domain\":\"acme.com\"",
    );
    write(root, "records/companies/index.jsonl", &companies);

    // The sources-layer sidecar is corrupt.
    write(
        root,
        "sources/emails/index.jsonl",
        "{ this is not valid json and would error if read }\n",
    );
    let store = open(&dir);

    // Scoped to records: the corrupt sources sidecar is out of scope, so the
    // read succeeds and returns only the records-layer match.
    let in_records = store
        .find_by_where_in("domain", "acme.com", Some(Layer::Records))
        .expect("a records-scoped read must not touch the sources sidecar");
    let matched_paths = in_records
        .iter()
        .map(|rec| rec.path.clone())
        .collect::<Vec<_>>();
    assert_eq!(
        rels(&matched_paths),
        vec!["records/companies/acme.md".to_string()]
    );

    // The store-wide read DOES reach the corrupt sidecar and reports it as a
    // parse error. This confirms the corrupt file really is in the tree and
    // that only the layer scope spares it.
    let store_wide = store.find_by_where("domain", "acme.com");
    assert!(
        matches!(store_wide, Err(StoreError::BadTypeIndex { .. })),
        "unscoped read walks every layer and hits the corrupt sidecar"
    );

    // Scoping to the layer that holds the corrupt sidecar errors too, so the
    // scope is a real subtree bound and not a silent "skip whatever fails".
    let in_sources = store.find_by_where_in("domain", "acme.com", Some(Layer::Sources));
    assert!(matches!(in_sources, Err(StoreError::BadTypeIndex { .. })));
}
2253
/// A layer-scoped read over a layer folder that does not exist yet must come
/// back empty (mirroring `walk_layer`'s missing-dir guard) rather than
/// surfacing a walk error from `ignore` over a nonexistent path.
#[test]
fn find_by_where_in_missing_layer_is_empty_not_an_error() {
    let dir = empty_store();
    let root = dir.path();

    // Only the records layer is populated; `wiki/` is never created.
    let contacts = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        ",\"city\":\"denver\"",
    );
    write(root, "records/contacts/index.jsonl", &contacts);
    let store = open(&dir);

    let in_wiki = store
        .find_by_where_in("city", "denver", Some(Layer::Wiki))
        .expect("missing layer subtree is empty, not an error");
    assert!(in_wiki.is_empty());

    // The same query scoped to the layer that holds the record still finds it.
    let in_records = store
        .find_by_where_in("city", "denver", Some(Layer::Records))
        .unwrap();
    assert_eq!(in_records.len(), 1);
}
2285
2286 // ── abs_path / rel_path ──────────────────────────────────────────────────
2287
/// `abs_path` and `rel_path` are inverses for paths inside the store.
/// `abs_path` leaves an already-absolute path alone, and `rel_path` yields
/// `None` for a path outside the store.
#[test]
fn abs_and_rel_path_roundtrip() {
    let dir = empty_store();
    let store = open(&dir);

    let relative = Path::new("records/contacts/sarah.md");
    let absolute = store.abs_path(relative);

    // Relative → absolute joins onto the store root, and back again.
    let joined_on_root = dir.path().join(relative);
    assert_eq!(absolute, joined_on_root);
    let round_tripped = store.rel_path(&absolute);
    assert_eq!(round_tripped.as_deref(), Some(relative));

    // An absolute path is passed through unchanged by abs_path.
    let passed_through = store.abs_path(&absolute);
    assert_eq!(passed_through, absolute);

    // A path outside the store has no store-relative form.
    let outside = Path::new("/somewhere/else.md");
    assert_eq!(store.rel_path(outside), None);
}
2303
2304 // ── infer_type_from_path (inverse of default_type_folder) ────────────────
2305
/// Every recognized `<layer>/<type-folder>` pair must infer its type. Any
/// folder under `wiki/` infers `wiki-page`, whatever the topic is called.
#[test]
fn infer_type_maps_every_recognized_folder_back_to_its_type() {
    // (file path, type it must infer)
    const CASES: &[(&str, &str)] = &[
        ("sources/emails/x.md", "email"),
        ("sources/transcripts/x.md", "transcript"),
        ("sources/docs/x.md", "pdf-source"),
        ("records/contacts/x.md", "contact"),
        ("records/companies/x.md", "company"),
        ("records/expenses/x.md", "expense"),
        ("records/meetings/x.md", "meeting"),
        ("records/decisions/x.md", "decision"),
        ("records/invoices/x.md", "invoice"),
        // Any wiki sub-folder infers `wiki-page` regardless of the topic name.
        ("wiki/topics/x.md", "wiki-page"),
        ("wiki/pricing/x.md", "wiki-page"),
    ];

    for &(path, expected) in CASES {
        let inferred = infer_type_from_path(Path::new(path));
        assert_eq!(
            inferred.as_deref(),
            Some(expected),
            "path {path} should infer type {expected}"
        );
    }
}
2330
/// The canonical invariant: inference is the inverse of the forward map. Each
/// recognized type, sent through `default_type_folder` and back through
/// `infer_type_from_path`, must come out as the same type. `wiki-page` is the
/// one many-to-one case (every topic folder maps back to `wiki-page`), so its
/// forward folder still round-trips.
#[test]
fn infer_type_round_trips_with_default_type_folder() {
    const RECOGNIZED: &[&str] = &[
        "email",
        "transcript",
        "pdf-source",
        "contact",
        "company",
        "expense",
        "meeting",
        "decision",
        "invoice",
        "wiki-page",
    ];

    for &type_ in RECOGNIZED {
        // Forward: type → canonical folder → a file inside it.
        let folder = default_type_folder(type_);
        let file = folder.join("x.md");
        // Backward: that file's path → type.
        let inferred = infer_type_from_path(&file);
        assert_eq!(
            inferred.as_deref(),
            Some(type_),
            "recognized type {type_} (folder {folder:?}) must round-trip"
        );
    }
}
2360
/// Regression guard for the CLI/core divergence. For an unrecognized type,
/// `default_type_folder` falls back to the BARE type name
/// (`task → records/task`, `tasks → records/tasks`). Inference must NOT
/// singularize, otherwise a custom type would not round-trip: `records/tasks`
/// inferring `task` would clash with `default_type_folder("task")`, which is
/// `records/task`.
#[test]
fn infer_type_round_trips_custom_types_verbatim_no_singularization() {
    const CUSTOM_TYPES: &[&str] = &["task", "tasks", "playbook", "process", "okrs", "ticket"];

    for &custom in CUSTOM_TYPES {
        // Forward map: the folder is `records/<type>` verbatim.
        let folder = default_type_folder(custom);
        assert_eq!(folder, PathBuf::from("records").join(custom));

        // Inverse map: the same spelling comes back.
        let file = folder.join("x.md");
        let inferred = infer_type_from_path(&file);
        assert_eq!(
            inferred.as_deref(),
            Some(custom),
            "custom type {custom} must round-trip verbatim (no singularization)"
        );
    }

    // The specific case named in the finding: a plural custom folder keeps its
    // trailing `s`; it is NOT singularized to `task`.
    let plural_folder_file = Path::new("records/tasks/x.md");
    assert_eq!(
        infer_type_from_path(plural_folder_file).as_deref(),
        Some("tasks"),
        "records/tasks must infer `tasks`, not `task`"
    );
}
2387
/// Inference needs the `<layer>/<type-folder>/<file>` shape with a known layer
/// first. Shallower paths and unknown layers infer nothing; deeper paths infer
/// from the first type-folder segment.
#[test]
fn infer_type_requires_three_component_layer_folder_file_shape() {
    // Fewer than 3 components: a file directly under a layer has no
    // type-folder, so inference yields None (matches the old CLI contract).
    for shallow in ["records/x.md", "sources/x.md", "wiki/x.md", "x.md"] {
        assert_eq!(infer_type_from_path(Path::new(shallow)), None);
    }

    // Unknown leading layer is never inferred.
    let unknown_layer = Path::new("foo/bar/x.md");
    assert_eq!(infer_type_from_path(unknown_layer), None);

    // Deeper paths still infer from the first type-folder segment (e.g. a
    // sharded record under records/expenses/2026/05/x.md).
    let sharded = Path::new("records/expenses/2026/05/x.md");
    assert_eq!(infer_type_from_path(sharded).as_deref(), Some("expense"));
}
2405}