dbmd_core/store.rs
1//! `store` — walk, locate, and shard a db.md store.
2//!
3//! A db.md store is one directory marked by an uppercase `DB.md` at its root.
//! [`Store::open_strict`] is the gate normal store-walking subcommands go
//! through ([`Store::open`] is its lenient, validation-oriented sibling); a
//! missing `DB.md` is the [`NotAStore`] error (`NOT_A_STORE`). The toolkit
//! never guesses a store root.
7//!
8//! Scale discipline lives here: [`Store::walk`] and the layer/type-folder
9//! walks are **SWEEP** primitives used only by `validate --all`,
10//! `index rebuild`, and `stats`. The interactive loop instead uses
11//! [`Store::find_links_to`] / [`Store::find_links_to_any`] (embedded ripgrep,
12//! presence-only) and the `index.jsonl` sidecar readers
13//! ([`Store::find_by_type`] / [`Store::find_by_where`] /
14//! [`Store::read_type_index`]) — never a whole-store parse. The batch
15//! [`Store::find_links_to_any`] is what keeps the working-set validate's
16//! incoming-linker discovery a single store scan rather than one scan per
17//! changed object.
18
19use std::collections::BTreeMap;
20use std::path::{Path, PathBuf};
21
22use chrono::{DateTime, Datelike, FixedOffset};
23use grep::regex::RegexMatcher;
24use grep::searcher::sinks::Lossy;
25use grep::searcher::Searcher;
26use ignore::WalkBuilder;
27
28use crate::index::IndexRecord;
29use crate::parser::{parse_db_md, Config, Frontmatter};
30
/// Basenames the content walks skip because they name a catalog rather than a
/// record. The list currently holds exactly one entry, `index.md`, so a SWEEP
/// over the content layers never mistakes that catalog for a record.
///
/// Only `index.md` is excluded by basename, because the content walks traverse
/// the layer dirs (`sources/`/`records/`/`wiki/`) and `index.md` is the only
/// meta file that appears INSIDE them. The root `DB.md` / `log.md` (and the
/// `log/` archive) live at the store root, outside every layer, so they are
/// never reached by these walks — and a content file that merely happens to be
/// named `DB.md` or `log.md` inside a layer (e.g. `records/docs/DB.md`) is real
/// content the SPEC does NOT reserve at type-folder depth.
const NON_CONTENT_BASENAMES: [&str; 1] = ["index.md"];
43
/// Basename of the type-folder sidecar: the complete machine-twin catalog that
/// backs every structured read. The sidecar locator matches files by this
/// exact name.
const TYPE_INDEX_FILE: &str = "index.jsonl";
46
/// Returned when a path is opened as a store but has no `DB.md` at its root.
/// Surfaced as the structured code `NOT_A_STORE` with a non-zero exit.
///
/// Produced by both [`Store::open`] and [`Store::open_strict`] when
/// [`Store::is_db_md_store`] rejects the path.
#[derive(Debug, thiserror::Error)]
#[error("not a db.md store: {path} has no DB.md")]
pub struct NotAStore {
    /// The path that was inspected.
    pub path: PathBuf,
}
55
/// Errors from store-level operations (walk, locate, shard, sidecar read).
///
/// Each variant carries the path it concerns so the rendered message is
/// actionable on its own.
#[derive(Debug, thiserror::Error)]
pub enum StoreError {
    /// A sidecar `index.jsonl` could not be read or parsed. The `message`
    /// holds the underlying I/O error text or a `line N: …` JSON parse error.
    #[error("failed to read type index {path}: {message}")]
    BadTypeIndex {
        /// The sidecar file.
        path: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// A required date field for sharding was absent or unparseable, and there
    /// was no usable fallback.
    #[error("cannot compute shard path for {file}: no usable date field")]
    NoShardDate {
        /// The file being placed.
        file: PathBuf,
    },

    /// An embedded-ripgrep scan failed to start or run. Also used for
    /// directory-walk failures, with `root` set to the directory being walked.
    #[error("search failed under {root}: {message}")]
    Search {
        /// The root the scan ran under.
        root: PathBuf,
        /// What went wrong.
        message: String,
    },

    /// An underlying I/O failure (converted automatically via `From`).
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
89
/// One of the three canonical top-level layers of a db.md store.
///
/// `PartialOrd`/`Ord` are derived because sibling modules key `BTreeMap`s on
/// `Layer` (e.g. `stats::Stats::files_per_layer`); variants sort in
/// declaration order: `Sources` < `Records` < `Wiki`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// `sources/` — raw evidence; immutable; date-sharded at scale.
    Sources,
    /// `records/` — atomic typed data; entity types flat, event types sharded.
    Records,
    /// `wiki/` — curator-synthesized narrative; flat.
    Wiki,
}

impl Layer {
    /// The folder name this layer uses on disk: `"sources"`, `"records"`, or
    /// `"wiki"`.
    pub fn dir_name(self) -> &'static str {
        match self {
            Self::Sources => "sources",
            Self::Records => "records",
            Self::Wiki => "wiki",
        }
    }

    /// Inverse of [`Layer::dir_name`]: the layer whose folder name is exactly
    /// `name` (case-sensitive), or `None` when no layer matches.
    pub fn from_dir_name(name: &str) -> Option<Self> {
        Self::all()
            .iter()
            .copied()
            .find(|layer| layer.dir_name() == name)
    }

    /// All layers in canonical order (`Sources`, `Records`, `Wiki`).
    pub fn all() -> [Layer; 3] {
        [Self::Sources, Self::Records, Self::Wiki]
    }
}
131
/// An opened db.md store: its root path plus the parsed `DB.md` [`Config`].
///
/// Construct via [`Store::open_strict`] (normal commands) or the lenient
/// [`Store::open`] (validation); both confirm the `DB.md` marker first, so
/// downstream code can assume a real store. With the lenient path, `config`
/// is the default config when `DB.md` could not be read or parsed.
#[derive(Debug, Clone)]
pub struct Store {
    /// The store root (the directory containing `DB.md`).
    pub root: PathBuf,
    /// The parsed `DB.md` config (agent instructions, policies, schemas).
    pub config: Config,
}
143
144impl Store {
/// Report whether `path` is a db.md store root, i.e. whether the directory
/// holds a non-directory entry whose stored name is exactly `DB.md`.
///
/// The directory listing is consulted instead of probing
/// `path.join("DB.md")`: on a case-insensitive filesystem (macOS default) a
/// lowercase `db.md` would answer an uppercase probe, whereas `read_dir`
/// yields the real on-disk name, so the exact comparison is right on both
/// case-sensitive and case-insensitive filesystems. The lowercase name
/// refers to the project/spec and must NOT count as the marker.
///
/// Returns `false` when the directory cannot be read, when no `DB.md` entry
/// exists, when the entry is a directory, or when its type cannot be read.
pub fn is_db_md_store(path: &Path) -> bool {
    std::fs::read_dir(path)
        .map(|listing| {
            listing
                // Unreadable entries are skipped, not treated as failures.
                .flatten()
                .find(|candidate| candidate.file_name() == "DB.md")
                // A directory literally named `DB.md` is not the marker.
                .map_or(false, |marker| {
                    matches!(marker.file_type(), Ok(kind) if !kind.is_dir())
                })
        })
        .unwrap_or(false)
}
171
172 /// Open `path` as a db.md store and require `DB.md` to be readable and
173 /// parseable. Normal commands should enter through this strict gate so a
174 /// damaged config cannot silently disable schema or policy rules.
175 pub fn open_strict(path: &Path) -> crate::Result<Store> {
176 if !Store::is_db_md_store(path) {
177 return Err(NotAStore {
178 path: path.to_path_buf(),
179 }
180 .into());
181 }
182 let db_md = path.join("DB.md");
183 let text = std::fs::read_to_string(&db_md)?;
184 let config = parse_db_md(&text, &db_md)?;
185 Ok(Store {
186 root: path.to_path_buf(),
187 config,
188 })
189 }
190
191 /// Open `path` as a db.md store: confirm the `DB.md` marker (else
192 /// [`NotAStore`]) and parse the `DB.md` config when possible. This is the
193 /// lenient validation-oriented open path: a damaged `DB.md` still marks the
194 /// directory as a store so `dbmd validate` can report the config error as an
195 /// issue. Normal CLI commands should use [`Store::open_strict`] instead.
196 pub fn open(path: &Path) -> Result<Store, NotAStore> {
197 if !Store::is_db_md_store(path) {
198 return Err(NotAStore {
199 path: path.to_path_buf(),
200 });
201 }
202 let db_md = path.join("DB.md");
203 // The marker exists; parse its config. A read or parse failure leaves
204 // the store openable with default config rather than masquerading as
205 // NOT_A_STORE — the marker is present, so this *is* a store; a damaged
206 // DB.md is `dbmd validate`'s job to report, not `open`'s.
207 let config = match std::fs::read_to_string(&db_md) {
208 Ok(text) => parse_db_md(&text, &db_md).unwrap_or_default(),
209 Err(_) => Config::default(),
210 };
211 Ok(Store {
212 root: path.to_path_buf(),
213 config,
214 })
215 }
216
217 /// **SWEEP.** Recursively iterate every `.md` content file across
218 /// `sources/`, `records/`, and `wiki/`, skipping hidden dirs and `log/`.
219 /// Used only by `validate --all`, `index rebuild`, and `stats` — never on
220 /// the interactive loop.
221 pub fn walk(&self) -> Result<Vec<PathBuf>, StoreError> {
222 // Only the three content layers — never root meta files (`DB.md`,
223 // `index.md`, `log.md`) and never `log/`, which live at root and are
224 // outside every layer dir.
225 let mut out = Vec::new();
226 for layer in Layer::all() {
227 out.extend(self.walk_layer(layer)?);
228 }
229 out.sort();
230 Ok(out)
231 }
232
233 /// **SWEEP.** Like [`Store::walk`] but scoped to a single layer.
234 pub fn walk_layer(&self, layer: Layer) -> Result<Vec<PathBuf>, StoreError> {
235 let layer_root = self.root.join(layer.dir_name());
236 if !layer_root.is_dir() {
237 return Ok(Vec::new());
238 }
239 self.walk_content_md(&layer_root)
240 }
241
242 /// Enumerate every `.md` file in a single type-folder, **recursing through
243 /// its date-shards** (`sources/emails/**/*.md`). The unit the index builder
244 /// and per-folder rebuild operate on. SWEEP-class (scoped to one folder).
245 pub fn walk_type_folder(&self, type_folder: &Path) -> Result<Vec<PathBuf>, StoreError> {
246 let abs = self.resolve_under_root(type_folder);
247 if !abs.is_dir() {
248 return Ok(Vec::new());
249 }
250 self.walk_content_md(&abs)
251 }
252
253 /// The ≤`n` most-recent files in a type-folder by frontmatter `updated`
254 /// (descending), ties broken by store-relative path (ascending) — a total
255 /// order, so write-through and rebuild never disagree on #500 vs #501.
256 ///
257 /// Reads `updated` across the folder's shards — a SWEEP cost absorbed into
258 /// `index rebuild`. The write-through path never calls this. The
259 /// cap-selection primitive for the 500-entry `index.md` browse view.
260 pub fn recent_in_type_folder(
261 &self,
262 type_folder: &Path,
263 n: usize,
264 ) -> Result<Vec<PathBuf>, StoreError> {
265 let files = self.walk_type_folder(type_folder)?;
266 // (updated, rel-path) for each file. Files missing/unparseable
267 // `updated` sort *after* dated ones (None last), then by path — so they
268 // are deterministically the lowest-priority candidates for the cap, not
269 // dropped silently. The total order (updated desc, path asc) is what
270 // keeps write-through and rebuild agreeing on #500 vs #501.
271 let mut keyed: Vec<(Option<DateTime<FixedOffset>>, PathBuf)> = files
272 .into_iter()
273 .map(|rel| {
274 let updated = self.read_updated(&self.abs_path(&rel));
275 (updated, rel)
276 })
277 .collect();
278 keyed.sort_by(|a, b| {
279 // `updated` descending: newest first. `None` is treated as the
280 // oldest possible, so dated files always win a cap slot over
281 // undated ones.
282 let by_updated = b.0.cmp(&a.0);
283 by_updated.then_with(|| a.1.cmp(&b.1))
284 });
285 keyed.truncate(n);
286 Ok(keyed.into_iter().map(|(_, rel)| rel).collect())
287 }
288
289 /// The shard/flat predicate: true if the type date-shards, false if it
290 /// stays flat. True for source types and event record types
291 /// (`expense`/`invoice`/`meeting` + custom `order`/`ticket`/`transaction`),
292 /// or when `DB.md ## Schemas` declares `shard: by-date`. False for
293 /// dedup-bounded entity types (`contact`/`company`/`decision`) and `wiki/`.
294 pub fn type_shards(&self, type_: &str) -> bool {
295 // A `DB.md ## Schemas` `### <type>` block with a `shard:` directive is
296 // authoritative — it is the v0.2 generic-model way to declare sharding,
297 // so it overrides the built-in default below (in either direction).
298 if let Some(shard) = self.config.schemas.get(type_).and_then(|s| s.shard) {
299 return shard;
300 }
301 // Built-in default for the example types. Sharding is a property of the
302 // *type*:
303 // - source types carry a primary date field and shard;
304 // - event record types track business volume and shard;
305 // - dedup-bounded entity types and curation-bounded wiki stay flat.
306 // Any type can override this via a `shard:` directive (above).
307 matches!(
308 type_,
309 // source types
310 "email" | "transcript" | "pdf-source"
311 // event record types (canonical)
312 | "expense" | "invoice" | "meeting"
313 // event record types (recognized custom, per the plan)
314 | "order" | "ticket" | "transaction"
315 )
316 }
317
318 /// Compute the canonical write path for a new file. For a sharding type
319 /// (per [`Store::type_shards`]) insert `<YYYY>/<MM>/` from the type's
320 /// primary date field (`email.date`, `expense.date`, … fallback `created`)
321 /// under the type folder; flat types and `wiki/` get no shard segment.
322 /// Deterministic + stable: same input → same path, so a record never moves
323 /// once written.
324 pub fn shard_path_for(
325 &self,
326 type_: &str,
327 frontmatter: &Frontmatter,
328 name: &str,
329 ) -> Result<PathBuf, StoreError> {
330 self.shard_path_in(&default_type_folder(type_), type_, frontmatter, name)
331 }
332
333 /// Like [`Store::shard_path_for`], but compute the path under an explicit,
334 /// caller-resolved type-folder rather than the canonical default. This lets a
335 /// write surface honour an agent-supplied conforming sub-folder — e.g.
336 /// `wiki/projects/`, `wiki/people/`, `wiki/synthesis/` (the SPEC files a
337 /// `wiki-page` under `wiki/<topic>/`, i.e. ANY topic sub-folder, not only the
338 /// `wiki/topics` default) — while still applying date-sharding for sharding
339 /// types. The folder must be a conforming `<layer>/<type-folder>` (2
340 /// components, recognized layer); the caller is responsible for that (see the
341 /// CLI's `resolve_write_path`), so it is taken as given here.
342 ///
343 /// Sharding is still a property of the *type*: a sharding type gets the
344 /// `<YYYY>/<MM>` segment under `folder`; a flat type lands directly in it.
345 pub fn shard_path_in(
346 &self,
347 folder: &Path,
348 type_: &str,
349 frontmatter: &Frontmatter,
350 name: &str,
351 ) -> Result<PathBuf, StoreError> {
352 let folder = folder.to_path_buf();
353 let filename = ensure_md_extension(name);
354
355 if !self.type_shards(type_) {
356 // Flat type (entity records, wiki, decisions): no shard segment.
357 return Ok(folder.join(filename));
358 }
359
360 // Sharding type: derive <YYYY>/<MM> from the primary date field, with
361 // `created` as the universal fallback. Reading the public `Frontmatter`
362 // fields directly (typed `created`/`updated` + raw `extra`) avoids the
363 // not-yet-implemented `Frontmatter::get`/`parse` and keeps this pure.
364 let (year, month) = self
365 .primary_shard_segment(type_, frontmatter)
366 .ok_or_else(|| StoreError::NoShardDate {
367 file: folder.join(&filename),
368 })?;
369
370 Ok(folder.join(year).join(month).join(filename))
371 }
372
373 /// Find files with an incoming wiki-link to `target`, via **embedded
374 /// ripgrep** for `[[target]]` across all layers. Loop-fast; no whole-graph
375 /// build. Returns store-relative paths.
376 pub fn find_links_to(&self, target: &Path) -> Result<Vec<PathBuf>, StoreError> {
377 // A single target is just the degenerate batch case — one alternation
378 // arm, one store scan. Routing through `find_links_to_any` keeps the
379 // pattern construction and the scan loop in exactly one place. The
380 // batch API takes `&[PathBuf]`, so the one-element slice is owned (a
381 // single alloc on this single-target convenience path; the batch path
382 // validate.rs rides is untouched).
383 self.find_links_to_any(&[target.to_path_buf()])
384 }
385
386 /// Find every file with an incoming wiki-link to **any** of `targets`, in a
387 /// **single embedded-ripgrep pass** over the store (one `.md` walk, one
388 /// presence-only scan per file). This is the batch incoming-linker finder the
389 /// working-set [`crate::validate::validate_working_set`] sits on: it must find
390 /// the linkers for the *whole* changed set without paying a full store read
391 /// per changed object. Cost is therefore one store scan (O(store)), NOT
392 /// `targets.len() × store` — calling [`find_links_to`](Self::find_links_to)
393 /// in a loop would reread every `.md` once per target and is the exact
394 /// `O(changed × store)` blow-up this method exists to prevent. Returns
395 /// store-relative paths (deduped, sorted).
396 ///
397 /// Why content scan and not the sidecar `links` field: the sidecar projects
398 /// only the frontmatter `links:` array, so it misses edges written in the
399 /// body or in typed fields (`company: [[…]]`). Finding an incoming link to an
400 /// arbitrary path therefore requires reading file content — the same reason
401 /// the single-target finder uses ripgrep.
402 pub fn find_links_to_any(&self, targets: &[PathBuf]) -> Result<Vec<PathBuf>, StoreError> {
403 // The wiki-link doctrine: a link is the full store-relative path, no
404 // `.md` extension. A reference to a target therefore appears literally
405 // as `[[<target>]]`, optionally with a `|display` suffix and (warned
406 // but accepted) a trailing `.md`. Build ONE regex that matches all
407 // accepted spellings of an incoming link to ANY target, escaping each
408 // target so path separators / dots stay literal and the alternation
409 // arms keep their boundaries (a link to `sarah` never matches
410 // `sarah-chen`).
411 let mut arms: Vec<String> = Vec::new();
412 for target in targets {
413 let target_str = path_to_link_str(target);
414 if target_str.is_empty() {
415 continue;
416 }
417 // [[ <target> (.md)? ( | display )? ]]
418 arms.push(format!(
419 r"\[\[{}(\.md)?(\|[^\]]*)?\]\]",
420 regex::escape(&target_str)
421 ));
422 }
423 // No usable targets → no possible incoming links, and an empty pattern
424 // would compile to a match-everything regex. Short-circuit instead.
425 if arms.is_empty() {
426 return Ok(Vec::new());
427 }
428 let pattern = arms.join("|");
429
430 let matcher = RegexMatcher::new(&pattern).map_err(|e| StoreError::Search {
431 root: self.root.clone(),
432 message: format!("invalid backlink pattern: {e}"),
433 })?;
434
435 let mut hits = std::collections::BTreeSet::new();
436 // Scan every `.md` file in the store (skip hidden + `log/`), including
437 // `index.md` catalogs — an incoming reference is wherever the literal
438 // link text lives; the caller decides relevance. ONE walk for the whole
439 // target set; per file we stop at the first hit (presence is all we
440 // need), so a file that links to several targets is read once, not once
441 // per target.
442 for rel in self.walk_all_md()? {
443 let abs = self.abs_path(&rel);
444 let mut matched_here = false;
445 let mut searcher = Searcher::new();
446 // `Lossy`, not `UTF8`: a `.md` file verbatim-ingested into
447 // `sources/` can carry a stray non-UTF-8 byte (e.g. a mis-decoded
448 // Latin-1 import). The `UTF8` sink runs `std::str::from_utf8` on
449 // each matched line and returns an `io::Error` on invalid bytes,
450 // which propagated out of `search_path` and aborted the *entire*
451 // store scan for every caller (`graph backlinks`, the working-set
452 // validate incoming-linker pass) — one bad byte on a single
453 // link-bearing line took the whole batch down. `Lossy` substitutes
454 // replacement characters instead of erroring; the closure ignores
455 // the line text entirely (presence is all we need), so the lossy
456 // conversion has no downside and the scan degrades to "still finds
457 // the link" rather than failing hard.
458 let res = searcher.search_path(
459 &matcher,
460 &abs,
461 Lossy(|_lnum, _line| {
462 matched_here = true;
463 // Stop at the first hit: presence is all we need.
464 Ok(false)
465 }),
466 );
467 if let Err(e) = res {
468 return Err(StoreError::Search {
469 root: self.root.clone(),
470 message: format!("search failed in {}: {e}", abs.display()),
471 });
472 }
473 if matched_here {
474 hits.insert(rel);
475 }
476 }
477 Ok(hits.into_iter().collect())
478 }
479
480 /// Candidate set for a `type` query: read every type-folder `index.jsonl`
481 /// sidecar in the type's single layer and return the records of that
482 /// `type`. Complete and cold-cache-proof — NOT a walk-and-parse or a
483 /// frontmatter ripgrep scan, and **never a store-wide read**.
484 ///
485 /// The read is bounded to the type's one layer subtree
486 /// (O(entities-in-layer)): a type lives in exactly one layer, and
487 /// `default_type_folder` always encodes it (recognized → its SPEC layer;
488 /// unrecognized → `records/`), so the walk never fans out across every
489 /// sidecar in the store and stays inside the interactive loop's
490 /// O(entities) contract.
491 ///
492 /// The whole-layer read — rather than reading only the type's canonical
493 /// folder sidecar when it happens to exist — is what makes the result
494 /// *complete*. A single `type` can legitimately be filed across several
495 /// folders within its layer: `wiki-page` under `wiki/<topic>/` for any
496 /// topic (SPEC), or a `contact` filed in `records/clients/` alongside the
497 /// canonical `records/contacts/`. The previous code read only the
498 /// canonical-guess sidecar whenever it was a file, which silently dropped
499 /// those non-canonical records the moment the canonical sidecar existed —
500 /// returning an incomplete set, and a *different* set as the store grew
501 /// (the omission flipped on once one canonical record was added). That
502 /// broke the dedup/enumeration premise this primitive backs and disagreed
503 /// with `find_by_where_in`, which already walks the whole layer. Filtering
504 /// the layer read by `type` keeps the result complete regardless of how the
505 /// type's records are foldered.
506 pub fn find_by_type(&self, type_: &str) -> Result<Vec<IndexRecord>, StoreError> {
507 let canonical_folder = default_type_folder(type_);
508 let records = self.read_all_type_indexes_in(layer_of_folder(&canonical_folder))?;
509 Ok(records.into_iter().filter(|r| r.type_ == type_).collect())
510 }
511
512 /// Candidate set for a `key=value` frontmatter query, **store-wide**: read
513 /// every type-folder `index.jsonl` sidecar and filter their records. The
514 /// unscoped pre-write dedup primitive; prefer [`Store::find_by_where_in`]
515 /// with a layer scope to stay O(entities-in-layer) on the interactive loop.
516 pub fn find_by_where(&self, key: &str, value: &str) -> Result<Vec<IndexRecord>, StoreError> {
517 self.find_by_where_in(key, value, None)
518 }
519
520 /// Candidate set for a `key=value` frontmatter query, **scoped to one
521 /// layer** when `layer` is `Some`: the sidecar walk is confined to that
522 /// layer's subtree (`<root>/<layer>/`), so the I/O is O(entities-in-layer),
523 /// not O(store records). `None` keeps the store-wide read.
524 ///
525 /// This is what makes `--in <layer>` an I/O scope, not just a result
526 /// filter: a `--where`-only query (no `--type`) used to read every sidecar
527 /// in the store and narrow by layer in memory, breaking the O(entities)
528 /// contract the interactive loop depends on. With a layer in hand we walk
529 /// only that layer's sidecars.
530 pub fn find_by_where_in(
531 &self,
532 key: &str,
533 value: &str,
534 layer: Option<Layer>,
535 ) -> Result<Vec<IndexRecord>, StoreError> {
536 // A `key=value` query can target any frontmatter field across any type,
537 // so within the chosen subtree we still read every type-folder sidecar
538 // and filter. The layer (when given) bounds *which* subtree, turning a
539 // whole-store walk into a single-layer walk.
540 let records = self.read_all_type_indexes_in(layer)?;
541 Ok(records
542 .into_iter()
543 .filter(|r| record_matches_field(r, key, value))
544 .collect())
545 }
546
547 /// Every record across the type-folder `index.jsonl` sidecars, scoped to one
548 /// layer when `layer` is `Some` (the walk is confined to `<root>/<layer>/`)
549 /// else store-wide. Sequential, complete sidecar reads — never a
550 /// walk-and-parse of the content tree.
551 ///
552 /// This is the unfiltered sidecar-enumeration primitive the relationship
553 /// loop sits on: [`crate::graph::backlinks_filtered`] uses it to bound its
554 /// candidate set to the relevant layer (or the whole store) without opening
555 /// the content tree, then confirms each candidate's edge by parsing the file.
556 pub fn sidecar_records(&self, layer: Option<Layer>) -> Result<Vec<IndexRecord>, StoreError> {
557 self.read_all_type_indexes_in(layer)
558 }
559
560 /// Parse a type-folder's `index.jsonl` into [`IndexRecord`]s, applying
561 /// last-write-wins by `path` over any un-compacted lines. The sidecar-read
562 /// primitive every structured query sits on.
563 pub fn read_type_index(&self, index_jsonl: &Path) -> Result<Vec<IndexRecord>, StoreError> {
564 let text = std::fs::read_to_string(index_jsonl).map_err(|e| StoreError::BadTypeIndex {
565 path: index_jsonl.to_path_buf(),
566 message: e.to_string(),
567 })?;
568
569 // Last-write-wins by `path` over un-compacted lines: a later line for
570 // the same path supersedes an earlier one (the jsonl is append-mostly
571 // and only compacted on rebuild). Blank lines are skipped; a non-blank
572 // line that is not a valid IndexRecord is a hard parse error.
573 let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
574 for (i, line) in text.lines().enumerate() {
575 let trimmed = line.trim();
576 if trimmed.is_empty() {
577 continue;
578 }
579 let record: IndexRecord =
580 serde_json::from_str(trimmed).map_err(|e| StoreError::BadTypeIndex {
581 path: index_jsonl.to_path_buf(),
582 message: format!("line {}: {e}", i + 1),
583 })?;
584 by_path.insert(record.path.clone(), record);
585 }
586 // BTreeMap keyed by path → records emerge sorted by path ascending,
587 // a deterministic order independent of line order in the file.
588 Ok(by_path.into_values().collect())
589 }
590
591 /// Resolve a store-relative path to its absolute on-disk path under
592 /// [`root`](Store::root).
593 pub fn abs_path(&self, store_relative: &Path) -> PathBuf {
594 // `Path::join` returns `store_relative` unchanged if it is already
595 // absolute, so passing an absolute path through is a no-op.
596 self.root.join(store_relative)
597 }
598
599 /// Convert an absolute path under the store into its store-relative form.
600 pub fn rel_path(&self, abs: &Path) -> Option<PathBuf> {
601 abs.strip_prefix(&self.root).ok().map(|p| p.to_path_buf())
602 }
603
604 // ── Private helpers ─────────────────────────────────────────────────────
605
606 /// Resolve a caller-supplied folder path (store-relative or absolute) to an
607 /// absolute path under the store root.
608 fn resolve_under_root(&self, folder: &Path) -> PathBuf {
609 if folder.is_absolute() {
610 folder.to_path_buf()
611 } else {
612 self.root.join(folder)
613 }
614 }
615
616 /// Walk a subtree for content `.md` files (skip hidden dirs, skip `index.md`
617 /// / `DB.md` / `log.md`), returning store-relative paths. Used by the layer
618 /// and type-folder walks.
619 fn walk_content_md(&self, root: &Path) -> Result<Vec<PathBuf>, StoreError> {
620 let mut out = Vec::new();
621 for entry in self.md_walker(root).build() {
622 let entry = entry.map_err(|e| StoreError::Search {
623 root: root.to_path_buf(),
624 message: e.to_string(),
625 })?;
626 if !is_file_entry(&entry) {
627 continue;
628 }
629 let path = entry.path();
630 if !has_md_extension(path) {
631 continue;
632 }
633 if is_non_content_basename(path) {
634 continue;
635 }
636 if let Some(rel) = self.rel_path(path) {
637 out.push(rel);
638 }
639 }
640 out.sort();
641 Ok(out)
642 }
643
644 /// Walk the whole store for **every** `.md` file (including `index.md`),
645 /// skipping hidden dirs and the `log/` archive tree. Used by the backlink
646 /// scan, where the literal link text can live in any markdown file.
647 fn walk_all_md(&self) -> Result<Vec<PathBuf>, StoreError> {
648 let mut out = Vec::new();
649 for entry in self.md_walker(&self.root).build() {
650 let entry = entry.map_err(|e| StoreError::Search {
651 root: self.root.clone(),
652 message: e.to_string(),
653 })?;
654 if !is_file_entry(&entry) {
655 continue;
656 }
657 let path = entry.path();
658 if !has_md_extension(path) {
659 continue;
660 }
661 if self.is_in_log_dir(path) {
662 continue;
663 }
664 if let Some(rel) = self.rel_path(path) {
665 out.push(rel);
666 }
667 }
668 out.sort();
669 Ok(out)
670 }
671
672 /// Read and merge every type-folder `index.jsonl` sidecar under `layer`
673 /// when given, else the whole store (skip hidden + `log/`). Each sidecar is
674 /// read with last-write-wins by path; across sidecars, paths are disjoint by
675 /// construction (one sidecar per folder), so a plain concatenation preserves
676 /// completeness. A layer scope confines the walk to `<root>/<layer>/`, which
677 /// is what keeps `find_by_where_in` O(entities-in-layer).
678 fn read_all_type_indexes_in(
679 &self,
680 layer: Option<Layer>,
681 ) -> Result<Vec<IndexRecord>, StoreError> {
682 let mut out = Vec::new();
683 for sidecar in self.find_type_index_files_in(layer)? {
684 out.extend(self.read_type_index(&self.abs_path(&sidecar))?);
685 }
686 Ok(out)
687 }
688
689 /// Locate every `index.jsonl` sidecar under `layer` (when given) else the
690 /// whole store (skip hidden + `log/`), returning store-relative paths. The
691 /// walk root is `<root>/<layer>/` for a scoped read and `self.root` for the
692 /// store-wide read; a non-existent layer subtree yields no sidecars rather
693 /// than walking a missing path.
694 fn find_type_index_files_in(&self, layer: Option<Layer>) -> Result<Vec<PathBuf>, StoreError> {
695 let walk_root = match layer {
696 Some(l) => self.root.join(l.dir_name()),
697 None => self.root.clone(),
698 };
699 // A scoped walk over a layer folder that does not exist yet must be an
700 // empty result, mirroring `walk_layer`'s missing-dir guard — not a walk
701 // error from `ignore` over a nonexistent path.
702 if !walk_root.is_dir() {
703 return Ok(Vec::new());
704 }
705 let mut out = Vec::new();
706 let mut builder = WalkBuilder::new(&walk_root);
707 builder.standard_filters(false).hidden(true);
708 for entry in builder.build() {
709 let entry = entry.map_err(|e| StoreError::Search {
710 root: walk_root.clone(),
711 message: e.to_string(),
712 })?;
713 if !is_file_entry(&entry) {
714 continue;
715 }
716 let path = entry.path();
717 if path.file_name().and_then(|n| n.to_str()) != Some(TYPE_INDEX_FILE) {
718 continue;
719 }
720 if self.is_in_log_dir(path) {
721 continue;
722 }
723 if let Some(rel) = self.rel_path(path) {
724 out.push(rel);
725 }
726 }
727 out.sort();
728 Ok(out)
729 }
730
731 /// A `WalkBuilder` configured for db.md SWEEPs: gitignore/global-ignore are
732 /// OFF (a SWEEP must see every file even if the store is a git repo with a
733 /// `.gitignore`), but hidden files/dirs are skipped.
734 fn md_walker(&self, root: &Path) -> WalkBuilder {
735 let mut builder = WalkBuilder::new(root);
736 builder.standard_filters(false).hidden(true);
737 builder
738 }
739
740 /// True if an absolute path lives under the store's root-level `log/`
741 /// rotation-archive directory.
742 fn is_in_log_dir(&self, abs: &Path) -> bool {
743 match self.rel_path(abs) {
744 Some(rel) => rel.components().next().map(|c| c.as_os_str()) == Some("log".as_ref()),
745 None => false,
746 }
747 }
748
749 /// Read a file's frontmatter `updated` field as an RFC3339 timestamp,
750 /// returning `None` when absent/unparseable. A self-contained reader (does
751 /// not depend on the not-yet-implemented `parser::read_file`); parses the
752 /// leading `---`-fenced YAML block with the same engine the parser uses.
753 fn read_updated(&self, abs: &Path) -> Option<DateTime<FixedOffset>> {
754 let text = std::fs::read_to_string(abs).ok()?;
755 let yaml = frontmatter_block(&text)?;
756 let value: serde_norway::Value = serde_norway::from_str(yaml).ok()?;
757 let raw = value.get("updated")?;
758 value_to_datetime(raw)
759 }
760
761 /// The `<YYYY>/<MM>` shard segment for a sharding type, from its primary
762 /// date field with a `created` fallback. Reads the public `Frontmatter`
763 /// fields directly. `None` when no usable date is present.
764 fn primary_shard_segment(&self, type_: &str, fm: &Frontmatter) -> Option<(String, String)> {
765 // Try the type's primary date field first.
766 if let Some(field) = primary_date_field(type_) {
767 if let Some(v) = fm.extra.get(field) {
768 if let Some(seg) = value_to_year_month(v) {
769 return Some(seg);
770 }
771 }
772 }
773 // Universal fallback: the typed `created` timestamp.
774 fm.created
775 .map(|dt| (format!("{:04}", dt.year()), format!("{:02}", dt.month())))
776 }
777}
778
779// ── Free helpers (no `self`) ────────────────────────────────────────────────
780
781/// True if a walk entry is a regular file (not a dir / symlink-to-dir).
782fn is_file_entry(entry: &ignore::DirEntry) -> bool {
783 entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
784}
785
/// Whether `path` carries the exact lowercase `md` extension.
///
/// The comparison is case-sensitive (`note.MD` does not qualify) because
/// db.md files are always lowercase `.md`.
fn has_md_extension(path: &Path) -> bool {
    matches!(path.extension().and_then(|ext| ext.to_str()), Some("md"))
}
791
792/// True if the basename is a non-content meta file (`DB.md`, `index.md`,
793/// `log.md`) that the content walks must skip.
794fn is_non_content_basename(path: &Path) -> bool {
795 match path.file_name().and_then(|n| n.to_str()) {
796 Some(name) => NON_CONTENT_BASENAMES.contains(&name),
797 None => false,
798 }
799}
800
/// Return `name` with a `.md` suffix guaranteed.
///
/// A name that already ends in `.md` (case-sensitive) is returned unchanged;
/// anything else gets `.md` appended.
fn ensure_md_extension(name: &str) -> String {
    const SUFFIX: &str = ".md";
    let mut file_name = String::with_capacity(name.len() + SUFFIX.len());
    file_name.push_str(name);
    if !name.ends_with(SUFFIX) {
        file_name.push_str(SUFFIX);
    }
    file_name
}
809
/// Turn a store-relative path into a wiki-link target string.
///
/// Only the normal (named) components are kept and they are joined with `/`,
/// so the result never contains `\`, never starts with `./`, and drops root /
/// `..` markers. Components that are not valid UTF-8 are omitted. One trailing
/// `.md` is removed from the joined string.
fn path_to_link_str(target: &Path) -> String {
    let mut link = target
        .components()
        .filter_map(|comp| match comp {
            std::path::Component::Normal(os) => os.to_str(),
            _ => None,
        })
        .collect::<Vec<&str>>()
        .join("/");
    if link.ends_with(".md") {
        link.truncate(link.len() - ".md".len());
    }
    link
}
827
/// Map a `type_` to its canonical default folder, following the SPEC type table
/// (`email → sources/emails`, `expense → records/expenses`, …).
///
/// A type that is not in the table falls back to `records/<type>`: the bare
/// type name, with no attempt at pluralization.
///
/// `wiki-page` deliberately maps to `wiki/topics` rather than to `wiki/`
/// itself. The SPEC files wiki pages under `wiki/<topic>/`, and the
/// index/validate `type_folder_of` helpers require a 3-component
/// `<layer>/<type-folder>/<file>` path; a flat `wiki/<file>` would either break
/// write-through (`on_write` would try to create `index.md` inside a file) or
/// be dropped from every catalog by `rebuild_all`. With only the bare type in
/// hand, `wiki/topics` is the deterministic default.
fn default_type_folder(type_: &str) -> PathBuf {
    // (type, folder) pairs, grouped by layer: sources, record entities,
    // record events, wiki.
    const CANONICAL_FOLDERS: &[(&str, &str)] = &[
        ("email", "sources/emails"),
        ("transcript", "sources/transcripts"),
        ("pdf-source", "sources/docs"),
        ("contact", "records/contacts"),
        ("company", "records/companies"),
        ("expense", "records/expenses"),
        ("meeting", "records/meetings"),
        ("decision", "records/decisions"),
        ("invoice", "records/invoices"),
        ("wiki-page", "wiki/topics"),
    ];

    CANONICAL_FOLDERS
        .iter()
        .find(|(name, _)| *name == type_)
        .map(|&(_, folder)| PathBuf::from(folder))
        // NOTE(review): `PathBuf::join` replaces the base when its argument is
        // absolute, so an absolute or `..`-bearing `type_` would leave
        // `records/` — confirm callers validate the type name first.
        .unwrap_or_else(|| PathBuf::from("records").join(type_))
}
861
862/// The canonical [`Layer`] a `type_` belongs to, derived from its default
863/// type-folder (`email` → `Sources`, `contact` → `Records`, `wiki-page` →
864/// `Wiki`, unrecognized → `Records`). The write path uses this to decide whether
865/// an agent-supplied folder is in the *right* layer for the type before honouring
866/// its sub-folder choice.
867pub fn layer_for_type(type_: &str) -> Layer {
868 layer_of_folder(&default_type_folder(type_)).unwrap_or(Layer::Records)
869}
870
871/// The [`Layer`] a type-folder path lives in, read from its first component
872/// (`sources/` → `Sources`, `records/` → `Records`, `wiki/` → `Wiki`). Used to
873/// bound [`Store::find_by_type`]'s whole-layer sidecar read to a single layer
874/// subtree. Returns `None` for a path with no recognized layer prefix; every
875/// value [`default_type_folder`] produces has one, so in practice this is
876/// always `Some` on the call path — `None` degrades to a store-wide read.
877fn layer_of_folder(folder: &Path) -> Option<Layer> {
878 let first = folder.components().next()?.as_os_str().to_str()?;
879 Layer::from_dir_name(first)
880}
881
/// Infer a content file's canonical `type` from its store-relative path — the
/// inverse of [`default_type_folder`] and the single source of truth for
/// path→type inference (the CLI's `fm init` calls this, never re-derives it).
///
/// # Shape requirements
///
/// The path must have the canonical `<layer>/<type-folder>/<file…>` shape:
/// at least three named components, the first being `sources`, `records` or
/// `wiki`. A leading `./` is tolerated and ignored. `None` is returned when:
///
/// * the path is shorter than three components (a file directly under a layer);
/// * the leading component is not a known layer;
/// * any of the first three positions is a root, prefix or `..` component
///   (such a path is not a plain store-relative path, and treating `..` as a
///   folder name would infer a nonsense type);
/// * the layer or type-folder component is not valid UTF-8 (skipping it would
///   shift later components into the wrong position).
///
/// # Mapping
///
/// Recognized `(layer, folder)` pairs map back to their canonical type. Any
/// folder under `wiki/` infers `wiki-page`, since every wiki page is filed
/// under a topic folder. For any other unrecognized folder the result is the
/// **bare folder name verbatim** (no pluralization/singularization) so it
/// round-trips with `default_type_folder`, whose unrecognized fallback is the
/// bare type name (`task` ⇄ `records/task`).
pub fn infer_type_from_path(rel: &Path) -> Option<String> {
    /// The UTF-8 name of a normal path component; `None` for anything else.
    fn utf8_name<'a>(comp: std::path::Component<'a>) -> Option<&'a str> {
        match comp {
            std::path::Component::Normal(os) => os.to_str(),
            _ => None,
        }
    }

    // `Path::components` only ever yields `.` at the very start of a path, so
    // skipping leading `CurDir` entries is enough to accept `./records/...`.
    let mut comps = rel
        .components()
        .skip_while(|comp| matches!(comp, std::path::Component::CurDir));

    let layer = utf8_name(comps.next()?)?;
    if !matches!(layer, "sources" | "records" | "wiki") {
        return None;
    }
    let folder = utf8_name(comps.next()?)?;
    // A third named component must exist: the file itself, or a shard
    // directory above it. Its name is irrelevant, so it need not be UTF-8.
    if !matches!(comps.next()?, std::path::Component::Normal(_)) {
        return None;
    }

    let mapped = match (layer, folder) {
        ("sources", "emails") => "email",
        ("sources", "transcripts") => "transcript",
        ("sources", "docs") => "pdf-source",
        ("records", "contacts") => "contact",
        ("records", "companies") => "company",
        ("records", "expenses") => "expense",
        ("records", "meetings") => "meeting",
        ("records", "decisions") => "decision",
        ("records", "invoices") => "invoice",
        // Every wiki page is filed under `wiki/<topic>/`; the type is always
        // `wiki-page` regardless of the topic-folder name.
        ("wiki", _) => "wiki-page",
        // Unrecognized folder: the bare name, verbatim. This is the inverse of
        // `default_type_folder`'s unrecognized fallback (`other → records/other`)
        // and the round-trip would break if we pluralized/singularized here.
        (_, other) => other,
    };
    Some(mapped.to_string())
}
928
/// Name of the frontmatter field whose value drives the `<YYYY>/<MM>` shard
/// for a sharding type.
///
/// `None` means the type has no canonical date field here and the caller
/// should rely on the `created` fallback only; that covers custom event types
/// as well as every type not listed below.
fn primary_date_field(type_: &str) -> Option<&'static str> {
    let field = match type_ {
        "email" | "expense" | "invoice" | "meeting" => "date",
        "transcript" => "recorded_at",
        "pdf-source" => "received_at",
        _ => return None,
    };
    Some(field)
}
942
943/// Parse a YAML value into an RFC3339 [`DateTime`], accepting both an explicit
944/// string and a YAML-native scalar rendered to string.
945fn value_to_datetime(value: &serde_norway::Value) -> Option<DateTime<FixedOffset>> {
946 let s = yaml_scalar_string(value)?;
947 DateTime::parse_from_rfc3339(s.trim()).ok()
948}
949
950/// Extract `(YYYY, MM)` from a YAML date/timestamp value. Lenient: matches a
951/// leading `YYYY-MM` so a bare `2026-05-22` date and a full
952/// `2026-05-22T10:00:00-07:00` timestamp both work.
953fn value_to_year_month(value: &serde_norway::Value) -> Option<(String, String)> {
954 let s = yaml_scalar_string(value)?;
955 year_month_from_str(s.trim())
956}
957
/// Split the leading `YYYY-MM` of a date string into `(YYYY, MM)`.
///
/// The first seven bytes must be four ASCII digits, a `-`, and two ASCII
/// digits forming a month in `01..=12`; whatever follows is ignored. Parsed by
/// hand so the write path never pays for a regex compile.
fn year_month_from_str(s: &str) -> Option<(String, String)> {
    // Fewer than seven bytes cannot hold `YYYY-MM`.
    let head = s.as_bytes().get(..7)?;

    let well_formed = head.iter().enumerate().all(|(idx, &byte)| {
        if idx == 4 {
            byte == b'-'
        } else {
            byte.is_ascii_digit()
        }
    });
    if !well_formed {
        return None;
    }

    let month = (head[5] - b'0') * 10 + (head[6] - b'0');
    if month == 0 || month > 12 {
        return None;
    }

    // The first seven bytes are all ASCII, so these are valid char boundaries.
    Some((s[..4].to_string(), s[5..7].to_string()))
}
983
984/// Render a YAML scalar as a string: a real `String` verbatim, otherwise the
985/// value's compact YAML serialization (covers timestamps that the YAML engine
986/// may surface as a non-string scalar).
987fn yaml_scalar_string(value: &serde_norway::Value) -> Option<String> {
988 if let Some(s) = value.as_str() {
989 return Some(s.to_string());
990 }
991 match value {
992 serde_norway::Value::Null => None,
993 serde_norway::Value::Mapping(_) | serde_norway::Value::Sequence(_) => None,
994 other => serde_norway::to_string(other)
995 .ok()
996 .map(|s| s.trim().to_string()),
997 }
998}
999
1000/// The YAML frontmatter block of a file: the text between a leading `---` fence
1001/// and the next `---` fence, exclusive. `None` if the file does not open with a
1002/// `---` fence on its first line.
1003fn frontmatter_block(text: &str) -> Option<&str> {
1004 // Tolerate a UTF-8 BOM and CRLF, but the fence must be the very first line.
1005 let body = text.strip_prefix('\u{feff}').unwrap_or(text);
1006 let mut rest = body;
1007 // First line must be exactly `---` (allowing trailing CR).
1008 let (first, after_first) = split_first_line(rest);
1009 if first.trim_end_matches('\r') != "---" {
1010 return None;
1011 }
1012 rest = after_first;
1013 let block_start = rest;
1014 let mut scanned = 0usize;
1015 loop {
1016 let (line, after) = split_first_line(rest);
1017 if line.trim_end_matches('\r') == "---" {
1018 return Some(&block_start[..scanned]);
1019 }
1020 if after.is_empty() && line.is_empty() {
1021 // Reached end of input without a closing fence.
1022 return None;
1023 }
1024 scanned += line.len() + 1; // +1 for the consumed '\n'
1025 if after.is_empty() {
1026 return None;
1027 }
1028 rest = after;
1029 }
1030}
1031
/// Cut `s` at its first `\n`.
///
/// Returns the text before the newline and the text after it; the newline
/// itself belongs to neither half. When `s` contains no newline the whole
/// string is the line and the remainder is empty.
fn split_first_line(s: &str) -> (&str, &str) {
    s.split_once('\n').unwrap_or((s, ""))
}
1041
1042/// True if an [`IndexRecord`] has a field `key` equal to `value`, checking the
1043/// typed columns first and then the flattened `fields` map.
1044fn record_matches_field(record: &IndexRecord, key: &str, value: &str) -> bool {
1045 match key {
1046 "type" => record.type_ == value,
1047 "summary" => record.summary == value,
1048 "path" => record.path.to_string_lossy() == value,
1049 "created" => timestamp_matches(record.created, value),
1050 "updated" => timestamp_matches(record.updated, value),
1051 "tags" => record.tags.iter().any(|t| t == value),
1052 "links" => record.links.iter().any(|l| l == value),
1053 other => record
1054 .fields
1055 .get(other)
1056 .map(|v| json_value_matches(v, value))
1057 .unwrap_or(false),
1058 }
1059}
1060
1061/// Compare a record's `created`/`updated` instant against a query `value`.
1062///
1063/// db.md files write timestamps in several equivalent RFC3339 spellings — most
1064/// commonly the `Z` UTC designator (`2026-05-01T00:00:00Z`) but also an explicit
1065/// offset (`...+00:00`, `...-07:00`). A naive `record.created.to_rfc3339() ==
1066/// value` reformats only one side: chrono renders a UTC instant as `+00:00`, so
1067/// the `Z` form an agent reads straight out of the file would never match. We
1068/// instead parse `value` as RFC3339 and compare instants, where `Z` and `+00:00`
1069/// (and any same-instant offset) are equal. A `value` that is not valid RFC3339
1070/// can never equal a real timestamp, so it falls through to `false`.
1071fn timestamp_matches(stored: Option<DateTime<FixedOffset>>, value: &str) -> bool {
1072 match (stored, DateTime::parse_from_rfc3339(value)) {
1073 (Some(stored), Ok(queried)) => stored == queried,
1074 _ => false,
1075 }
1076}
1077
1078/// Compare a JSON field value against a query string. A string matches
1079/// verbatim; scalars match their textual form; an array matches if any element
1080/// matches (so a list-valued frontmatter field is membership-queried).
1081fn json_value_matches(v: &serde_json::Value, value: &str) -> bool {
1082 match v {
1083 serde_json::Value::String(s) => s == value,
1084 serde_json::Value::Bool(b) => b.to_string() == value,
1085 serde_json::Value::Number(n) => n.to_string() == value,
1086 serde_json::Value::Array(items) => items.iter().any(|i| json_value_matches(i, value)),
1087 // A present-but-null field never matches — consistent with the in-memory
1088 // post-filter (`query::json_value_matches`, which the first `where`
1089 // clause is NOT re-checked against, so the two must agree here or a
1090 // `--where field=` query would return different rows than `--type X
1091 // --where field=`).
1092 serde_json::Value::Null => false,
1093 serde_json::Value::Object(_) => false,
1094 }
1095}
1096
1097#[cfg(test)]
1098mod tests {
1099 use super::*;
1100 use std::fs;
1101 use tempfile::{tempdir, TempDir};
1102
1103 // ── Fixtures ────────────────────────────────────────────────────────────
1104
/// Fixture writer: put `contents` at `<root>/<rel>`, creating any missing
/// parent directories, and hand back `rel` as a `PathBuf` for assertions.
/// Panics on any I/O failure.
fn write(root: &Path, rel: &str, contents: &str) -> PathBuf {
    let target = root.join(rel);
    let parent = target.parent().unwrap();
    fs::create_dir_all(parent).unwrap();
    fs::write(&target, contents).unwrap();
    PathBuf::from(rel)
}
1113
/// Render a minimal `note` content file whose `created` and `updated`
/// frontmatter fields both carry the given timestamp text.
fn content_md(updated: &str) -> String {
    let mut doc = String::from("---\ntype: note\n");
    doc.push_str(&format!("created: {updated}\n"));
    doc.push_str(&format!("updated: {updated}\n"));
    doc.push_str("summary: a note\n---\n\nbody\n");
    doc
}
1120
1121 /// A bare directory with a `DB.md` marker (valid `db-md` frontmatter so the
1122 /// real parser is exercised).
1123 fn empty_store() -> TempDir {
1124 let dir = tempdir().unwrap();
1125 fs::write(
1126 dir.path().join("DB.md"),
1127 "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n",
1128 )
1129 .unwrap();
1130 dir
1131 }
1132
1133 /// Open a store rooted at a TempDir; panics if `open` rejects it.
1134 fn open(dir: &TempDir) -> Store {
1135 Store::open(dir.path()).expect("fixture should be a valid store")
1136 }
1137
/// Render paths as strings with `/` separators so assertions read the same on
/// every platform.
fn rels(paths: &[PathBuf]) -> Vec<String> {
    let mut rendered = Vec::with_capacity(paths.len());
    for path in paths {
        rendered.push(path.to_string_lossy().replace('\\', "/"));
    }
    rendered
}
1144
1145 // ── Layer ───────────────────────────────────────────────────────────────
1146
#[test]
fn layer_dir_name_and_parse_are_inverse() {
    // Each layer has a fixed directory name...
    assert_eq!(Layer::Sources.dir_name(), "sources");
    assert_eq!(Layer::Records.dir_name(), "records");
    assert_eq!(Layer::Wiki.dir_name(), "wiki");

    // ...and parsing that name gives the layer back.
    for layer in Layer::all() {
        let name = layer.dir_name();
        assert_eq!(Layer::from_dir_name(name), Some(layer));
    }

    // Non-layer names are rejected; matching is case-sensitive.
    for rejected in ["log", "Sources"] {
        assert_eq!(Layer::from_dir_name(rejected), None);
    }
}
1158
#[test]
fn layer_order_is_canonical() {
    // `stats` keys a BTreeMap on Layer, so `Ord` must give
    // sources < records < wiki.
    let canonical = [Layer::Sources, Layer::Records, Layer::Wiki];
    let mut shuffled = [Layer::Wiki, Layer::Sources, Layer::Records];
    shuffled.sort();
    assert_eq!(shuffled, canonical);
}
1166
1167 // ── is_db_md_store / open ────────────────────────────────────────────────
1168
#[test]
fn is_store_true_only_with_uppercase_marker() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // Before the marker exists.
    let before = Store::is_db_md_store(root);
    assert!(!before, "no marker → not a store");

    // After writing the uppercase marker.
    fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n").unwrap();
    let after = Store::is_db_md_store(root);
    assert!(after, "uppercase DB.md → store");
}
1180
#[test]
fn is_store_false_for_lowercase_db_md() {
    // Case-sensitivity contract: lowercase `db.md` is the spec's name, not the
    // store marker. That must hold even on a case-insensitive filesystem
    // (macOS), where a plain `Path::exists` probe would report the wrong
    // answer.
    let dir = tempdir().unwrap();
    let root = dir.path();
    fs::write(root.join("db.md"), "---\ntype: db-md\n---\n").unwrap();

    let recognized = Store::is_db_md_store(root);
    assert!(
        !recognized,
        "lowercase db.md must NOT be treated as a store marker"
    );
    assert!(Store::open(root).is_err());
}
1194
#[test]
fn is_store_false_when_db_md_is_a_directory() {
    let dir = tempdir().unwrap();
    let root = dir.path();
    // The marker must be a file; a directory with the same name does not count.
    fs::create_dir(root.join("DB.md")).unwrap();

    let recognized = Store::is_db_md_store(root);
    assert!(
        !recognized,
        "a directory named DB.md is not the file marker"
    );
}
1204
#[test]
fn open_rejects_non_store_with_path() {
    // An empty directory has no marker; the error must name the probed path.
    let dir = tempdir().unwrap();
    let root = dir.path();
    let rejection = Store::open(root).unwrap_err();
    assert_eq!(rejection.path, root);
}
1211
#[test]
fn open_succeeds_and_parses_config() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // The `## Policies` section declares a frozen page. Seeing it on the
    // opened store proves `open()` parsed DB.md instead of using a default.
    let db_md = concat!(
        "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n\n",
        "## Policies\n\n### Frozen pages\n- records/decisions/q1.md\n",
    );
    fs::write(root.join("DB.md"), db_md).unwrap();

    let store = Store::open(root).unwrap();
    assert_eq!(store.root, root);

    let frozen_page = Path::new("records/decisions/q1.md");
    let surfaced = store
        .config
        .frozen_pages
        .iter()
        .any(|page| page == frozen_page);
    assert!(
        surfaced,
        "open() must surface DB.md ## Policies, got {:?}",
        store.config.frozen_pages
    );
}
1235
1236 // ── walk / walk_layer / walk_type_folder ─────────────────────────────────
1237
#[test]
fn walk_collects_content_across_layers_skipping_meta_and_log() {
    let dir = empty_store();
    let root = dir.path();

    // Real content: one file in each layer.
    let content = [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("records/contacts/sarah.md", "2026-05-02T00:00:00Z"),
        ("wiki/people/sarah.md", "2026-05-03T00:00:00Z"),
    ];
    for (rel, stamp) in content {
        write(root, rel, &content_md(stamp));
    }

    // Everything from here on must be SKIPPED by walk().
    let catalog = "---\ntype: index\n---\n";
    let log = "---\ntype: log\n---\n";
    write(root, "sources/emails/index.md", catalog); // type-folder catalog
    write(root, "index.md", catalog); // root catalog
    write(root, "log.md", log); // live log
    write(root, "log/2026-04.md", log); // rotated log archive
    write(
        root,
        "sources/.hidden/secret.md",
        &content_md("2026-05-09T00:00:00Z"),
    ); // hidden directory
    write(root, "records/contacts/notes.txt", "not markdown"); // not .md

    let store = open(&dir);
    let found = rels(&store.walk().unwrap());
    let expected: Vec<String> = [
        "records/contacts/sarah.md",
        "sources/emails/2026/05/a.md",
        "wiki/people/sarah.md",
    ]
    .iter()
    .map(|rel| rel.to_string())
    .collect();
    assert_eq!(found, expected);
}
1280
#[test]
fn walk_includes_content_named_log_md_or_db_md_inside_a_layer() {
    let dir = empty_store();
    let root = dir.path();

    // `log.md` / `DB.md` are reserved names only at the store root. Inside a
    // layer, a file with one of those names is ordinary content.
    let log_like = "records/configs/log.md";
    let marker_like = "sources/docs/DB.md";
    write(root, log_like, &content_md("2026-05-01T00:00:00Z"));
    write(root, marker_like, &content_md("2026-05-02T00:00:00Z"));
    // The derived catalog, by contrast, is skipped at any depth.
    write(root, "records/configs/index.md", "---\ntype: index\n---\n");

    let store = open(&dir);
    let got = rels(&store.walk().unwrap());

    let has_log_like = got.contains(&log_like.to_string());
    assert!(has_log_like, "layer-internal log.md is content: {got:?}");

    let has_marker_like = got.contains(&marker_like.to_string());
    assert!(has_marker_like, "layer-internal DB.md is content: {got:?}");

    let leaked_catalog = got.iter().any(|p| p.ends_with("index.md"));
    assert!(!leaked_catalog, "index.md is still skipped: {got:?}");
}
1314
#[test]
fn walk_layer_is_scoped() {
    let dir = empty_store();
    let root = dir.path();
    let email = "sources/emails/2026/05/a.md";
    let contact = "records/contacts/sarah.md";
    write(root, email, &content_md("2026-05-01T00:00:00Z"));
    write(root, contact, &content_md("2026-05-02T00:00:00Z"));
    let store = open(&dir);

    // Each layer walk returns only that layer's files.
    let sources = rels(&store.walk_layer(Layer::Sources).unwrap());
    assert_eq!(sources, vec![email.to_string()]);

    let records = rels(&store.walk_layer(Layer::Records).unwrap());
    assert_eq!(records, vec![contact.to_string()]);

    // No `wiki/` directory was created: that is an empty result, not an error.
    let wiki = store.walk_layer(Layer::Wiki).unwrap();
    assert!(wiki.is_empty());
}
1342
#[test]
fn walk_type_folder_recurses_shards_and_accepts_abs_or_rel() {
    let dir = empty_store();
    let root = dir.path();

    // Two emails in different month shards.
    let emails = [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("sources/emails/2026/06/b.md", "2026-06-01T00:00:00Z"),
    ];
    for (rel, stamp) in emails {
        write(root, rel, &content_md(stamp));
    }
    // The folder's catalog is skipped.
    write(root, "sources/emails/index.md", "---\ntype: index\n---\n");
    // A sibling type folder must not leak into the result.
    write(
        root,
        "sources/docs/2026/05/c.md",
        &content_md("2026-05-04T00:00:00Z"),
    );

    let store = open(&dir);
    let expected: Vec<String> = emails.iter().map(|(rel, _)| rel.to_string()).collect();

    // Store-relative folder argument.
    let via_relative = store.walk_type_folder(Path::new("sources/emails")).unwrap();
    assert_eq!(rels(&via_relative), expected);

    // An absolute folder under the store resolves to the same listing.
    let absolute_folder = root.join("sources/emails");
    let via_absolute = store.walk_type_folder(&absolute_folder).unwrap();
    assert_eq!(rels(&via_absolute), expected);
}
1385
1386 // ── recent_in_type_folder ────────────────────────────────────────────────
1387
#[test]
fn recent_orders_by_updated_desc_then_path_and_caps() {
    let dir = empty_store();
    let root = dir.path();

    // Listed in the order `recent_in_type_folder` is expected to return them:
    // newest first, ties on `updated` broken by ascending path, oldest last.
    let by_recency = [
        ("records/meetings/2026/05/c.md", "2026-05-03T00:00:00Z"), // newest
        ("records/meetings/2026/05/a.md", "2026-05-02T00:00:00Z"), // tie, path asc
        ("records/meetings/2026/05/b.md", "2026-05-02T00:00:00Z"),
        ("records/meetings/2026/04/z.md", "2026-04-01T00:00:00Z"), // oldest
    ];
    for (rel, stamp) in by_recency {
        write(root, rel, &content_md(stamp));
    }

    let store = open(&dir);
    let folder = Path::new("records/meetings");
    let expected: Vec<String> = by_recency.iter().map(|(rel, _)| rel.to_string()).collect();

    // A generous cap returns everything, fully ordered.
    let all = rels(&store.recent_in_type_folder(folder, 10).unwrap());
    assert_eq!(all, expected);

    // A tight cap keeps only the n most recent.
    let top2 = rels(&store.recent_in_type_folder(folder, 2).unwrap());
    assert_eq!(top2, expected[..2].to_vec());
}
1446
#[test]
fn recent_sorts_undated_files_last() {
    let dir = empty_store();
    let root = dir.path();
    let dated = "records/contacts/dated.md";
    let undated = "records/contacts/undated.md";
    write(root, dated, &content_md("2026-05-01T00:00:00Z"));
    // This file's frontmatter has no `updated` field at all.
    write(root, undated, "---\ntype: contact\nsummary: x\n---\nbody\n");

    let store = open(&dir);
    let ranked = store
        .recent_in_type_folder(Path::new("records/contacts"), 10)
        .unwrap();

    assert_eq!(
        rels(&ranked),
        vec![dated.to_string(), undated.to_string()],
        "a file with a real `updated` must outrank one with none"
    );
}
1477
1478 // ── type_shards ──────────────────────────────────────────────────────────
1479
#[test]
fn type_shards_classification() {
    // Types expected to be date-sharded.
    const SHARDING: [&str; 9] = [
        "email",
        "transcript",
        "pdf-source",
        "expense",
        "invoice",
        "meeting",
        "order",
        "ticket",
        "transaction",
    ];
    // Types expected to stay flat.
    const FLAT: [&str; 8] = [
        "contact",
        "company",
        "decision",
        "wiki-page",
        "index",
        "log",
        "db-md",
        "proposal",
    ];

    let dir = empty_store();
    let store = open(&dir);

    for t in SHARDING {
        assert!(store.type_shards(t), "{t} should shard");
    }
    for t in FLAT {
        assert!(!store.type_shards(t), "{t} should stay flat");
    }
}
1510
#[test]
fn type_shards_respects_schema_directive_both_directions() {
    use crate::parser::{Config, Schema};

    // A schema carrying an explicit shard directive.
    let with_shard = |directive: bool| Schema {
        shard: Some(directive),
        ..Schema::default()
    };

    let mut config = Config::default();
    // CUSTOM type (absent from the built-in list) opting IN to date-sharding;
    // without the override `type_shards` would say false.
    config
        .schemas
        .insert("shipment".to_string(), with_shard(true));
    // BUILT-IN event type opting OUT; the override beats the built-in default.
    config
        .schemas
        .insert("expense".to_string(), with_shard(false));
    // A schema with no `shard:` directive must leave the built-in default alone.
    config
        .schemas
        .insert("meeting".to_string(), Schema::default());

    let dir = empty_store();
    let mut store = open(&dir);
    store.config = config;

    let shipment = store.type_shards("shipment");
    assert!(shipment, "custom type with `shard: by-date` must shard");

    let expense = store.type_shards("expense");
    assert!(
        !expense,
        "built-in event type with `shard: flat` must go flat"
    );

    let meeting = store.type_shards("meeting");
    assert!(
        meeting,
        "schema without a `shard:` directive keeps the built-in default"
    );

    let contact = store.type_shards("contact");
    assert!(!contact, "unconfigured entity type stays flat");
}
1558
1559 // ── shard_path_for ───────────────────────────────────────────────────────
1560
1561 fn fm_with_extra(key: &str, value: &str) -> Frontmatter {
1562 let mut fm = Frontmatter::default();
1563 fm.extra.insert(
1564 key.to_string(),
1565 serde_norway::Value::String(value.to_string()),
1566 );
1567 fm
1568 }
1569
1570 fn fm_with_created(rfc3339: &str) -> Frontmatter {
1571 Frontmatter {
1572 created: Some(DateTime::parse_from_rfc3339(rfc3339).unwrap()),
1573 ..Default::default()
1574 }
1575 }
1576
#[test]
fn shard_path_uses_primary_date_field_per_type() {
    let dir = empty_store();
    let store = open(&dir);

    // (type, primary date field, field value, file name, expected path)
    let cases = [
        // expense.date → records/expenses/<YYYY>/<MM>/
        (
            "expense",
            "date",
            "2026-05-22",
            "lunch",
            "records/expenses/2026/05/lunch.md",
        ),
        // email.date → sources/emails/<YYYY>/<MM>/
        (
            "email",
            "date",
            "2026-11-02T09:00:00-07:00",
            "e1",
            "sources/emails/2026/11/e1.md",
        ),
        // transcript.recorded_at → sources/transcripts/<YYYY>/<MM>/
        (
            "transcript",
            "recorded_at",
            "2025-01-15T12:00:00Z",
            "t1",
            "sources/transcripts/2025/01/t1.md",
        ),
    ];

    for (type_, field, raw, name, expected) in cases {
        let frontmatter = fm_with_extra(field, raw);
        let computed = store.shard_path_for(type_, &frontmatter, name).unwrap();
        assert_eq!(computed, PathBuf::from(expected));
    }
}
1608
#[test]
fn shard_path_falls_back_to_created() {
    let dir = empty_store();
    let store = open(&dir);

    // A meeting without its primary `date` field: `created` supplies the shard.
    let frontmatter = fm_with_created("2024-07-09T08:30:00-04:00");
    let computed = store
        .shard_path_for("meeting", &frontmatter, "sync")
        .unwrap();

    assert_eq!(computed, PathBuf::from("records/meetings/2024/07/sync.md"));
}
1623
#[test]
fn shard_path_primary_field_wins_over_created() {
    let dir = empty_store();
    let store = open(&dir);

    // Both dates present and pointing at different months.
    let mut frontmatter = fm_with_created("2020-01-01T00:00:00Z");
    let primary_date = serde_norway::Value::String("2026-05-22".to_string());
    frontmatter.extra.insert("date".to_string(), primary_date);

    let computed = store.shard_path_for("expense", &frontmatter, "x").unwrap();

    // The shard follows the primary `date` (2026/05), not `created` (2020/01).
    assert_eq!(computed, PathBuf::from("records/expenses/2026/05/x.md"));
}
1637
#[test]
fn shard_path_flat_types_have_no_shard_segment() {
    let dir = empty_store();
    let store = open(&dir);

    // Contacts stay flat even when a `created` date is available.
    let dated = fm_with_created("2026-05-22T00:00:00Z");
    let contact_path = store
        .shard_path_for("contact", &dated, "sarah-chen")
        .unwrap();
    assert_eq!(
        contact_path,
        PathBuf::from("records/contacts/sarah-chen.md")
    );

    // A wiki-page gets no date shard either, but it must still sit inside a
    // type-folder: `wiki/topics/<name>.md`, never `wiki/<name>.md`. A
    // 2-component path is invisible to the index/validate type-folder model.
    let undated = Frontmatter::default();
    let wiki_path = store
        .shard_path_for("wiki-page", &undated, "renewal-theme")
        .unwrap();
    assert_eq!(wiki_path, PathBuf::from("wiki/topics/renewal-theme.md"));
}
1660
/// Regression: the path the toolkit computes for a wiki-page must be one the
/// index + validate type-folder model accepts.
///
/// `shard_path_for("wiki-page", …)` used to return a 2-component
/// `wiki/<file>` path. `type_folder_of` (in both `index` and `validate`) reads
/// that as "no type-folder", so the page either crashed `Index::on_write` (it
/// tried to create `index.md` inside a file) or was silently dropped from
/// every catalog by `Index::rebuild_all`. The computed path must therefore
/// have exactly the shape `<layer>/<type-folder>/<file>`.
#[test]
fn shard_path_wiki_page_is_indexable_three_component_path() {
    let dir = empty_store();
    let store = open(&dir);
    let p = store
        .shard_path_for("wiki-page", &Frontmatter::default(), "renewal-theme")
        .unwrap();

    // Layer, then a non-empty type-folder segment, then the file — the shape
    // `type_folder_of` requires (`comps.len() >= 3`, `comps[0]` a known layer).
    let comps = p
        .components()
        .filter_map(|comp| comp.as_os_str().to_str())
        .collect::<Vec<&str>>();
    assert_eq!(
        comps.len(),
        3,
        "wiki-page path must be <layer>/<type-folder>/<file>, got {p:?}"
    );

    let (layer, type_folder, file) = (comps[0], comps[1], comps[2]);
    assert_eq!(layer, "wiki", "first component must be the wiki layer");
    assert!(
        !type_folder.is_empty() && type_folder != "renewal-theme.md",
        "second component must be a real type-folder, not the file: {p:?}"
    );
    assert!(
        file.ends_with(".md"),
        "third component must be the .md file: {p:?}"
    );
}
1695
/// A name that already ends in `.md` is kept as-is and a bare name gains the
/// extension: both spellings resolve to the same path.
#[test]
fn shard_path_preserves_and_adds_md_extension() {
    let tmp = empty_store();
    let store = open(&tmp);

    // Resolve both spellings first, then compare each against the single
    // expected location.
    let [with_ext, without_ext] = ["sarah.md", "sarah"].map(|name| {
        store
            .shard_path_for("contact", &Frontmatter::default(), name)
            .unwrap()
    });

    let expected = PathBuf::from("records/contacts/sarah.md");
    assert_eq!(with_ext, expected);
    assert_eq!(without_ext, expected);
}
1709
/// A sharding type with neither `date` nor `created` cannot be placed, so
/// `shard_path_for` must fail with `NoShardDate` naming the unsharded path.
#[test]
fn shard_path_errors_when_sharding_type_has_no_date() {
    let tmp = empty_store();
    let store = open(&tmp);

    // `expense` shards by date; default frontmatter has no date to shard on.
    let err = store
        .shard_path_for("expense", &Frontmatter::default(), "mystery")
        .unwrap_err();

    if let StoreError::NoShardDate { file } = &err {
        assert_eq!(*file, PathBuf::from("records/expenses/mystery.md"));
    } else {
        panic!("expected NoShardDate, got {err:?}");
    }
}
1725
1726 // ── find_links_to ────────────────────────────────────────────────────────
1727
/// `find_links_to` must report every file that links to the target in any
/// accepted spelling, and nothing that only resembles such a link.
#[test]
fn find_links_to_matches_all_accepted_spellings() {
    let tmp = empty_store();
    let root = tmp.path();
    let target = "records/contacts/sarah-chen";

    // (store-relative path, contents) pairs, written in this order.
    let fixtures: [(&str, String); 7] = [
        // Plain `[[target]]`.
        (
            "wiki/people/sarah.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\nSee [[{target}]].\n"),
        ),
        // `[[target|display text]]`.
        (
            "records/meetings/2026/05/m.md",
            format!("---\ntype: meeting\nsummary: s\n---\nWith [[{target}|Sarah]].\n"),
        ),
        // `[[target.md]]` — accepted here; validate warns about it.
        (
            "wiki/themes/t.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}.md]]\n"),
        ),
        // A catalog file carrying the link literally is reported too.
        (
            "records/contacts/index.md",
            format!("---\ntype: index\n---\n- [[{target}]] — Sarah\n"),
        ),
        // Contains no link at all.
        (
            "wiki/people/elena.md",
            "---\ntype: wiki-page\nsummary: s\n---\nNo links here.\n".to_string(),
        ),
        // Short-form link: must NOT count as a link to the full-path target.
        (
            "wiki/people/bob.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[sarah-chen]]\n".to_string(),
        ),
        // Boundary check: `sarah-chen-jr` only starts with the target and
        // must NOT match it.
        (
            "wiki/people/jr.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}-jr]]\n"),
        ),
    ];
    for (rel, body) in fixtures {
        write(root, rel, &body);
    }

    let store = open(&tmp);
    let linkers = store.find_links_to(Path::new(target)).unwrap();

    let expected: Vec<String> = [
        "records/contacts/index.md",
        "records/meetings/2026/05/m.md",
        "wiki/people/sarah.md",
        "wiki/themes/t.md",
    ]
    .into_iter()
    .map(String::from)
    .collect();
    assert_eq!(rels(&linkers), expected);
}
1790
/// Two targets whose paths share a prefix (`sarah` / `sarah-chen`) must each
/// report only their own linker.
#[test]
fn find_links_to_distinguishes_sibling_paths() {
    let tmp = empty_store();
    let root = tmp.path();

    // One linker per contact.
    let fixtures = [
        (
            "wiki/a.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah]]\n",
        ),
        (
            "wiki/b.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
    ];
    for (rel, body) in fixtures {
        write(root, rel, body);
    }
    let store = open(&tmp);

    // (target, the only file allowed to show up as its linker)
    let expectations = [
        ("records/contacts/sarah", "wiki/a.md"),
        ("records/contacts/sarah-chen", "wiki/b.md"),
    ];
    for (target, sole_linker) in expectations {
        let found = store.find_links_to(Path::new(target)).unwrap();
        assert_eq!(rels(&found), vec![sole_linker.to_string()]);
    }
}
1826
/// Regression: the backlink scan once used the `UTF8` sink, which ran
/// `std::str::from_utf8` on every matched line and returned an `io::Error`
/// when a `.md` file carried a stray non-UTF-8 byte on the SAME line as a
/// `[[target]]` link. That error propagated out and aborted the WHOLE store
/// scan — `find_links_to` / `find_links_to_any` (and `graph backlinks` + the
/// working-set validate incoming-linker pass) returned an error instead of the
/// legitimate UTF-8 linkers. Verbatim-ingested `sources/` artifacts can carry
/// such bytes, so this is reachable. With the `Lossy` sink the scan must still
/// report the link.
#[test]
fn regression_find_links_to_tolerates_invalid_utf8_on_a_matched_line() {
    let dir = empty_store();
    let root = dir.path();
    let target = "records/contacts/sarah-chen";

    // A clean, fully-UTF-8 linker that MUST be returned regardless.
    write(
        root,
        "wiki/people/clean.md",
        &format!("---\ntype: wiki-page\nsummary: s\n---\nSee [[{target}]].\n"),
    );

    // A linker whose link line ALSO carries a stray 0xFF byte (a mis-decoded
    // Latin-1 import). It is a byte-string literal because a `&str` fixture
    // could not express the invalid byte. The byte-level regex still matches
    // `[[target]]` on this line; pre-fix the UTF8 sink aborted here.
    let dirty: &[u8] =
        b"---\ntype: email\nsummary: s\n---\nSee [[records/contacts/sarah-chen]] \xFF here\n";

    // Defensive: confirm the fixture really is invalid UTF-8 before using it,
    // so the test exercises the bug, not a coincidentally-valid file.
    assert!(
        std::str::from_utf8(dirty).is_err(),
        "fixture must contain invalid UTF-8 to exercise the regression"
    );

    // Written with `fs::write` directly so the raw bytes reach disk unchanged.
    let dirty_abs = root.join("sources/emails/2026/05/raw.md");
    fs::create_dir_all(dirty_abs.parent().unwrap()).unwrap();
    fs::write(&dirty_abs, dirty).unwrap();

    let store = open(&dir);
    let got = rels(
        &store
            .find_links_to(Path::new(target))
            .expect("a stray non-UTF-8 byte must not abort the backlink scan"),
    );
    assert_eq!(
        got,
        vec![
            "sources/emails/2026/05/raw.md".to_string(),
            "wiki/people/clean.md".to_string(),
        ],
        "both the clean linker and the one with an invalid byte on the link \
         line are reported; the scan degrades, it does not fail"
    );
}
1883
1884 // ── find_links_to_any (batch — the O(changed × store) fix) ─────────────────
1885
/// The working-set validate discovers incoming linkers by running
/// `find_links_to_any` once over the WHOLE changed set. This test pins the
/// batch contract that makes the single pass correct: the result is the union
/// of incoming linkers across all targets, and each target keeps its own
/// boundary correctness (no alternation arm bleeds into a prefix-sharing
/// sibling). A regression back to a per-object loop would still satisfy the
/// union; the boundary and union-equivalence assertions are what guard the
/// *correctness* of folding N scans into one regex.
#[test]
fn find_links_to_any_returns_the_union_with_boundary_correctness() {
    let tmp = empty_store();
    let root = tmp.path();

    let fixtures = [
        // Each of the two targets has a dedicated linker.
        (
            "wiki/links-sarah.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
        (
            "wiki/links-acme.md",
            "---\ntype: wiki-page\nsummary: s\n---\nDeal with [[records/companies/acme|Acme]].\n",
        ),
        // A file linking to BOTH targets must be listed exactly once: the
        // per-file early-exit folds multi-target hits into one result row.
        (
            "records/meetings/2026/05/m.md",
            "---\ntype: meeting\nsummary: s\n---\n[[records/contacts/sarah-chen]] re \
             [[records/companies/acme]]\n",
        ),
        // Prefix-sharing sibling: a link to `sarah-chen-jr` is NOT a link to
        // `sarah-chen`, even with `sarah-chen` as one alternation arm.
        (
            "wiki/links-jr.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen-jr]]\n",
        ),
        // Links to neither requested target.
        (
            "wiki/unrelated.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[wiki/themes/spend]]\n",
        ),
    ];
    for (rel, body) in fixtures {
        write(root, rel, body);
    }

    let store = open(&tmp);
    let targets = [
        PathBuf::from("records/contacts/sarah-chen"),
        PathBuf::from("records/companies/acme"),
    ];

    let got = rels(&store.find_links_to_any(&targets).unwrap());
    let expected: Vec<String> = [
        "records/meetings/2026/05/m.md",
        "wiki/links-acme.md",
        "wiki/links-sarah.md",
    ]
    .into_iter()
    .map(String::from)
    .collect();
    assert_eq!(
        got, expected,
        "batch finder must return the deduped union of linkers across all \
         targets, excluding the prefix-sibling and the unrelated file"
    );

    // Equivalence: the batch answer has to match the union of the single
    // finder run once per target — the property the working-set path relies on
    // when it replaces one scan per object with one scan for the whole set.
    let per_target_union: std::collections::BTreeSet<PathBuf> = targets
        .iter()
        .flat_map(|t| store.find_links_to(t).unwrap())
        .collect();
    assert_eq!(
        rels(&per_target_union.into_iter().collect::<Vec<_>>()),
        got,
        "find_links_to_any must equal the union of per-target find_links_to"
    );
}
1968
/// With no targets the batch finder must scan nothing and return nothing. In
/// particular it must NOT build a match-everything empty regex, which would
/// report every `.md` file as a linker. This is the fast path the `validate`
/// loop takes when the working set is empty.
#[test]
fn find_links_to_any_empty_targets_matches_nothing() {
    let tmp = empty_store();
    let root = tmp.path();
    // One file that does contain a link, so "match everything" would be visible.
    write(
        root,
        "wiki/a.md",
        "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
    );
    let store = open(&tmp);

    let for_no_targets = store.find_links_to_any(&[]).unwrap();
    assert!(
        for_no_targets.is_empty(),
        "no targets ⇒ no linkers (an empty pattern must not match every file)"
    );

    // Targets that render to empty link text behave like no targets at all:
    // a no-op, never a match-everything.
    let degenerate = [PathBuf::from(""), PathBuf::from("./")];
    let for_degenerate = store.find_links_to_any(&degenerate).unwrap();
    assert!(
        for_degenerate.is_empty(),
        "targets that render to empty link text contribute no alternation arm"
    );
}
1998
1999 // ── read_type_index ──────────────────────────────────────────────────────
2000
/// A well-formed sidecar parses into records whose typed columns are filled
/// in and whose remaining keys end up in `fields`.
#[test]
fn read_type_index_parses_records_and_flattens_fields() {
    let tmp = empty_store();
    let root = tmp.path();

    // Two newline-terminated JSON lines: a fully populated record and a
    // minimal one without `tags` / `links`.
    let sidecar = concat!(
        r#"{"path":"records/expenses/2026/05/a.md","type":"expense","summary":"lunch","tags":["meals"],"links":["records/companies/acme"],"created":"2026-05-01T00:00:00Z","updated":"2026-05-01T00:00:00Z","vendor":"acme","amount":42}"#,
        "\n",
        r#"{"path":"records/expenses/2026/05/b.md","type":"expense","summary":"taxi","created":null,"updated":null,"vendor":"yellow"}"#,
        "\n",
    );
    let rel = write(root, "records/expenses/index.jsonl", sidecar);
    let store = open(&tmp);
    let recs = store.read_type_index(&store.abs_path(&rel)).unwrap();

    assert_eq!(recs.len(), 2);
    // Results come back sorted by path ascending, so `a.md` is first.
    let (full, minimal) = (&recs[0], &recs[1]);

    assert_eq!(full.path, PathBuf::from("records/expenses/2026/05/a.md"));
    assert_eq!(full.type_, "expense");
    assert_eq!(full.summary, "lunch");
    assert_eq!(full.tags, vec!["meals".to_string()]);
    assert_eq!(full.links, vec!["records/companies/acme".to_string()]);
    assert!(full.created.is_some());

    // Keys that are not typed columns are flattened into `fields`.
    assert_eq!(
        full.fields.get("vendor"),
        Some(&serde_json::json!("acme"))
    );
    assert_eq!(full.fields.get("amount"), Some(&serde_json::json!(42)));

    // Absent `tags` / `links` default to empty lists.
    assert!(minimal.tags.is_empty());
    assert!(minimal.links.is_empty());
}
2031
/// When a path appears on several lines the last line wins, and blank lines
/// in the sidecar are skipped rather than treated as parse errors.
#[test]
fn read_type_index_last_write_wins_and_skips_blanks() {
    let tmp = empty_store();
    let root = tmp.path();

    // The same path twice ("old", then "new") with an empty line between.
    let sidecar = concat!(
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"old","created":null,"updated":null}"#,
        "\n",
        "\n",
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"new","created":null,"updated":null}"#,
        "\n",
    );
    let rel = write(root, "records/contacts/index.jsonl", sidecar);
    let store = open(&tmp);

    let recs = store.read_type_index(&store.abs_path(&rel)).unwrap();
    assert_eq!(recs.len(), 1, "duplicate path collapses to one record");
    assert_eq!(recs[0].summary, "new", "later line must win");
}
2049
/// A line that is not valid JSON must surface as `BadTypeIndex`.
#[test]
fn read_type_index_errors_on_malformed_line() {
    let tmp = empty_store();
    let rel = write(
        tmp.path(),
        "records/contacts/index.jsonl",
        "{not valid json}\n",
    );
    let store = open(&tmp);

    let outcome = store.read_type_index(&store.abs_path(&rel));
    assert!(matches!(
        outcome.unwrap_err(),
        StoreError::BadTypeIndex { .. }
    ));
}
2059
2060 // ── find_by_type / find_by_where ─────────────────────────────────────────
2061
/// Builds one newline-terminated `index.jsonl` line for a test fixture.
///
/// `path`, `type_` and `summary` are spliced in verbatim (no JSON escaping),
/// `created` / `updated` are always `null`, and `extra` is inserted raw just
/// before the closing brace, so it must begin with a comma when non-empty
/// (e.g. `,"vendor":"acme"`).
fn jsonl_line(path: &str, type_: &str, summary: &str, extra: &str) -> String {
    let mut line = format!(
        r#"{{"path":"{path}","type":"{type_}","summary":"{summary}","created":null,"updated":null{extra}}}"#
    );
    line.push('\n');
    line
}
2067
/// `find_by_type` returns the records of the requested type from its
/// canonical folder's sidecar, path-sorted, and nothing of any other type.
#[test]
fn find_by_type_reads_canonical_folder_sidecar() {
    let tmp = empty_store();
    let root = tmp.path();

    // `records/contacts` is the canonical folder for `contact`.
    let contacts = [
        jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
        jsonl_line("records/contacts/elena.md", "contact", "Elena", ""),
    ]
    .concat();
    write(root, "records/contacts/index.jsonl", &contacts);

    // Another type's sidecar, which a contact query must not pick up.
    let companies = jsonl_line("records/companies/acme.md", "company", "Acme", "");
    write(root, "records/companies/index.jsonl", &companies);

    let store = open(&tmp);
    let recs = store.find_by_type("contact").unwrap();

    // Path-sorted: `elena.md` sorts before `sarah.md`.
    let summaries: Vec<_> = recs.iter().map(|rec| rec.summary.clone()).collect();
    assert_eq!(summaries, vec!["Elena".to_string(), "Sarah".to_string()]);
    assert!(recs.iter().all(|rec| rec.type_ == "contact"));
}
2091
/// Regression for a silent-incompleteness bug: as soon as the canonical
/// type-folder sidecar existed, `find_by_type` read ONLY that sidecar and
/// dropped same-type records filed in a non-canonical folder of the SAME
/// layer, so results became incomplete the moment a canonical record was
/// added. The write path actively enables such layouts (`records/clients/`
/// for a `contact`, `wiki/<topic>/` for any `wiki-page`), which makes this a
/// reachable, dedup-breaking omission.
#[test]
fn regression_find_by_type_includes_non_canonical_folder_when_canonical_exists() {
    let tmp = empty_store();
    let root = tmp.path();

    // (sidecar path, record path, type, summary), written in order:
    //  1. the CANONICAL `contact` sidecar — its presence is what triggered
    //     the bug;
    //  2. a `contact` in a NON-canonical folder of the same (Records) layer —
    //     dropped before the fix, required now;
    //  3. a different type in the same layer, which must NOT leak in (the read
    //     is type-filtered, not a blind whole-layer dump).
    let sidecars = [
        (
            "records/contacts/index.jsonl",
            "records/contacts/sarah.md",
            "contact",
            "Sarah",
        ),
        (
            "records/clients/index.jsonl",
            "records/clients/elena.md",
            "contact",
            "Elena",
        ),
        (
            "records/companies/index.jsonl",
            "records/companies/acme.md",
            "company",
            "Acme",
        ),
    ];
    for (sidecar, record, type_, summary) in sidecars {
        write(root, sidecar, &jsonl_line(record, type_, summary, ""));
    }

    let store = open(&tmp);
    let found: std::collections::BTreeSet<String> = store
        .find_by_type("contact")
        .unwrap()
        .into_iter()
        .map(|rec| rec.path.to_string_lossy().into_owned())
        .collect();

    let wanted: std::collections::BTreeSet<String> =
        ["records/clients/elena.md", "records/contacts/sarah.md"]
            .into_iter()
            .map(String::from)
            .collect();
    assert_eq!(
        found, wanted,
        "both the canonical-folder and the non-canonical-folder contact must \
         be returned; the company record must be excluded"
    );
}
2144
/// Regression for the scoped-backlinks variant of the same bug
/// (`graph backlinks --type wiki-page`). `wiki-page`'s canonical folder is
/// `wiki/topics`, but the SPEC files wiki pages under `wiki/<topic>/` for ANY
/// topic. With a `wiki/topics/index.jsonl` present, the old code read only
/// that folder and missed pages under `wiki/people/`, `wiki/projects/`, etc.,
/// under-reporting dependents in a blast-radius check. The whole-`wiki/`-layer
/// read has to surface all of them.
#[test]
fn regression_find_by_type_wiki_page_spans_multiple_topic_folders() {
    let tmp = empty_store();
    let root = tmp.path();

    // One wiki-page per topic folder: (sidecar path, page path, summary).
    let pages = [
        (
            "wiki/topics/index.jsonl",
            "wiki/topics/billing.md",
            "Billing",
        ),
        (
            "wiki/people/index.jsonl",
            "wiki/people/sarah-chen.md",
            "Sarah Chen",
        ),
        (
            "wiki/projects/index.jsonl",
            "wiki/projects/atlas.md",
            "Atlas",
        ),
    ];
    for (sidecar, page, summary) in pages {
        write(root, sidecar, &jsonl_line(page, "wiki-page", summary, ""));
    }

    let store = open(&tmp);
    let found: std::collections::BTreeSet<String> = store
        .find_by_type("wiki-page")
        .unwrap()
        .into_iter()
        .map(|rec| rec.path.to_string_lossy().into_owned())
        .collect();

    let wanted: std::collections::BTreeSet<String> = [
        "wiki/people/sarah-chen.md",
        "wiki/projects/atlas.md",
        "wiki/topics/billing.md",
    ]
    .into_iter()
    .map(String::from)
    .collect();
    assert_eq!(
        found, wanted,
        "a wiki-page query must return pages from every topic folder, not \
         just the canonical wiki/topics/"
    );
}
2193
/// When the canonical sidecar is missing, `find_by_type` falls back to the
/// type's own layer: same-layer records in other folders are found,
/// same-type records in other layers are not.
#[test]
fn find_by_type_canonical_absent_falls_back_within_the_layer_only() {
    let tmp = empty_store();
    let root = tmp.path();

    // A custom `proposal` filed under the natural plural
    // `records/proposals/`. `default_type_folder("proposal")` is
    // `records/proposal` (bare type name, no pluralization guess), so no
    // canonical sidecar exists and the fallback runs. The fallback is bounded
    // to the type's layer (records), so this same-layer, non-canonical-folder
    // record is still found.
    let in_layer = jsonl_line("records/proposals/p1.md", "proposal", "Q3 proposal", "");
    write(root, "records/proposals/index.jsonl", &in_layer);

    // DECOY: the SAME type in a DIFFERENT layer (`sources/`). The old
    // whole-store fallback read every sidecar and would have leaked it; the
    // layer-bounded fallback must not. This also pins the fallback as
    // O(entities-in-layer), never O(store).
    let decoy = jsonl_line(
        "sources/proposals/leak.md",
        "proposal",
        "cross-layer decoy",
        "",
    );
    write(root, "sources/proposals/index.jsonl", &decoy);

    let store = open(&tmp);
    let recs = store.find_by_type("proposal").unwrap();

    assert_eq!(
        recs.len(),
        1,
        "only the records-layer proposal, not the sources decoy"
    );
    let only = &recs[0];
    assert_eq!(only.summary, "Q3 proposal");
    assert_eq!(only.path, PathBuf::from("records/proposals/p1.md"));
}
2234
/// The fallback for a type with no canonical sidecar stays inside that type's
/// layer and yields nothing when the layer holds no such records.
#[test]
fn find_by_type_canonical_absent_does_not_read_other_layers() {
    let tmp = empty_store();

    // `email` lives canonically in `sources/emails` (Sources layer), and no
    // sidecar exists there, so `find_by_type("email")` falls back — within
    // Sources only. The populated Records sidecar below must stay untouched:
    // the fallback is layer-bounded, not store-wide. The old
    // `read_all_type_indexes_in(None)` fallback would have read and filtered
    // it (wasted O(store) I/O); now it lies outside the walk root entirely.
    let contact = jsonl_line("records/contacts/sarah.md", "contact", "Sarah", "");
    write(tmp.path(), "records/contacts/index.jsonl", &contact);

    let store = open(&tmp);
    // No email anywhere, so the result is empty; the records layer was never
    // in scope.
    let emails = store.find_by_type("email").unwrap();
    assert!(emails.is_empty());
}
2255
/// `find_by_where` matches against flattened extra fields, scalar typed
/// columns, and list-typed columns (by membership).
#[test]
fn find_by_where_matches_typed_columns_and_flat_fields() {
    let tmp = empty_store();
    let root = tmp.path();

    // Two expenses with different vendors; only the first carries tags.
    let expenses = [
        jsonl_line(
            "records/expenses/a.md",
            "expense",
            "lunch",
            ",\"vendor\":\"acme\",\"tags\":[\"meals\"]",
        ),
        jsonl_line(
            "records/expenses/b.md",
            "expense",
            "taxi",
            ",\"vendor\":\"yellow\"",
        ),
    ]
    .concat();
    write(root, "records/expenses/index.jsonl", &expenses);

    // One tagged contact.
    let contacts = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        ",\"tags\":[\"customer\"]",
    );
    write(root, "records/contacts/index.jsonl", &contacts);

    let store = open(&tmp);

    // A key that only exists in the flattened `fields` map.
    let acme_hits = store.find_by_where("vendor", "acme").unwrap();
    assert_eq!(acme_hits.len(), 1);
    assert_eq!(acme_hits[0].path, PathBuf::from("records/expenses/a.md"));

    // The typed `type` column matches both expense records.
    let expense_hits = store.find_by_where("type", "expense").unwrap();
    assert_eq!(expense_hits.len(), 2);

    // The typed list column `tags` matches by membership.
    let customer_hits = store.find_by_where("tags", "customer").unwrap();
    assert_eq!(customer_hits.len(), 1);
    assert_eq!(
        customer_hits[0].path,
        PathBuf::from("records/contacts/sarah.md")
    );

    // A value nobody has yields an empty result.
    let nobody_hits = store.find_by_where("vendor", "nobody").unwrap();
    assert!(nobody_hits.is_empty());
}
2306
/// Timestamp columns are compared as instants: any RFC 3339 spelling of the
/// stored instant matches, while a different instant or a non-RFC-3339 value
/// does not.
#[test]
fn find_by_where_matches_timestamps_across_rfc3339_spellings() {
    let tmp = empty_store();
    let root = tmp.path();

    // db.md files most commonly use the `Z` UTC spelling, and an index.jsonl
    // serialized from such a file keeps it verbatim. `updated` deliberately
    // uses a non-UTC offset.
    let sidecar = concat!(
        r#"{"path":"records/meetings/kickoff.md","type":"meeting","summary":"kickoff","created":"2026-05-01T00:00:00Z","updated":"2026-05-02T09:30:00-07:00"}"#,
        "\n",
    );
    write(root, "records/meetings/index.jsonl", sidecar);
    let store = open(&tmp);

    // The exact `Z` form an agent reads out of the file matches.
    let exact = store
        .find_by_where("created", "2026-05-01T00:00:00Z")
        .unwrap();
    assert_eq!(exact.len(), 1);
    assert_eq!(exact[0].path, PathBuf::from("records/meetings/kickoff.md"));

    // Other spellings of a stored instant match too (instant comparison, not
    // string comparison), tried in this order:
    //  - `created` with an explicit `+00:00` offset;
    //  - `updated` in its own stored `-07:00` spelling;
    //  - `updated` expressed as the same instant in `Z`.
    let equivalent_spellings = [
        ("created", "2026-05-01T00:00:00+00:00"),
        ("updated", "2026-05-02T09:30:00-07:00"),
        ("updated", "2026-05-02T16:30:00Z"),
    ];
    for (key, value) in equivalent_spellings {
        assert_eq!(store.find_by_where(key, value).unwrap().len(), 1);
    }

    // Non-matches, in order: a different instant (one second later), then a
    // value that is not RFC 3339 at all.
    for value in ["2026-05-01T00:00:01Z", "2026-05-01"] {
        assert!(store.find_by_where("created", value).unwrap().is_empty());
    }
}
2366
/// The O(entities-in-layer) contract: a layer-scoped where read walks ONLY
/// the named layer's subtree. Proven structurally — a *malformed* sidecar in
/// another layer would make `read_type_index` fail if it were read, so a
/// scoped read that succeeds (and omits that record) shows the other layer's
/// I/O never happened.
#[test]
fn find_by_where_in_layer_reads_only_that_layers_sidecars() {
    let tmp = empty_store();
    let root = tmp.path();

    // A healthy records-layer sidecar holding the field being queried.
    let company = jsonl_line(
        "records/companies/acme.md",
        "company",
        "Acme",
        ",\"domain\":\"acme.com\"",
    );
    write(root, "records/companies/index.jsonl", &company);

    // The sources layer holds a corrupt sidecar.
    write(
        root,
        "sources/emails/index.jsonl",
        "{ this is not valid json and would error if read }\n",
    );
    let store = open(&tmp);

    // Records scope: the corrupt sources sidecar is outside it, so the read
    // succeeds and yields just the records-layer match.
    let scoped = store
        .find_by_where_in("domain", "acme.com", Some(Layer::Records))
        .expect("a records-scoped read must not touch the sources sidecar");
    let scoped_paths: Vec<_> = scoped.iter().map(|rec| rec.path.clone()).collect();
    assert_eq!(
        rels(&scoped_paths),
        vec!["records/companies/acme.md".to_string()]
    );

    // Unscoped: the walk DOES reach the corrupt sidecar and reports a parse
    // error, confirming the file really is in the tree and that only the
    // layer scope spares it.
    let unscoped = store.find_by_where("domain", "acme.com");
    assert!(
        matches!(unscoped, Err(StoreError::BadTypeIndex { .. })),
        "unscoped read walks every layer and hits the corrupt sidecar"
    );

    // Sources scope: the scope now contains the corrupt sidecar, so the read
    // errors. The scope is a real subtree bound, not a silent "skip anything
    // that fails".
    let sources_scoped = store.find_by_where_in("domain", "acme.com", Some(Layer::Sources));
    assert!(matches!(
        sources_scoped,
        Err(StoreError::BadTypeIndex { .. })
    ));
}
2424
/// Scoping a where read to a layer whose folder does not exist yet returns an
/// empty result (mirroring `walk_layer`'s missing-dir guard) instead of a walk
/// error from `ignore` over a nonexistent path.
#[test]
fn find_by_where_in_missing_layer_is_empty_not_an_error() {
    let tmp = empty_store();

    // Only the records layer is populated.
    let contact = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        ",\"city\":\"denver\"",
    );
    write(tmp.path(), "records/contacts/index.jsonl", &contact);
    let store = open(&tmp);

    // `wiki/` was never created: the scoped read yields nothing and succeeds.
    let wiki_hits = store
        .find_by_where_in("city", "denver", Some(Layer::Wiki))
        .expect("missing layer subtree is empty, not an error");
    assert!(wiki_hits.is_empty());

    // The identical query scoped to the populated layer still finds the record.
    let records_hits = store
        .find_by_where_in("city", "denver", Some(Layer::Records))
        .unwrap();
    assert_eq!(records_hits.len(), 1);
}
2456
2457 // ── abs_path / rel_path ──────────────────────────────────────────────────
2458
/// `abs_path` and `rel_path` are inverses for paths inside the store;
/// `abs_path` leaves absolute input alone and `rel_path` rejects outside paths.
#[test]
fn abs_and_rel_path_roundtrip() {
    let tmp = empty_store();
    let store = open(&tmp);

    // relative -> absolute -> relative comes back to the starting point.
    let relative = Path::new("records/contacts/sarah.md");
    let absolute = store.abs_path(relative);
    assert_eq!(absolute, tmp.path().join(relative));
    assert_eq!(store.rel_path(&absolute).as_deref(), Some(relative));

    // Feeding an already-absolute path to `abs_path` returns it unchanged.
    assert_eq!(store.abs_path(&absolute), absolute);

    // A path outside the store has no store-relative form.
    let outside = Path::new("/somewhere/else.md");
    assert_eq!(store.rel_path(outside), None);
}
2474
2475 // ── infer_type_from_path (inverse of default_type_folder) ────────────────
2476
/// Each recognized `<layer>/<type-folder>` maps back to its type, and every
/// wiki sub-folder maps to `wiki-page`.
#[test]
fn infer_type_maps_every_recognized_folder_back_to_its_type() {
    // (file path, type expected from its folder)
    let table: [(&str, &str); 11] = [
        // sources layer
        ("sources/emails/x.md", "email"),
        ("sources/transcripts/x.md", "transcript"),
        ("sources/docs/x.md", "pdf-source"),
        // records layer
        ("records/contacts/x.md", "contact"),
        ("records/companies/x.md", "company"),
        ("records/expenses/x.md", "expense"),
        ("records/meetings/x.md", "meeting"),
        ("records/decisions/x.md", "decision"),
        ("records/invoices/x.md", "invoice"),
        // wiki layer: the topic folder's name is irrelevant, the type is
        // always `wiki-page`.
        ("wiki/topics/x.md", "wiki-page"),
        ("wiki/pricing/x.md", "wiki-page"),
    ];

    table.into_iter().for_each(|(path, expected)| {
        let inferred = infer_type_from_path(Path::new(path));
        assert_eq!(
            inferred.as_deref(),
            Some(expected),
            "path {path} should infer type {expected}"
        );
    });
}
2501
/// The canonical invariant: inference is the inverse of the forward map. Any
/// recognized type sent through `default_type_folder` and back through
/// `infer_type_from_path` must come out as itself. `wiki-page` is the one
/// many-to-one case (every topic folder maps back to `wiki-page`), so its
/// forward folder round-trips like the others.
#[test]
fn infer_type_round_trips_with_default_type_folder() {
    const RECOGNIZED: [&str; 10] = [
        "email",
        "transcript",
        "pdf-source",
        "contact",
        "company",
        "expense",
        "meeting",
        "decision",
        "invoice",
        "wiki-page",
    ];

    for type_ in RECOGNIZED {
        // Forward: type -> canonical folder; then a file placed inside it.
        let folder = default_type_folder(type_);
        let inferred = infer_type_from_path(&folder.join("x.md"));
        // Backward: that file's path must name the original type.
        assert_eq!(
            inferred.as_deref(),
            Some(type_),
            "recognized type {type_} (folder {folder:?}) must round-trip"
        );
    }
}
2531
/// Regression guard for the CLI/core divergence. For an unrecognized type,
/// `default_type_folder` falls back to the BARE type name
/// (`task → records/task`, `tasks → records/tasks`). Inference must NOT
/// singularize, otherwise a custom type would fail to round-trip (e.g.
/// `records/tasks` → `task` would collide with
/// `default_type_folder("task") → records/task`).
#[test]
fn infer_type_round_trips_custom_types_verbatim_no_singularization() {
    let custom_types = ["task", "tasks", "playbook", "process", "okrs", "ticket"];
    for custom in custom_types {
        // Forward map: bare type name under `records/`.
        let folder = default_type_folder(custom);
        assert_eq!(folder, Path::new("records").join(custom));

        // Inverse map: the folder name comes back unchanged.
        let inferred = infer_type_from_path(&folder.join("x.md"));
        assert_eq!(
            inferred.as_deref(),
            Some(custom),
            "custom type {custom} must round-trip verbatim (no singularization)"
        );
    }

    // The exact case named in the finding: a plural custom folder keeps its
    // trailing `s` and is NOT reduced to `task`.
    let plural = infer_type_from_path(Path::new("records/tasks/x.md"));
    assert_eq!(
        plural.as_deref(),
        Some("tasks"),
        "records/tasks must infer `tasks`, not `task`"
    );
}
2558
/// Inference needs at least `<layer>/<type-folder>/<file>` with a known
/// layer; deeper paths are resolved from the first type-folder segment.
#[test]
fn infer_type_requires_three_component_layer_folder_file_shape() {
    // A file sitting directly under a layer (or at the root) has fewer than 3
    // components and thus no type-folder: inference yields None, matching the
    // old CLI contract.
    for too_shallow in ["records/x.md", "sources/x.md", "wiki/x.md", "x.md"] {
        assert_eq!(infer_type_from_path(Path::new(too_shallow)), None);
    }

    // A leading component that is not a known layer is never inferred.
    assert_eq!(infer_type_from_path(Path::new("foo/bar/x.md")), None);

    // Extra depth is fine: a sharded record such as
    // records/expenses/2026/05/x.md is still typed by its first type-folder.
    let sharded = infer_type_from_path(Path::new("records/expenses/2026/05/x.md"));
    assert_eq!(sharded.as_deref(), Some("expense"));
}
2576}