dbmd_core/store.rs
1//! `store` — walk, locate, and shard a db.md store.
2//!
//! A db.md store is one directory marked by an uppercase `DB.md` at its root.
//! Store-walking subcommands enter through [`Store::open_strict`] (normal
//! commands) or the lenient [`Store::open`] (validation); in both, a missing
//! `DB.md` is the [`NotAStore`] error (`NOT_A_STORE`). The toolkit never
//! guesses a store root.
7//!
8//! Scale discipline lives here: [`Store::walk`] and the layer/type-folder
9//! walks are **SWEEP** primitives used only by `validate --all`,
10//! `index rebuild`, and `stats`. The interactive loop instead uses
11//! [`Store::find_links_to`] / [`Store::find_links_to_any`] (embedded ripgrep,
12//! presence-only) and the `index.jsonl` sidecar readers
13//! ([`Store::find_by_type`] / [`Store::find_by_where`] /
14//! [`Store::read_type_index`]) — never a whole-store parse. The batch
15//! [`Store::find_links_to_any`] is what keeps the working-set validate's
16//! incoming-linker discovery a single store scan rather than one scan per
17//! changed object.
18
19use std::collections::BTreeMap;
20use std::path::{Path, PathBuf};
21
22use chrono::{DateTime, Datelike, FixedOffset};
23use grep::regex::RegexMatcher;
24use grep::searcher::sinks::UTF8;
25use grep::searcher::Searcher;
26use ignore::WalkBuilder;
27
28use crate::index::IndexRecord;
29use crate::parser::{parse_db_md, Config, Frontmatter};
30
/// Markdown basenames that are bookkeeping rather than content: the `DB.md`
/// config marker plus the curator-maintained `index.md` and `log.md` catalogs.
/// Content walks filter these out so a catalog is never mistaken for a record.
const NON_CONTENT_BASENAMES: [&str; 3] = ["DB.md", "index.md", "log.md"];

/// Basename of the per-type-folder JSONL sidecar that every structured read
/// consumes.
const TYPE_INDEX_FILE: &str = "index.jsonl";
38
39/// Returned when a path is opened as a store but has no `DB.md` at its root.
40/// Surfaced as the structured code `NOT_A_STORE` with a non-zero exit.
41#[derive(Debug, thiserror::Error)]
42#[error("not a db.md store: {path} has no DB.md")]
43pub struct NotAStore {
44 /// The path that was inspected.
45 pub path: PathBuf,
46}
47
48/// Errors from store-level operations (walk, locate, shard, sidecar read).
49#[derive(Debug, thiserror::Error)]
50pub enum StoreError {
51 /// A sidecar `index.jsonl` could not be read or parsed.
52 #[error("failed to read type index {path}: {message}")]
53 BadTypeIndex {
54 /// The sidecar file.
55 path: PathBuf,
56 /// What went wrong.
57 message: String,
58 },
59
60 /// A required date field for sharding was absent or unparseable, and there
61 /// was no usable fallback.
62 #[error("cannot compute shard path for {file}: no usable date field")]
63 NoShardDate {
64 /// The file being placed.
65 file: PathBuf,
66 },
67
68 /// An embedded-ripgrep scan failed to start or run.
69 #[error("search failed under {root}: {message}")]
70 Search {
71 /// The root the scan ran under.
72 root: PathBuf,
73 /// What went wrong.
74 message: String,
75 },
76
77 /// An underlying I/O failure.
78 #[error(transparent)]
79 Io(#[from] std::io::Error),
80}
81
/// One of the three canonical top-level layers of a db.md store.
///
/// The derived `Ord`/`PartialOrd` follow declaration order
/// (`Sources` < `Records` < `Wiki`); sibling modules rely on that when they
/// key `BTreeMap`s on `Layer` (e.g. `stats::Stats::files_per_layer`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// `sources/` — raw, immutable evidence; date-sharded at scale.
    Sources,
    /// `records/` — atomic typed data; entity types flat, event types sharded.
    Records,
    /// `wiki/` — curator-synthesized narrative; flat.
    Wiki,
}

impl Layer {
    /// Name of the directory this layer occupies under the store root:
    /// `"sources"`, `"records"`, or `"wiki"`.
    pub fn dir_name(self) -> &'static str {
        match self {
            Layer::Sources => "sources",
            Layer::Records => "records",
            Layer::Wiki => "wiki",
        }
    }

    /// Inverse of [`Layer::dir_name`]: the layer whose folder is exactly
    /// `name` (case-sensitive), or `None` when no layer uses that name.
    pub fn from_dir_name(name: &str) -> Option<Self> {
        // Derive the mapping from `dir_name` so the two can never drift apart.
        Layer::all()
            .iter()
            .copied()
            .find(|layer| layer.dir_name() == name)
    }

    /// All layers in canonical (declaration) order.
    pub fn all() -> [Layer; 3] {
        [Layer::Sources, Layer::Records, Layer::Wiki]
    }
}
123
124/// An opened db.md store: its root path plus the parsed `DB.md` [`Config`].
125///
126/// Construct via [`Store::open`]; that is the only path in, and it validates
127/// the `DB.md` marker so downstream code can assume a real store.
128#[derive(Debug, Clone)]
129pub struct Store {
130 /// The store root (the directory containing `DB.md`).
131 pub root: PathBuf,
132 /// The parsed `DB.md` config (agent instructions, policies, schemas).
133 pub config: Config,
134}
135
136impl Store {
137 /// True if `path` is a db.md store root: an uppercase `DB.md` file exists
138 /// at `path`. On case-sensitive filesystems a lowercase `db.md` must NOT
139 /// count (the lowercase name refers to the project/spec, not the marker).
140 pub fn is_db_md_store(path: &Path) -> bool {
141 // Read the directory and match the *stored* filename byte-for-byte.
142 // `path.join("DB.md").exists()` would lie on a case-insensitive
143 // filesystem (macOS default), where a lowercase `db.md` answers a
144 // `DB.md` probe. `read_dir` returns the real on-disk name, so the
145 // exact-match check is correct on both case-sensitive (Linux) and
146 // case-insensitive filesystems.
147 let entries = match std::fs::read_dir(path) {
148 Ok(entries) => entries,
149 Err(_) => return false,
150 };
151 for entry in entries.flatten() {
152 if entry.file_name() == "DB.md" {
153 // A directory literally named `DB.md` is not the marker.
154 match entry.file_type() {
155 Ok(ft) if ft.is_dir() => return false,
156 Ok(_) => return true,
157 Err(_) => return false,
158 }
159 }
160 }
161 false
162 }
163
164 /// Open `path` as a db.md store and require `DB.md` to be readable and
165 /// parseable. Normal commands should enter through this strict gate so a
166 /// damaged config cannot silently disable schema or policy rules.
167 pub fn open_strict(path: &Path) -> crate::Result<Store> {
168 if !Store::is_db_md_store(path) {
169 return Err(NotAStore {
170 path: path.to_path_buf(),
171 }
172 .into());
173 }
174 let db_md = path.join("DB.md");
175 let text = std::fs::read_to_string(&db_md)?;
176 let config = parse_db_md(&text, &db_md)?;
177 Ok(Store {
178 root: path.to_path_buf(),
179 config,
180 })
181 }
182
183 /// Open `path` as a db.md store: confirm the `DB.md` marker (else
184 /// [`NotAStore`]) and parse the `DB.md` config when possible. This is the
185 /// lenient validation-oriented open path: a damaged `DB.md` still marks the
186 /// directory as a store so `dbmd validate` can report the config error as an
187 /// issue. Normal CLI commands should use [`Store::open_strict`] instead.
188 pub fn open(path: &Path) -> Result<Store, NotAStore> {
189 if !Store::is_db_md_store(path) {
190 return Err(NotAStore {
191 path: path.to_path_buf(),
192 });
193 }
194 let db_md = path.join("DB.md");
195 // The marker exists; parse its config. A read or parse failure leaves
196 // the store openable with default config rather than masquerading as
197 // NOT_A_STORE — the marker is present, so this *is* a store; a damaged
198 // DB.md is `dbmd validate`'s job to report, not `open`'s.
199 let config = match std::fs::read_to_string(&db_md) {
200 Ok(text) => parse_db_md(&text, &db_md).unwrap_or_default(),
201 Err(_) => Config::default(),
202 };
203 Ok(Store {
204 root: path.to_path_buf(),
205 config,
206 })
207 }
208
209 /// **SWEEP.** Recursively iterate every `.md` content file across
210 /// `sources/`, `records/`, and `wiki/`, skipping hidden dirs and `log/`.
211 /// Used only by `validate --all`, `index rebuild`, and `stats` — never on
212 /// the interactive loop.
213 pub fn walk(&self) -> Result<Vec<PathBuf>, StoreError> {
214 // Only the three content layers — never root meta files (`DB.md`,
215 // `index.md`, `log.md`) and never `log/`, which live at root and are
216 // outside every layer dir.
217 let mut out = Vec::new();
218 for layer in Layer::all() {
219 out.extend(self.walk_layer(layer)?);
220 }
221 out.sort();
222 Ok(out)
223 }
224
225 /// **SWEEP.** Like [`Store::walk`] but scoped to a single layer.
226 pub fn walk_layer(&self, layer: Layer) -> Result<Vec<PathBuf>, StoreError> {
227 let layer_root = self.root.join(layer.dir_name());
228 if !layer_root.is_dir() {
229 return Ok(Vec::new());
230 }
231 self.walk_content_md(&layer_root)
232 }
233
234 /// Enumerate every `.md` file in a single type-folder, **recursing through
235 /// its date-shards** (`sources/emails/**/*.md`). The unit the index builder
236 /// and per-folder rebuild operate on. SWEEP-class (scoped to one folder).
237 pub fn walk_type_folder(&self, type_folder: &Path) -> Result<Vec<PathBuf>, StoreError> {
238 let abs = self.resolve_under_root(type_folder);
239 if !abs.is_dir() {
240 return Ok(Vec::new());
241 }
242 self.walk_content_md(&abs)
243 }
244
245 /// The ≤`n` most-recent files in a type-folder by frontmatter `updated`
246 /// (descending), ties broken by store-relative path (ascending) — a total
247 /// order, so write-through and rebuild never disagree on #500 vs #501.
248 ///
249 /// Reads `updated` across the folder's shards — a SWEEP cost absorbed into
250 /// `index rebuild`. The write-through path never calls this. The
251 /// cap-selection primitive for the 500-entry `index.md` browse view.
252 pub fn recent_in_type_folder(
253 &self,
254 type_folder: &Path,
255 n: usize,
256 ) -> Result<Vec<PathBuf>, StoreError> {
257 let files = self.walk_type_folder(type_folder)?;
258 // (updated, rel-path) for each file. Files missing/unparseable
259 // `updated` sort *after* dated ones (None last), then by path — so they
260 // are deterministically the lowest-priority candidates for the cap, not
261 // dropped silently. The total order (updated desc, path asc) is what
262 // keeps write-through and rebuild agreeing on #500 vs #501.
263 let mut keyed: Vec<(Option<DateTime<FixedOffset>>, PathBuf)> = files
264 .into_iter()
265 .map(|rel| {
266 let updated = self.read_updated(&self.abs_path(&rel));
267 (updated, rel)
268 })
269 .collect();
270 keyed.sort_by(|a, b| {
271 // `updated` descending: newest first. `None` is treated as the
272 // oldest possible, so dated files always win a cap slot over
273 // undated ones.
274 let by_updated = b.0.cmp(&a.0);
275 by_updated.then_with(|| a.1.cmp(&b.1))
276 });
277 keyed.truncate(n);
278 Ok(keyed.into_iter().map(|(_, rel)| rel).collect())
279 }
280
281 /// The shard/flat predicate: true if the type date-shards, false if it
282 /// stays flat. True for source types and event record types
283 /// (`expense`/`invoice`/`meeting` + custom `order`/`ticket`/`transaction`),
284 /// or when `DB.md ## Schemas` declares `shard: by-date`. False for
285 /// dedup-bounded entity types (`contact`/`company`/`decision`) and `wiki/`.
286 pub fn type_shards(&self, type_: &str) -> bool {
287 // Built-in classification. Sharding is a property of the *type*:
288 // - source types carry a primary date field and shard;
289 // - event record types track business volume and shard;
290 // - dedup-bounded entity types and curation-bounded wiki stay flat.
291 // NOTE: the SPEC's `DB.md ## Schemas` `shard: by-date` override has no
292 // representation in the frozen `Schema`/`FieldSpec` types (no shard
293 // flag), so it cannot be consulted here yet — see the store findings.
294 matches!(
295 type_,
296 // source types
297 "email" | "transcript" | "pdf-source"
298 // event record types (canonical)
299 | "expense" | "invoice" | "meeting"
300 // event record types (recognized custom, per the plan)
301 | "order" | "ticket" | "transaction"
302 )
303 }
304
305 /// Compute the canonical write path for a new file. For a sharding type
306 /// (per [`Store::type_shards`]) insert `<YYYY>/<MM>/` from the type's
307 /// primary date field (`email.date`, `expense.date`, … fallback `created`)
308 /// under the type folder; flat types and `wiki/` get no shard segment.
309 /// Deterministic + stable: same input → same path, so a record never moves
310 /// once written.
311 pub fn shard_path_for(
312 &self,
313 type_: &str,
314 frontmatter: &Frontmatter,
315 name: &str,
316 ) -> Result<PathBuf, StoreError> {
317 self.shard_path_in(&default_type_folder(type_), type_, frontmatter, name)
318 }
319
320 /// Like [`Store::shard_path_for`], but compute the path under an explicit,
321 /// caller-resolved type-folder rather than the canonical default. This lets a
322 /// write surface honour an agent-supplied conforming sub-folder — e.g.
323 /// `wiki/projects/`, `wiki/people/`, `wiki/synthesis/` (the SPEC files a
324 /// `wiki-page` under `wiki/<topic>/`, i.e. ANY topic sub-folder, not only the
325 /// `wiki/topics` default) — while still applying date-sharding for sharding
326 /// types. The folder must be a conforming `<layer>/<type-folder>` (2
327 /// components, recognized layer); the caller is responsible for that (see the
328 /// CLI's `resolve_write_path`), so it is taken as given here.
329 ///
330 /// Sharding is still a property of the *type*: a sharding type gets the
331 /// `<YYYY>/<MM>` segment under `folder`; a flat type lands directly in it.
332 pub fn shard_path_in(
333 &self,
334 folder: &Path,
335 type_: &str,
336 frontmatter: &Frontmatter,
337 name: &str,
338 ) -> Result<PathBuf, StoreError> {
339 let folder = folder.to_path_buf();
340 let filename = ensure_md_extension(name);
341
342 if !self.type_shards(type_) {
343 // Flat type (entity records, wiki, decisions): no shard segment.
344 return Ok(folder.join(filename));
345 }
346
347 // Sharding type: derive <YYYY>/<MM> from the primary date field, with
348 // `created` as the universal fallback. Reading the public `Frontmatter`
349 // fields directly (typed `created`/`updated` + raw `extra`) avoids the
350 // not-yet-implemented `Frontmatter::get`/`parse` and keeps this pure.
351 let (year, month) = self
352 .primary_shard_segment(type_, frontmatter)
353 .ok_or_else(|| StoreError::NoShardDate {
354 file: folder.join(&filename),
355 })?;
356
357 Ok(folder.join(year).join(month).join(filename))
358 }
359
360 /// Find files with an incoming wiki-link to `target`, via **embedded
361 /// ripgrep** for `[[target]]` across all layers. Loop-fast; no whole-graph
362 /// build. Returns store-relative paths.
363 pub fn find_links_to(&self, target: &Path) -> Result<Vec<PathBuf>, StoreError> {
364 // A single target is just the degenerate batch case — one alternation
365 // arm, one store scan. Routing through `find_links_to_any` keeps the
366 // pattern construction and the scan loop in exactly one place. The
367 // batch API takes `&[PathBuf]`, so the one-element slice is owned (a
368 // single alloc on this single-target convenience path; the batch path
369 // validate.rs rides is untouched).
370 self.find_links_to_any(&[target.to_path_buf()])
371 }
372
373 /// Find every file with an incoming wiki-link to **any** of `targets`, in a
374 /// **single embedded-ripgrep pass** over the store (one `.md` walk, one
375 /// presence-only scan per file). This is the batch incoming-linker finder the
376 /// working-set [`crate::validate::validate_working_set`] sits on: it must find
377 /// the linkers for the *whole* changed set without paying a full store read
378 /// per changed object. Cost is therefore one store scan (O(store)), NOT
379 /// `targets.len() × store` — calling [`find_links_to`](Self::find_links_to)
380 /// in a loop would reread every `.md` once per target and is the exact
381 /// `O(changed × store)` blow-up this method exists to prevent. Returns
382 /// store-relative paths (deduped, sorted).
383 ///
384 /// Why content scan and not the sidecar `links` field: the sidecar projects
385 /// only the frontmatter `links:` array, so it misses edges written in the
386 /// body or in typed fields (`company: [[…]]`). Finding an incoming link to an
387 /// arbitrary path therefore requires reading file content — the same reason
388 /// the single-target finder uses ripgrep.
389 pub fn find_links_to_any(&self, targets: &[PathBuf]) -> Result<Vec<PathBuf>, StoreError> {
390 // The wiki-link doctrine: a link is the full store-relative path, no
391 // `.md` extension. A reference to a target therefore appears literally
392 // as `[[<target>]]`, optionally with a `|display` suffix and (warned
393 // but accepted) a trailing `.md`. Build ONE regex that matches all
394 // accepted spellings of an incoming link to ANY target, escaping each
395 // target so path separators / dots stay literal and the alternation
396 // arms keep their boundaries (a link to `sarah` never matches
397 // `sarah-chen`).
398 let mut arms: Vec<String> = Vec::new();
399 for target in targets {
400 let target_str = path_to_link_str(target);
401 if target_str.is_empty() {
402 continue;
403 }
404 // [[ <target> (.md)? ( | display )? ]]
405 arms.push(format!(
406 r"\[\[{}(\.md)?(\|[^\]]*)?\]\]",
407 regex::escape(&target_str)
408 ));
409 }
410 // No usable targets → no possible incoming links, and an empty pattern
411 // would compile to a match-everything regex. Short-circuit instead.
412 if arms.is_empty() {
413 return Ok(Vec::new());
414 }
415 let pattern = arms.join("|");
416
417 let matcher = RegexMatcher::new(&pattern).map_err(|e| StoreError::Search {
418 root: self.root.clone(),
419 message: format!("invalid backlink pattern: {e}"),
420 })?;
421
422 let mut hits = std::collections::BTreeSet::new();
423 // Scan every `.md` file in the store (skip hidden + `log/`), including
424 // `index.md` catalogs — an incoming reference is wherever the literal
425 // link text lives; the caller decides relevance. ONE walk for the whole
426 // target set; per file we stop at the first hit (presence is all we
427 // need), so a file that links to several targets is read once, not once
428 // per target.
429 for rel in self.walk_all_md()? {
430 let abs = self.abs_path(&rel);
431 let mut matched_here = false;
432 let mut searcher = Searcher::new();
433 let res = searcher.search_path(
434 &matcher,
435 &abs,
436 UTF8(|_lnum, _line| {
437 matched_here = true;
438 // Stop at the first hit: presence is all we need.
439 Ok(false)
440 }),
441 );
442 if let Err(e) = res {
443 return Err(StoreError::Search {
444 root: self.root.clone(),
445 message: format!("search failed in {}: {e}", abs.display()),
446 });
447 }
448 if matched_here {
449 hits.insert(rel);
450 }
451 }
452 Ok(hits.into_iter().collect())
453 }
454
455 /// Candidate set for a `type` query: read the relevant type-folder
456 /// `index.jsonl` sidecar(s) and return their records. Complete and
457 /// cold-cache-proof — NOT a walk-and-parse or a frontmatter ripgrep scan,
458 /// and **never a store-wide read**. The common path is one sequential read
459 /// of the canonical type-folder sidecar (O(entities)); when that sidecar is
460 /// absent the read is bounded to the type's single layer subtree
461 /// (O(entities-in-layer)), so a `--type proposal` query before that folder
462 /// has been indexed still stays inside the interactive loop's O(entities)
463 /// contract instead of fanning out across every sidecar in the store.
464 pub fn find_by_type(&self, type_: &str) -> Result<Vec<IndexRecord>, StoreError> {
465 // Read the type's canonical-folder sidecar when it exists (the common,
466 // O(entities) path). Otherwise fall back to the sidecars of the *one
467 // layer* the type belongs to and filter by `type` — complete for records
468 // filed under a non-canonical folder name within that layer (e.g. a
469 // custom `proposal` filed in `records/proposals/` when the canonical
470 // guess is the bare `records/proposal/`), without the whole-store
471 // sidecar fan-out that would break the interactive loop's O(entities)
472 // contract. A type lives in exactly one layer, and `default_type_folder`
473 // always encodes it (recognized → its SPEC layer; unrecognized →
474 // `records/`), so the fallback walk is bounded to that layer's subtree —
475 // O(entities-in-layer), never O(store). Either way: sequential, complete
476 // sidecar reads, never a walk-and-parse of the tree.
477 let canonical_folder = default_type_folder(type_);
478 let canonical = self.root.join(&canonical_folder).join(TYPE_INDEX_FILE);
479 let records = if canonical.is_file() {
480 self.read_type_index(&canonical)?
481 } else {
482 self.read_all_type_indexes_in(layer_of_folder(&canonical_folder))?
483 };
484 Ok(records.into_iter().filter(|r| r.type_ == type_).collect())
485 }
486
487 /// Candidate set for a `key=value` frontmatter query, **store-wide**: read
488 /// every type-folder `index.jsonl` sidecar and filter their records. The
489 /// unscoped pre-write dedup primitive; prefer [`Store::find_by_where_in`]
490 /// with a layer scope to stay O(entities-in-layer) on the interactive loop.
491 pub fn find_by_where(&self, key: &str, value: &str) -> Result<Vec<IndexRecord>, StoreError> {
492 self.find_by_where_in(key, value, None)
493 }
494
495 /// Candidate set for a `key=value` frontmatter query, **scoped to one
496 /// layer** when `layer` is `Some`: the sidecar walk is confined to that
497 /// layer's subtree (`<root>/<layer>/`), so the I/O is O(entities-in-layer),
498 /// not O(store records). `None` keeps the store-wide read.
499 ///
500 /// This is what makes `--in <layer>` an I/O scope, not just a result
501 /// filter: a `--where`-only query (no `--type`) used to read every sidecar
502 /// in the store and narrow by layer in memory, breaking the O(entities)
503 /// contract the interactive loop depends on. With a layer in hand we walk
504 /// only that layer's sidecars.
505 pub fn find_by_where_in(
506 &self,
507 key: &str,
508 value: &str,
509 layer: Option<Layer>,
510 ) -> Result<Vec<IndexRecord>, StoreError> {
511 // A `key=value` query can target any frontmatter field across any type,
512 // so within the chosen subtree we still read every type-folder sidecar
513 // and filter. The layer (when given) bounds *which* subtree, turning a
514 // whole-store walk into a single-layer walk.
515 let records = self.read_all_type_indexes_in(layer)?;
516 Ok(records
517 .into_iter()
518 .filter(|r| record_matches_field(r, key, value))
519 .collect())
520 }
521
522 /// Every record across the type-folder `index.jsonl` sidecars, scoped to one
523 /// layer when `layer` is `Some` (the walk is confined to `<root>/<layer>/`)
524 /// else store-wide. Sequential, complete sidecar reads — never a
525 /// walk-and-parse of the content tree.
526 ///
527 /// This is the unfiltered sidecar-enumeration primitive the relationship
528 /// loop sits on: [`crate::graph::backlinks_filtered`] uses it to bound its
529 /// candidate set to the relevant layer (or the whole store) without opening
530 /// the content tree, then confirms each candidate's edge by parsing the file.
531 pub fn sidecar_records(&self, layer: Option<Layer>) -> Result<Vec<IndexRecord>, StoreError> {
532 self.read_all_type_indexes_in(layer)
533 }
534
535 /// Parse a type-folder's `index.jsonl` into [`IndexRecord`]s, applying
536 /// last-write-wins by `path` over any un-compacted lines. The sidecar-read
537 /// primitive every structured query sits on.
538 pub fn read_type_index(&self, index_jsonl: &Path) -> Result<Vec<IndexRecord>, StoreError> {
539 let text = std::fs::read_to_string(index_jsonl).map_err(|e| StoreError::BadTypeIndex {
540 path: index_jsonl.to_path_buf(),
541 message: e.to_string(),
542 })?;
543
544 // Last-write-wins by `path` over un-compacted lines: a later line for
545 // the same path supersedes an earlier one (the jsonl is append-mostly
546 // and only compacted on rebuild). Blank lines are skipped; a non-blank
547 // line that is not a valid IndexRecord is a hard parse error.
548 let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
549 for (i, line) in text.lines().enumerate() {
550 let trimmed = line.trim();
551 if trimmed.is_empty() {
552 continue;
553 }
554 let record: IndexRecord =
555 serde_json::from_str(trimmed).map_err(|e| StoreError::BadTypeIndex {
556 path: index_jsonl.to_path_buf(),
557 message: format!("line {}: {e}", i + 1),
558 })?;
559 by_path.insert(record.path.clone(), record);
560 }
561 // BTreeMap keyed by path → records emerge sorted by path ascending,
562 // a deterministic order independent of line order in the file.
563 Ok(by_path.into_values().collect())
564 }
565
566 /// Resolve a store-relative path to its absolute on-disk path under
567 /// [`root`](Store::root).
568 pub fn abs_path(&self, store_relative: &Path) -> PathBuf {
569 // `Path::join` returns `store_relative` unchanged if it is already
570 // absolute, so passing an absolute path through is a no-op.
571 self.root.join(store_relative)
572 }
573
574 /// Convert an absolute path under the store into its store-relative form.
575 pub fn rel_path(&self, abs: &Path) -> Option<PathBuf> {
576 abs.strip_prefix(&self.root).ok().map(|p| p.to_path_buf())
577 }
578
579 // ── Private helpers ─────────────────────────────────────────────────────
580
581 /// Resolve a caller-supplied folder path (store-relative or absolute) to an
582 /// absolute path under the store root.
583 fn resolve_under_root(&self, folder: &Path) -> PathBuf {
584 if folder.is_absolute() {
585 folder.to_path_buf()
586 } else {
587 self.root.join(folder)
588 }
589 }
590
591 /// Walk a subtree for content `.md` files (skip hidden dirs, skip `index.md`
592 /// / `DB.md` / `log.md`), returning store-relative paths. Used by the layer
593 /// and type-folder walks.
594 fn walk_content_md(&self, root: &Path) -> Result<Vec<PathBuf>, StoreError> {
595 let mut out = Vec::new();
596 for entry in self.md_walker(root).build() {
597 let entry = entry.map_err(|e| StoreError::Search {
598 root: root.to_path_buf(),
599 message: e.to_string(),
600 })?;
601 if !is_file_entry(&entry) {
602 continue;
603 }
604 let path = entry.path();
605 if !has_md_extension(path) {
606 continue;
607 }
608 if is_non_content_basename(path) {
609 continue;
610 }
611 if let Some(rel) = self.rel_path(path) {
612 out.push(rel);
613 }
614 }
615 out.sort();
616 Ok(out)
617 }
618
619 /// Walk the whole store for **every** `.md` file (including `index.md`),
620 /// skipping hidden dirs and the `log/` archive tree. Used by the backlink
621 /// scan, where the literal link text can live in any markdown file.
622 fn walk_all_md(&self) -> Result<Vec<PathBuf>, StoreError> {
623 let mut out = Vec::new();
624 for entry in self.md_walker(&self.root).build() {
625 let entry = entry.map_err(|e| StoreError::Search {
626 root: self.root.clone(),
627 message: e.to_string(),
628 })?;
629 if !is_file_entry(&entry) {
630 continue;
631 }
632 let path = entry.path();
633 if !has_md_extension(path) {
634 continue;
635 }
636 if self.is_in_log_dir(path) {
637 continue;
638 }
639 if let Some(rel) = self.rel_path(path) {
640 out.push(rel);
641 }
642 }
643 out.sort();
644 Ok(out)
645 }
646
647 /// Read and merge every type-folder `index.jsonl` sidecar under `layer`
648 /// when given, else the whole store (skip hidden + `log/`). Each sidecar is
649 /// read with last-write-wins by path; across sidecars, paths are disjoint by
650 /// construction (one sidecar per folder), so a plain concatenation preserves
651 /// completeness. A layer scope confines the walk to `<root>/<layer>/`, which
652 /// is what keeps `find_by_where_in` O(entities-in-layer).
653 fn read_all_type_indexes_in(
654 &self,
655 layer: Option<Layer>,
656 ) -> Result<Vec<IndexRecord>, StoreError> {
657 let mut out = Vec::new();
658 for sidecar in self.find_type_index_files_in(layer)? {
659 out.extend(self.read_type_index(&self.abs_path(&sidecar))?);
660 }
661 Ok(out)
662 }
663
664 /// Locate every `index.jsonl` sidecar under `layer` (when given) else the
665 /// whole store (skip hidden + `log/`), returning store-relative paths. The
666 /// walk root is `<root>/<layer>/` for a scoped read and `self.root` for the
667 /// store-wide read; a non-existent layer subtree yields no sidecars rather
668 /// than walking a missing path.
669 fn find_type_index_files_in(&self, layer: Option<Layer>) -> Result<Vec<PathBuf>, StoreError> {
670 let walk_root = match layer {
671 Some(l) => self.root.join(l.dir_name()),
672 None => self.root.clone(),
673 };
674 // A scoped walk over a layer folder that does not exist yet must be an
675 // empty result, mirroring `walk_layer`'s missing-dir guard — not a walk
676 // error from `ignore` over a nonexistent path.
677 if !walk_root.is_dir() {
678 return Ok(Vec::new());
679 }
680 let mut out = Vec::new();
681 let mut builder = WalkBuilder::new(&walk_root);
682 builder.standard_filters(false).hidden(true);
683 for entry in builder.build() {
684 let entry = entry.map_err(|e| StoreError::Search {
685 root: walk_root.clone(),
686 message: e.to_string(),
687 })?;
688 if !is_file_entry(&entry) {
689 continue;
690 }
691 let path = entry.path();
692 if path.file_name().and_then(|n| n.to_str()) != Some(TYPE_INDEX_FILE) {
693 continue;
694 }
695 if self.is_in_log_dir(path) {
696 continue;
697 }
698 if let Some(rel) = self.rel_path(path) {
699 out.push(rel);
700 }
701 }
702 out.sort();
703 Ok(out)
704 }
705
706 /// A `WalkBuilder` configured for db.md SWEEPs: gitignore/global-ignore are
707 /// OFF (a SWEEP must see every file even if the store is a git repo with a
708 /// `.gitignore`), but hidden files/dirs are skipped.
709 fn md_walker(&self, root: &Path) -> WalkBuilder {
710 let mut builder = WalkBuilder::new(root);
711 builder.standard_filters(false).hidden(true);
712 builder
713 }
714
715 /// True if an absolute path lives under the store's root-level `log/`
716 /// rotation-archive directory.
717 fn is_in_log_dir(&self, abs: &Path) -> bool {
718 match self.rel_path(abs) {
719 Some(rel) => rel.components().next().map(|c| c.as_os_str()) == Some("log".as_ref()),
720 None => false,
721 }
722 }
723
724 /// Read a file's frontmatter `updated` field as an RFC3339 timestamp,
725 /// returning `None` when absent/unparseable. A self-contained reader (does
726 /// not depend on the not-yet-implemented `parser::read_file`); parses the
727 /// leading `---`-fenced YAML block with the same engine the parser uses.
728 fn read_updated(&self, abs: &Path) -> Option<DateTime<FixedOffset>> {
729 let text = std::fs::read_to_string(abs).ok()?;
730 let yaml = frontmatter_block(&text)?;
731 let value: serde_norway::Value = serde_norway::from_str(yaml).ok()?;
732 let raw = value.get("updated")?;
733 value_to_datetime(raw)
734 }
735
736 /// The `<YYYY>/<MM>` shard segment for a sharding type, from its primary
737 /// date field with a `created` fallback. Reads the public `Frontmatter`
738 /// fields directly. `None` when no usable date is present.
739 fn primary_shard_segment(&self, type_: &str, fm: &Frontmatter) -> Option<(String, String)> {
740 // Try the type's primary date field first.
741 if let Some(field) = primary_date_field(type_) {
742 if let Some(v) = fm.extra.get(field) {
743 if let Some(seg) = value_to_year_month(v) {
744 return Some(seg);
745 }
746 }
747 }
748 // Universal fallback: the typed `created` timestamp.
749 fm.created
750 .map(|dt| (format!("{:04}", dt.year()), format!("{:02}", dt.month())))
751 }
752}
753
754// ── Free helpers (no `self`) ────────────────────────────────────────────────
755
756/// True if a walk entry is a regular file (not a dir / symlink-to-dir).
757fn is_file_entry(entry: &ignore::DirEntry) -> bool {
758 entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
759}
760
/// Whether `path` carries the extension `md`, compared case-sensitively —
/// db.md files use a lowercase `.md`, so `.MD` does not qualify.
fn has_md_extension(path: &Path) -> bool {
    path.extension().map_or(false, |ext| ext == "md")
}
766
767/// True if the basename is a non-content meta file (`DB.md`, `index.md`,
768/// `log.md`) that the content walks must skip.
769fn is_non_content_basename(path: &Path) -> bool {
770 match path.file_name().and_then(|n| n.to_str()) {
771 Some(name) => NON_CONTENT_BASENAMES.contains(&name),
772 None => false,
773 }
774}
775
/// Return `name` as a `.md` file name: a name already ending in `.md` is
/// copied unchanged, anything else gets `.md` appended. The suffix check is
/// case-sensitive.
fn ensure_md_extension(name: &str) -> String {
    const SUFFIX: &str = ".md";
    let mut file_name = String::with_capacity(name.len() + SUFFIX.len());
    file_name.push_str(name);
    if !name.ends_with(SUFFIX) {
        file_name.push_str(SUFFIX);
    }
    file_name
}
784
/// Turn a store-relative path into the text that appears inside a wiki-link.
///
/// Only ordinary name components are kept, so a leading `./`, any `..`, and a
/// root or drive prefix all disappear. Components are joined with `/` on every
/// platform (never `\`), and a single trailing `.md` is removed. A component
/// that is not valid UTF-8 is omitted from the result.
fn path_to_link_str(target: &Path) -> String {
    let segments: Vec<&str> = target
        .components()
        .filter_map(|component| match component {
            std::path::Component::Normal(segment) => segment.to_str(),
            _ => None,
        })
        .collect();

    let mut link = segments.join("/");
    if link.ends_with(".md") {
        // Drop exactly one extension; `x.md.md` keeps its inner `.md`.
        link.truncate(link.len() - ".md".len());
    }
    link
}
802
/// Map a record `type` to the store-relative folder it is filed under by
/// default.
///
/// Recognized types come from the SPEC type table (`email → sources/emails`,
/// `expense → records/expenses`, …). Any other type is filed under
/// `records/<type>` using the type name verbatim — no pluralization is
/// attempted, which keeps the mapping reversible by [`infer_type_from_path`].
fn default_type_folder(type_: &str) -> PathBuf {
    const CANONICAL_FOLDERS: [(&str, &str); 10] = [
        // sources
        ("email", "sources/emails"),
        ("transcript", "sources/transcripts"),
        ("pdf-source", "sources/docs"),
        // records — entities
        ("contact", "records/contacts"),
        ("company", "records/companies"),
        // records — events
        ("expense", "records/expenses"),
        ("meeting", "records/meetings"),
        ("decision", "records/decisions"),
        ("invoice", "records/invoices"),
        // wiki — pages always live in a sub-folder (`wiki/<topic>/`), never
        // flat under `wiki/`. The index and validate type-folder model needs
        // the 3-component `<layer>/<type-folder>/<file>` shape; a 2-component
        // `wiki/<file>` path either breaks write-through (`on_write` would try
        // to create `index.md` *inside* a file) or is silently dropped from
        // every catalog by `rebuild_all`. With only the bare type available
        // here, `wiki/topics` is the deterministic default bucket.
        ("wiki-page", "wiki/topics"),
    ];

    CANONICAL_FOLDERS
        .iter()
        .find(|(known, _)| *known == type_)
        .map(|(_, folder)| PathBuf::from(*folder))
        // Unrecognized: the bare type name under `records/`.
        .unwrap_or_else(|| Path::new("records").join(type_))
}
836
837/// The canonical [`Layer`] a `type_` belongs to, derived from its default
838/// type-folder (`email` → `Sources`, `contact` → `Records`, `wiki-page` →
839/// `Wiki`, unrecognized → `Records`). The write path uses this to decide whether
840/// an agent-supplied folder is in the *right* layer for the type before honouring
841/// its sub-folder choice.
842pub fn layer_for_type(type_: &str) -> Layer {
843 layer_of_folder(&default_type_folder(type_)).unwrap_or(Layer::Records)
844}
845
846/// The [`Layer`] a type-folder path lives in, read from its first component
847/// (`sources/` → `Sources`, `records/` → `Records`, `wiki/` → `Wiki`). Used to
848/// bound [`Store::find_by_type`]'s canonical-folder-absent fallback to a single
849/// layer subtree. Returns `None` for a path with no recognized layer prefix;
850/// every value [`default_type_folder`] produces has one, so in practice this is
851/// always `Some` on the call path — `None` degrades to a store-wide read.
852fn layer_of_folder(folder: &Path) -> Option<Layer> {
853 let first = folder.components().next()?.as_os_str().to_str()?;
854 Layer::from_dir_name(first)
855}
856
/// Infer a content file's canonical `type` from its store-relative path — the
/// inverse of [`default_type_folder`] and the single source of truth for
/// path→type inference (the CLI's `fm init` calls this, never re-derives it).
///
/// # Shape requirements
///
/// The path must have the canonical `<layer>/<type-folder>/<file>` shape: a
/// known layer (`sources`, `records`, `wiki`), a type-folder, and at least one
/// further component. Leading `./` components are ignored, matching what
/// `path_to_link_str` tolerates. Everything else yields `None`:
///
/// * a file sitting directly under a layer (only two components);
/// * an unknown first component, including `..`, a root, or a drive prefix;
/// * a layer or type-folder component that is not valid UTF-8. Such a
///   component is rejected rather than skipped: skipping it would slide the
///   following components into the layer/folder slots and infer a type from
///   the wrong directory.
///
/// The file component itself only has to exist; its name is never inspected.
///
/// # Mapping
///
/// Recognized `(layer, folder)` pairs map back to their canonical type. For an
/// unrecognized folder the fallback is the **bare folder name verbatim** (no
/// pluralization/singularization) so it round-trips with `default_type_folder`,
/// whose unrecognized fallback is the bare type name (`task` ⇄ `records/task`).
/// Singularizing here would break that round-trip (`records/tasks` → `task`
/// while `default_type_folder("task")` → `records/task`). `wiki/<topic>` always
/// infers `wiki-page`, since every wiki page is filed under a topic folder.
pub fn infer_type_from_path(rel: &Path) -> Option<String> {
    // Work on the raw components so a position is never silently vacated; only
    // a no-op leading `./` is dropped.
    let mut comps = rel
        .components()
        .skip_while(|c| matches!(c, std::path::Component::CurDir));

    let layer = comps.next()?.as_os_str().to_str()?;
    if !matches!(layer, "sources" | "records" | "wiki") {
        return None;
    }
    let folder = comps.next()?.as_os_str().to_str()?;
    // The file itself must be a third component (a real type-folder, not the
    // file sitting directly under the layer).
    comps.next()?;

    let mapped = match (layer, folder) {
        ("sources", "emails") => "email",
        ("sources", "transcripts") => "transcript",
        ("sources", "docs") => "pdf-source",
        ("records", "contacts") => "contact",
        ("records", "companies") => "company",
        ("records", "expenses") => "expense",
        ("records", "meetings") => "meeting",
        ("records", "decisions") => "decision",
        ("records", "invoices") => "invoice",
        // Every wiki page is filed under `wiki/<topic>/`; the type is always
        // `wiki-page` regardless of the topic-folder name.
        ("wiki", _) => "wiki-page",
        // Unrecognized folder: the bare name, verbatim. This is the inverse of
        // `default_type_folder`'s unrecognized fallback (`other → records/other`)
        // and the round-trip would break if we pluralized/singularized here.
        (_, other) => other,
    };
    Some(mapped.to_string())
}
903
/// Name of the frontmatter field whose value picks the `<YYYY>/<MM>` shard for
/// a given type, or `None` when the type has no such field and sharding must
/// rely on the `created` fallback alone.
fn primary_date_field(type_: &str) -> Option<&'static str> {
    let field = match type_ {
        "email" | "expense" | "invoice" | "meeting" => "date",
        "transcript" => "recorded_at",
        "pdf-source" => "received_at",
        // Every other type — custom event types included — has no canonical
        // date field name and falls back to `created`.
        _ => return None,
    };
    Some(field)
}
917
918/// Parse a YAML value into an RFC3339 [`DateTime`], accepting both an explicit
919/// string and a YAML-native scalar rendered to string.
920fn value_to_datetime(value: &serde_norway::Value) -> Option<DateTime<FixedOffset>> {
921 let s = yaml_scalar_string(value)?;
922 DateTime::parse_from_rfc3339(s.trim()).ok()
923}
924
925/// Extract `(YYYY, MM)` from a YAML date/timestamp value. Lenient: matches a
926/// leading `YYYY-MM` so a bare `2026-05-22` date and a full
927/// `2026-05-22T10:00:00-07:00` timestamp both work.
928fn value_to_year_month(value: &serde_norway::Value) -> Option<(String, String)> {
929 let s = yaml_scalar_string(value)?;
930 year_month_from_str(s.trim())
931}
932
/// Split the leading `YYYY-MM` of a date string into `(year, month)` text.
///
/// The string must begin with four ASCII digits, a `-`, and two ASCII digits
/// forming a month in `01..=12`; whatever follows is ignored. Both parts are
/// returned exactly as written (zero padding preserved). Any other input gives
/// `None`.
fn year_month_from_str(s: &str) -> Option<(String, String)> {
    // Matched by hand on the raw bytes so the write path never has to compile
    // a regex.
    let (year, month) = match s.as_bytes() {
        [y0, y1, y2, y3, b'-', m_tens, m_ones, ..]
            if [y0, y1, y2, y3, m_tens, m_ones]
                .iter()
                .all(|b| b.is_ascii_digit()) =>
        {
            // The first seven bytes are ASCII, so these offsets are guaranteed
            // to be character boundaries.
            (&s[..4], &s[5..7])
        }
        _ => return None,
    };

    let month_number: u8 = month.parse().ok()?;
    if month_number == 0 || month_number > 12 {
        return None;
    }
    Some((year.to_owned(), month.to_owned()))
}
958
959/// Render a YAML scalar as a string: a real `String` verbatim, otherwise the
960/// value's compact YAML serialization (covers timestamps that the YAML engine
961/// may surface as a non-string scalar).
962fn yaml_scalar_string(value: &serde_norway::Value) -> Option<String> {
963 if let Some(s) = value.as_str() {
964 return Some(s.to_string());
965 }
966 match value {
967 serde_norway::Value::Null => None,
968 serde_norway::Value::Mapping(_) | serde_norway::Value::Sequence(_) => None,
969 other => serde_norway::to_string(other)
970 .ok()
971 .map(|s| s.trim().to_string()),
972 }
973}
974
/// Slice the YAML frontmatter out of a file's text.
///
/// The text must open with a `---` fence on its very first line; an optional
/// UTF-8 BOM before it and a trailing `\r` after it are tolerated. The returned
/// slice is everything between that line and the next line that is exactly
/// `---` (again allowing trailing `\r`), fences excluded and line terminators
/// of the interior lines included.
///
/// Returns `None` when the first line is not a fence or no closing fence is
/// ever found.
fn frontmatter_block(text: &str) -> Option<&str> {
    let is_fence = |line: &str| line.trim_end_matches('\r') == "---";

    let unmarked = text.strip_prefix('\u{feff}').unwrap_or(text);
    let (opening, interior) = split_first_line(unmarked);
    if !is_fence(opening) {
        return None;
    }

    // `offset` counts the bytes of `interior` that precede the line currently
    // under inspection.
    let mut offset = 0usize;
    let mut cursor = interior;
    loop {
        let (line, tail) = split_first_line(cursor);
        if is_fence(line) {
            return Some(&interior[..offset]);
        }
        if tail.is_empty() {
            // Nothing left to scan: the input ended without a closing fence.
            return None;
        }
        offset += line.len() + 1; // the line plus the '\n' that ended it
        cursor = tail;
    }
}

/// Cut `s` at its first `\n`.
///
/// Returns the text before the newline and the text after it; the newline
/// itself belongs to neither half. Without a newline the whole input is the
/// line and the remainder is empty.
fn split_first_line(s: &str) -> (&str, &str) {
    if let Some(newline) = s.find('\n') {
        let (line, rest) = s.split_at(newline);
        (line, &rest[1..])
    } else {
        (s, "")
    }
}
1016
1017/// True if an [`IndexRecord`] has a field `key` equal to `value`, checking the
1018/// typed columns first and then the flattened `fields` map.
1019fn record_matches_field(record: &IndexRecord, key: &str, value: &str) -> bool {
1020 match key {
1021 "type" => record.type_ == value,
1022 "summary" => record.summary == value,
1023 "path" => record.path.to_string_lossy() == value,
1024 "created" => timestamp_matches(record.created, value),
1025 "updated" => timestamp_matches(record.updated, value),
1026 "tags" => record.tags.iter().any(|t| t == value),
1027 "links" => record.links.iter().any(|l| l == value),
1028 other => record
1029 .fields
1030 .get(other)
1031 .map(|v| json_value_matches(v, value))
1032 .unwrap_or(false),
1033 }
1034}
1035
1036/// Compare a record's `created`/`updated` instant against a query `value`.
1037///
1038/// db.md files write timestamps in several equivalent RFC3339 spellings — most
1039/// commonly the `Z` UTC designator (`2026-05-01T00:00:00Z`) but also an explicit
1040/// offset (`...+00:00`, `...-07:00`). A naive `record.created.to_rfc3339() ==
1041/// value` reformats only one side: chrono renders a UTC instant as `+00:00`, so
1042/// the `Z` form an agent reads straight out of the file would never match. We
1043/// instead parse `value` as RFC3339 and compare instants, where `Z` and `+00:00`
1044/// (and any same-instant offset) are equal. A `value` that is not valid RFC3339
1045/// can never equal a real timestamp, so it falls through to `false`.
1046fn timestamp_matches(stored: Option<DateTime<FixedOffset>>, value: &str) -> bool {
1047 match (stored, DateTime::parse_from_rfc3339(value)) {
1048 (Some(stored), Ok(queried)) => stored == queried,
1049 _ => false,
1050 }
1051}
1052
1053/// Compare a JSON field value against a query string. A string matches
1054/// verbatim; scalars match their textual form; an array matches if any element
1055/// matches (so a list-valued frontmatter field is membership-queried).
1056fn json_value_matches(v: &serde_json::Value, value: &str) -> bool {
1057 match v {
1058 serde_json::Value::String(s) => s == value,
1059 serde_json::Value::Bool(b) => b.to_string() == value,
1060 serde_json::Value::Number(n) => n.to_string() == value,
1061 serde_json::Value::Array(items) => items.iter().any(|i| json_value_matches(i, value)),
1062 // A present-but-null field never matches — consistent with the in-memory
1063 // post-filter (`query::json_value_matches`, which the first `where`
1064 // clause is NOT re-checked against, so the two must agree here or a
1065 // `--where field=` query would return different rows than `--type X
1066 // --where field=`).
1067 serde_json::Value::Null => false,
1068 serde_json::Value::Object(_) => false,
1069 }
1070}
1071
1072#[cfg(test)]
1073mod tests {
1074 use super::*;
1075 use std::fs;
1076 use tempfile::{tempdir, TempDir};
1077
1078 // ── Fixtures ────────────────────────────────────────────────────────────
1079
1080 /// Write `contents` to `<root>/<rel>`, creating parent dirs. Returns the
1081 /// store-relative path for convenient assertions.
1082 fn write(root: &Path, rel: &str, contents: &str) -> PathBuf {
1083 let abs = root.join(rel);
1084 fs::create_dir_all(abs.parent().unwrap()).unwrap();
1085 fs::write(&abs, contents).unwrap();
1086 PathBuf::from(rel)
1087 }
1088
1089 /// A minimal content file with the given `updated` timestamp in frontmatter.
1090 fn content_md(updated: &str) -> String {
1091 format!(
1092 "---\ntype: note\ncreated: {updated}\nupdated: {updated}\nsummary: a note\n---\n\nbody\n"
1093 )
1094 }
1095
1096 /// A bare directory with a `DB.md` marker (valid `db-md` frontmatter so the
1097 /// real parser is exercised).
1098 fn empty_store() -> TempDir {
1099 let dir = tempdir().unwrap();
1100 fs::write(
1101 dir.path().join("DB.md"),
1102 "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n",
1103 )
1104 .unwrap();
1105 dir
1106 }
1107
1108 /// Open a store rooted at a TempDir; panics if `open` rejects it.
1109 fn open(dir: &TempDir) -> Store {
1110 Store::open(dir.path()).expect("fixture should be a valid store")
1111 }
1112
1113 fn rels(paths: &[PathBuf]) -> Vec<String> {
1114 paths
1115 .iter()
1116 .map(|p| p.to_string_lossy().replace('\\', "/"))
1117 .collect()
1118 }
1119
1120 // ── Layer ───────────────────────────────────────────────────────────────
1121
1122 #[test]
1123 fn layer_dir_name_and_parse_are_inverse() {
1124 for layer in Layer::all() {
1125 assert_eq!(Layer::from_dir_name(layer.dir_name()), Some(layer));
1126 }
1127 assert_eq!(Layer::Sources.dir_name(), "sources");
1128 assert_eq!(Layer::Records.dir_name(), "records");
1129 assert_eq!(Layer::Wiki.dir_name(), "wiki");
1130 assert_eq!(Layer::from_dir_name("log"), None);
1131 assert_eq!(Layer::from_dir_name("Sources"), None); // case-sensitive
1132 }
1133
1134 #[test]
1135 fn layer_order_is_canonical() {
1136 // stats keys a BTreeMap on Layer; the sort order must be sources<records<wiki.
1137 let mut v = [Layer::Wiki, Layer::Sources, Layer::Records];
1138 v.sort();
1139 assert_eq!(v, [Layer::Sources, Layer::Records, Layer::Wiki]);
1140 }
1141
1142 // ── is_db_md_store / open ────────────────────────────────────────────────
1143
1144 #[test]
1145 fn is_store_true_only_with_uppercase_marker() {
1146 let dir = tempdir().unwrap();
1147 assert!(
1148 !Store::is_db_md_store(dir.path()),
1149 "no marker → not a store"
1150 );
1151
1152 fs::write(dir.path().join("DB.md"), "---\ntype: db-md\n---\n").unwrap();
1153 assert!(Store::is_db_md_store(dir.path()), "uppercase DB.md → store");
1154 }
1155
1156 #[test]
1157 fn is_store_false_for_lowercase_db_md() {
1158 // The case-sensitivity contract: a lowercase db.md is the spec name, not
1159 // a marker — even on a case-insensitive filesystem where Path::exists
1160 // would lie. This test must pass on macOS (case-insensitive) too.
1161 let dir = tempdir().unwrap();
1162 fs::write(dir.path().join("db.md"), "---\ntype: db-md\n---\n").unwrap();
1163 assert!(
1164 !Store::is_db_md_store(dir.path()),
1165 "lowercase db.md must NOT be treated as a store marker"
1166 );
1167 assert!(Store::open(dir.path()).is_err());
1168 }
1169
1170 #[test]
1171 fn is_store_false_when_db_md_is_a_directory() {
1172 let dir = tempdir().unwrap();
1173 fs::create_dir(dir.path().join("DB.md")).unwrap();
1174 assert!(
1175 !Store::is_db_md_store(dir.path()),
1176 "a directory named DB.md is not the file marker"
1177 );
1178 }
1179
1180 #[test]
1181 fn open_rejects_non_store_with_path() {
1182 let dir = tempdir().unwrap();
1183 let err = Store::open(dir.path()).unwrap_err();
1184 assert_eq!(err.path, dir.path());
1185 }
1186
1187 #[test]
1188 fn open_succeeds_and_parses_config() {
1189 let dir = tempdir().unwrap();
1190 // A DB.md whose ## Policies declares a frozen page — proves open()
1191 // actually parsed the config rather than substituting a default.
1192 fs::write(
1193 dir.path().join("DB.md"),
1194 "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n\n\
1195 ## Policies\n\n### Frozen pages\n- records/decisions/q1.md\n",
1196 )
1197 .unwrap();
1198 let store = Store::open(dir.path()).unwrap();
1199 assert_eq!(store.root, dir.path());
1200 assert!(
1201 store
1202 .config
1203 .frozen_pages
1204 .iter()
1205 .any(|p| p == Path::new("records/decisions/q1.md")),
1206 "open() must surface DB.md ## Policies, got {:?}",
1207 store.config.frozen_pages
1208 );
1209 }
1210
1211 // ── walk / walk_layer / walk_type_folder ─────────────────────────────────
1212
1213 #[test]
1214 fn walk_collects_content_across_layers_skipping_meta_and_log() {
1215 let dir = empty_store();
1216 let root = dir.path();
1217 write(
1218 root,
1219 "sources/emails/2026/05/a.md",
1220 &content_md("2026-05-01T00:00:00Z"),
1221 );
1222 write(
1223 root,
1224 "records/contacts/sarah.md",
1225 &content_md("2026-05-02T00:00:00Z"),
1226 );
1227 write(
1228 root,
1229 "wiki/people/sarah.md",
1230 &content_md("2026-05-03T00:00:00Z"),
1231 );
1232 // Things walk() must SKIP:
1233 write(root, "sources/emails/index.md", "---\ntype: index\n---\n"); // catalog
1234 write(root, "index.md", "---\ntype: index\n---\n"); // root catalog
1235 write(root, "log.md", "---\ntype: log\n---\n"); // log
1236 write(root, "log/2026-04.md", "---\ntype: log\n---\n"); // rotated log archive
1237 write(
1238 root,
1239 "sources/.hidden/secret.md",
1240 &content_md("2026-05-09T00:00:00Z"),
1241 ); // hidden dir
1242 write(root, "records/contacts/notes.txt", "not markdown"); // non-md
1243
1244 let store = open(&dir);
1245 let got = rels(&store.walk().unwrap());
1246 assert_eq!(
1247 got,
1248 vec![
1249 "records/contacts/sarah.md".to_string(),
1250 "sources/emails/2026/05/a.md".to_string(),
1251 "wiki/people/sarah.md".to_string(),
1252 ]
1253 );
1254 }
1255
1256 #[test]
1257 fn walk_layer_is_scoped() {
1258 let dir = empty_store();
1259 let root = dir.path();
1260 write(
1261 root,
1262 "sources/emails/2026/05/a.md",
1263 &content_md("2026-05-01T00:00:00Z"),
1264 );
1265 write(
1266 root,
1267 "records/contacts/sarah.md",
1268 &content_md("2026-05-02T00:00:00Z"),
1269 );
1270 let store = open(&dir);
1271
1272 assert_eq!(
1273 rels(&store.walk_layer(Layer::Sources).unwrap()),
1274 vec!["sources/emails/2026/05/a.md".to_string()]
1275 );
1276 assert_eq!(
1277 rels(&store.walk_layer(Layer::Records).unwrap()),
1278 vec!["records/contacts/sarah.md".to_string()]
1279 );
1280 // A layer with no directory is empty, not an error.
1281 assert!(store.walk_layer(Layer::Wiki).unwrap().is_empty());
1282 }
1283
1284 #[test]
1285 fn walk_type_folder_recurses_shards_and_accepts_abs_or_rel() {
1286 let dir = empty_store();
1287 let root = dir.path();
1288 write(
1289 root,
1290 "sources/emails/2026/05/a.md",
1291 &content_md("2026-05-01T00:00:00Z"),
1292 );
1293 write(
1294 root,
1295 "sources/emails/2026/06/b.md",
1296 &content_md("2026-06-01T00:00:00Z"),
1297 );
1298 write(root, "sources/emails/index.md", "---\ntype: index\n---\n"); // skipped
1299 // A different type folder must not leak in.
1300 write(
1301 root,
1302 "sources/docs/2026/05/c.md",
1303 &content_md("2026-05-04T00:00:00Z"),
1304 );
1305 let store = open(&dir);
1306
1307 let expected = vec![
1308 "sources/emails/2026/05/a.md".to_string(),
1309 "sources/emails/2026/06/b.md".to_string(),
1310 ];
1311 // Relative folder arg.
1312 assert_eq!(
1313 rels(&store.walk_type_folder(Path::new("sources/emails")).unwrap()),
1314 expected
1315 );
1316 // Absolute folder arg under the store resolves identically.
1317 assert_eq!(
1318 rels(
1319 &store
1320 .walk_type_folder(&root.join("sources/emails"))
1321 .unwrap()
1322 ),
1323 expected
1324 );
1325 }
1326
1327 // ── recent_in_type_folder ────────────────────────────────────────────────
1328
1329 #[test]
1330 fn recent_orders_by_updated_desc_then_path_and_caps() {
1331 let dir = empty_store();
1332 let root = dir.path();
1333 // newest
1334 write(
1335 root,
1336 "records/meetings/2026/05/c.md",
1337 &content_md("2026-05-03T00:00:00Z"),
1338 );
1339 // tie on updated — path asc decides (a before b)
1340 write(
1341 root,
1342 "records/meetings/2026/05/a.md",
1343 &content_md("2026-05-02T00:00:00Z"),
1344 );
1345 write(
1346 root,
1347 "records/meetings/2026/05/b.md",
1348 &content_md("2026-05-02T00:00:00Z"),
1349 );
1350 // oldest
1351 write(
1352 root,
1353 "records/meetings/2026/04/z.md",
1354 &content_md("2026-04-01T00:00:00Z"),
1355 );
1356 let store = open(&dir);
1357
1358 let all = rels(
1359 &store
1360 .recent_in_type_folder(Path::new("records/meetings"), 10)
1361 .unwrap(),
1362 );
1363 assert_eq!(
1364 all,
1365 vec![
1366 "records/meetings/2026/05/c.md".to_string(), // newest
1367 "records/meetings/2026/05/a.md".to_string(), // tie, path asc
1368 "records/meetings/2026/05/b.md".to_string(),
1369 "records/meetings/2026/04/z.md".to_string(), // oldest
1370 ]
1371 );
1372
1373 // Cap takes the n most-recent.
1374 let top2 = rels(
1375 &store
1376 .recent_in_type_folder(Path::new("records/meetings"), 2)
1377 .unwrap(),
1378 );
1379 assert_eq!(
1380 top2,
1381 vec![
1382 "records/meetings/2026/05/c.md".to_string(),
1383 "records/meetings/2026/05/a.md".to_string(),
1384 ]
1385 );
1386 }
1387
1388 #[test]
1389 fn recent_sorts_undated_files_last() {
1390 let dir = empty_store();
1391 let root = dir.path();
1392 write(
1393 root,
1394 "records/contacts/dated.md",
1395 &content_md("2026-05-01T00:00:00Z"),
1396 );
1397 // No `updated` field at all.
1398 write(
1399 root,
1400 "records/contacts/undated.md",
1401 "---\ntype: contact\nsummary: x\n---\nbody\n",
1402 );
1403 let store = open(&dir);
1404 let got = rels(
1405 &store
1406 .recent_in_type_folder(Path::new("records/contacts"), 10)
1407 .unwrap(),
1408 );
1409 assert_eq!(
1410 got,
1411 vec![
1412 "records/contacts/dated.md".to_string(),
1413 "records/contacts/undated.md".to_string(),
1414 ],
1415 "a file with a real `updated` must outrank one with none"
1416 );
1417 }
1418
1419 // ── type_shards ──────────────────────────────────────────────────────────
1420
1421 #[test]
1422 fn type_shards_classification() {
1423 let dir = empty_store();
1424 let store = open(&dir);
1425 for t in [
1426 "email",
1427 "transcript",
1428 "pdf-source",
1429 "expense",
1430 "invoice",
1431 "meeting",
1432 "order",
1433 "ticket",
1434 "transaction",
1435 ] {
1436 assert!(store.type_shards(t), "{t} should shard");
1437 }
1438 for t in [
1439 "contact",
1440 "company",
1441 "decision",
1442 "wiki-page",
1443 "index",
1444 "log",
1445 "db-md",
1446 "proposal",
1447 ] {
1448 assert!(!store.type_shards(t), "{t} should stay flat");
1449 }
1450 }
1451
1452 // ── shard_path_for ───────────────────────────────────────────────────────
1453
1454 fn fm_with_extra(key: &str, value: &str) -> Frontmatter {
1455 let mut fm = Frontmatter::default();
1456 fm.extra.insert(
1457 key.to_string(),
1458 serde_norway::Value::String(value.to_string()),
1459 );
1460 fm
1461 }
1462
1463 fn fm_with_created(rfc3339: &str) -> Frontmatter {
1464 Frontmatter {
1465 created: Some(DateTime::parse_from_rfc3339(rfc3339).unwrap()),
1466 ..Default::default()
1467 }
1468 }
1469
1470 #[test]
1471 fn shard_path_uses_primary_date_field_per_type() {
1472 let dir = empty_store();
1473 let store = open(&dir);
1474
1475 // expense.date → records/expenses/<YYYY>/<MM>/
1476 let p = store
1477 .shard_path_for("expense", &fm_with_extra("date", "2026-05-22"), "lunch")
1478 .unwrap();
1479 assert_eq!(p, PathBuf::from("records/expenses/2026/05/lunch.md"));
1480
1481 // email.date → sources/emails/<YYYY>/<MM>/
1482 let p = store
1483 .shard_path_for(
1484 "email",
1485 &fm_with_extra("date", "2026-11-02T09:00:00-07:00"),
1486 "e1",
1487 )
1488 .unwrap();
1489 assert_eq!(p, PathBuf::from("sources/emails/2026/11/e1.md"));
1490
1491 // transcript.recorded_at → sources/transcripts/<YYYY>/<MM>/
1492 let p = store
1493 .shard_path_for(
1494 "transcript",
1495 &fm_with_extra("recorded_at", "2025-01-15T12:00:00Z"),
1496 "t1",
1497 )
1498 .unwrap();
1499 assert_eq!(p, PathBuf::from("sources/transcripts/2025/01/t1.md"));
1500 }
1501
1502 #[test]
1503 fn shard_path_falls_back_to_created() {
1504 let dir = empty_store();
1505 let store = open(&dir);
1506 // meeting with no `date` field but a `created` timestamp.
1507 let p = store
1508 .shard_path_for(
1509 "meeting",
1510 &fm_with_created("2024-07-09T08:30:00-04:00"),
1511 "sync",
1512 )
1513 .unwrap();
1514 assert_eq!(p, PathBuf::from("records/meetings/2024/07/sync.md"));
1515 }
1516
1517 #[test]
1518 fn shard_path_primary_field_wins_over_created() {
1519 let dir = empty_store();
1520 let store = open(&dir);
1521 let mut fm = fm_with_created("2020-01-01T00:00:00Z");
1522 fm.extra.insert(
1523 "date".into(),
1524 serde_norway::Value::String("2026-05-22".into()),
1525 );
1526 let p = store.shard_path_for("expense", &fm, "x").unwrap();
1527 // The primary `date` (2026/05), not `created` (2020/01), drives the shard.
1528 assert_eq!(p, PathBuf::from("records/expenses/2026/05/x.md"));
1529 }
1530
1531 #[test]
1532 fn shard_path_flat_types_have_no_shard_segment() {
1533 let dir = empty_store();
1534 let store = open(&dir);
1535 // A contact has a `created` date, but contacts stay flat.
1536 let p = store
1537 .shard_path_for(
1538 "contact",
1539 &fm_with_created("2026-05-22T00:00:00Z"),
1540 "sarah-chen",
1541 )
1542 .unwrap();
1543 assert_eq!(p, PathBuf::from("records/contacts/sarah-chen.md"));
1544
1545 // wiki-page is flat (no date shard) but still files under a type-folder:
1546 // `wiki/topics/<name>.md`, NEVER flat as `wiki/<name>.md`. A 2-component
1547 // path is invisible to the index/validate type-folder model.
1548 let p = store
1549 .shard_path_for("wiki-page", &Frontmatter::default(), "renewal-theme")
1550 .unwrap();
1551 assert_eq!(p, PathBuf::from("wiki/topics/renewal-theme.md"));
1552 }
1553
1554 /// Regression: a wiki-page written through the toolkit's own path
1555 /// computation must land at a path the index + validate type-folder model
1556 /// accepts. `shard_path_for("wiki-page", …)` previously returned a
1557 /// 2-component `wiki/<file>` path, which `type_folder_of` (in both `index`
1558 /// and `validate`) treats as "no type-folder" — so the page either crashed
1559 /// `Index::on_write` (it tried to create `index.md` inside a file) or was
1560 /// silently dropped from every catalog by `Index::rebuild_all`. The
1561 /// computed path must have 3 components: `<layer>/<type-folder>/<file>`.
1562 #[test]
1563 fn shard_path_wiki_page_is_indexable_three_component_path() {
1564 let dir = empty_store();
1565 let store = open(&dir);
1566 let p = store
1567 .shard_path_for("wiki-page", &Frontmatter::default(), "renewal-theme")
1568 .unwrap();
1569 // First two components are a layer + a non-empty type-folder segment;
1570 // the file is the third. This is exactly the shape `type_folder_of`
1571 // (`comps.len() >= 3`, `comps[0]` a known layer) requires.
1572 let comps: Vec<&str> = p.iter().filter_map(|c| c.to_str()).collect();
1573 assert_eq!(
1574 comps.len(),
1575 3,
1576 "wiki-page path must be <layer>/<type-folder>/<file>, got {p:?}"
1577 );
1578 assert_eq!(comps[0], "wiki", "first component must be the wiki layer");
1579 assert!(
1580 !comps[1].is_empty() && comps[1] != "renewal-theme.md",
1581 "second component must be a real type-folder, not the file: {p:?}"
1582 );
1583 assert!(
1584 comps[2].ends_with(".md"),
1585 "third component must be the .md file: {p:?}"
1586 );
1587 }
1588
1589 #[test]
1590 fn shard_path_preserves_and_adds_md_extension() {
1591 let dir = empty_store();
1592 let store = open(&dir);
1593 let with = store
1594 .shard_path_for("contact", &Frontmatter::default(), "sarah.md")
1595 .unwrap();
1596 let without = store
1597 .shard_path_for("contact", &Frontmatter::default(), "sarah")
1598 .unwrap();
1599 assert_eq!(with, PathBuf::from("records/contacts/sarah.md"));
1600 assert_eq!(without, PathBuf::from("records/contacts/sarah.md"));
1601 }
1602
1603 #[test]
1604 fn shard_path_errors_when_sharding_type_has_no_date() {
1605 let dir = empty_store();
1606 let store = open(&dir);
1607 // expense shards, but no `date` and no `created` → NoShardDate.
1608 let err = store
1609 .shard_path_for("expense", &Frontmatter::default(), "mystery")
1610 .unwrap_err();
1611 match err {
1612 StoreError::NoShardDate { file } => {
1613 assert_eq!(file, PathBuf::from("records/expenses/mystery.md"));
1614 }
1615 other => panic!("expected NoShardDate, got {other:?}"),
1616 }
1617 }
1618
1619 // ── find_links_to ────────────────────────────────────────────────────────
1620
1621 #[test]
1622 fn find_links_to_matches_all_accepted_spellings() {
1623 let dir = empty_store();
1624 let root = dir.path();
1625 let target = "records/contacts/sarah-chen";
1626
1627 // Plain link.
1628 write(
1629 root,
1630 "wiki/people/sarah.md",
1631 &format!("---\ntype: wiki-page\nsummary: s\n---\nSee [[{target}]].\n"),
1632 );
1633 // Link with display text.
1634 write(
1635 root,
1636 "records/meetings/2026/05/m.md",
1637 &format!("---\ntype: meeting\nsummary: s\n---\nWith [[{target}|Sarah]].\n"),
1638 );
1639 // Link with .md extension (accepted, warned by validate).
1640 write(
1641 root,
1642 "wiki/themes/t.md",
1643 &format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}.md]]\n"),
1644 );
1645 // A catalog/index file also contains the link literally — included.
1646 write(
1647 root,
1648 "records/contacts/index.md",
1649 &format!("---\ntype: index\n---\n- [[{target}]] — Sarah\n"),
1650 );
1651 // No link to the target.
1652 write(
1653 root,
1654 "wiki/people/elena.md",
1655 "---\ntype: wiki-page\nsummary: s\n---\nNo links here.\n",
1656 );
1657 // Short-form link must NOT match the full-path target.
1658 write(
1659 root,
1660 "wiki/people/bob.md",
1661 "---\ntype: wiki-page\nsummary: s\n---\n[[sarah-chen]]\n",
1662 );
1663 // A longer path that merely starts with the target must NOT match
1664 // (boundary correctness): target `sarah-chen` vs `sarah-chen-jr`.
1665 write(
1666 root,
1667 "wiki/people/jr.md",
1668 &format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}-jr]]\n"),
1669 );
1670
1671 let store = open(&dir);
1672 let got = rels(&store.find_links_to(Path::new(target)).unwrap());
1673 assert_eq!(
1674 got,
1675 vec![
1676 "records/contacts/index.md".to_string(),
1677 "records/meetings/2026/05/m.md".to_string(),
1678 "wiki/people/sarah.md".to_string(),
1679 "wiki/themes/t.md".to_string(),
1680 ]
1681 );
1682 }
1683
#[test]
fn find_links_to_distinguishes_sibling_paths() {
    // Each row is (linker file, the full-path target it links to). The two
    // targets share a prefix (`sarah` vs `sarah-chen`), so a lookup for one
    // must report only its own linker and never the sibling's.
    let fixtures = [
        ("wiki/a.md", "records/contacts/sarah"),
        ("wiki/b.md", "records/contacts/sarah-chen"),
    ];

    let dir = empty_store();
    for (linker, target) in fixtures {
        let body = format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}]]\n");
        write(dir.path(), linker, &body);
    }
    let store = open(&dir);

    for (linker, target) in fixtures {
        let found = store.find_links_to(Path::new(target)).unwrap();
        assert_eq!(rels(&found), vec![linker.to_string()]);
    }
}
1719
1720 // ── find_links_to_any (batch — the O(changed × store) fix) ─────────────────
1721
/// Pins the batch contract of `find_links_to_any`, which the working-set
/// validate uses to discover incoming linkers for the whole changed set in a
/// single pass. The result must be the deduplicated union of linkers over
/// every target, and each target keeps its own boundary correctness: a link
/// to a prefix-sharing sibling must not count as a link to the target. The
/// closing equivalence check compares the batch result with the union of
/// per-target `find_links_to` calls, which is the property that makes
/// folding N scans into one regex safe.
#[test]
fn find_links_to_any_returns_the_union_with_boundary_correctness() {
    let dir = empty_store();

    // (store-relative file, file contents)
    let fixtures = [
        // One dedicated linker per requested target.
        (
            "wiki/links-sarah.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
        (
            "wiki/links-acme.md",
            "---\ntype: wiki-page\nsummary: s\n---\nDeal with [[records/companies/acme|Acme]].\n",
        ),
        // Links to BOTH targets: must be reported exactly once, not once per
        // matched target.
        (
            "records/meetings/2026/05/m.md",
            concat!(
                "---\ntype: meeting\nsummary: s\n---\n",
                "[[records/contacts/sarah-chen]] re [[records/companies/acme]]\n",
            ),
        ),
        // Links to `sarah-chen-jr`, a prefix-sharing sibling of a target:
        // must NOT be reported as a linker of `sarah-chen`.
        (
            "wiki/links-jr.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen-jr]]\n",
        ),
        // Links to neither requested target.
        (
            "wiki/unrelated.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[wiki/themes/spend]]\n",
        ),
    ];
    for (rel, contents) in fixtures {
        write(dir.path(), rel, contents);
    }

    let store = open(&dir);
    let targets = vec![
        PathBuf::from("records/contacts/sarah-chen"),
        PathBuf::from("records/companies/acme"),
    ];

    let batch = store.find_links_to_any(&targets).unwrap();
    let got = rels(&batch);
    let expected = vec![
        "records/meetings/2026/05/m.md".to_string(),
        "wiki/links-acme.md".to_string(),
        "wiki/links-sarah.md".to_string(),
    ];
    assert_eq!(
        got, expected,
        "batch finder must return the deduped union of linkers across all \
         targets, excluding the prefix-sibling and the unrelated file"
    );

    // Equivalence: one batch scan must agree with the union of one
    // `find_links_to` scan per target. The set both dedups and orders.
    let per_target: std::collections::BTreeSet<PathBuf> = targets
        .iter()
        .flat_map(|t| store.find_links_to(t).unwrap())
        .collect();
    let per_target: Vec<PathBuf> = per_target.into_iter().collect();
    assert_eq!(
        rels(&per_target),
        got,
        "find_links_to_any must equal the union of per-target find_links_to"
    );
}
1804
/// With no usable targets, `find_links_to_any` must report no linkers. The
/// danger being guarded against is an empty alternation compiling into a
/// regex that matches everything, which would report every `.md` file as a
/// linker. This is the fast path the `validate` loop takes when nothing
/// changed.
#[test]
fn find_links_to_any_empty_targets_matches_nothing() {
    let dir = empty_store();
    // One real linker exists, so a match-everything regex would be caught.
    write(
        dir.path(),
        "wiki/a.md",
        "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
    );
    let store = open(&dir);

    let for_no_targets = store.find_links_to_any(&[]).unwrap();
    assert!(
        for_no_targets.is_empty(),
        "no targets ⇒ no linkers (an empty pattern must not match every file)"
    );

    // Targets that render to empty link text are likewise a no-op rather
    // than a match-everything.
    let blank_targets = [PathBuf::from(""), PathBuf::from("./")];
    let for_blank_targets = store.find_links_to_any(&blank_targets).unwrap();
    assert!(
        for_blank_targets.is_empty(),
        "targets that render to empty link text contribute no alternation arm"
    );
}
1834
1835 // ── read_type_index ──────────────────────────────────────────────────────
1836
#[test]
fn read_type_index_parses_records_and_flattens_fields() {
    // Two sidecar lines, one JSON object each. `a.md` carries every typed
    // column plus two extra keys (`vendor`, `amount`); `b.md` omits
    // `tags`/`links` and has null timestamps.
    let sidecar_text = concat!(
        r#"{"path":"records/expenses/2026/05/a.md","type":"expense","summary":"lunch","tags":["meals"],"links":["records/companies/acme"],"created":"2026-05-01T00:00:00Z","updated":"2026-05-01T00:00:00Z","vendor":"acme","amount":42}"#,
        "\n",
        r#"{"path":"records/expenses/2026/05/b.md","type":"expense","summary":"taxi","created":null,"updated":null,"vendor":"yellow"}"#,
        "\n",
    );

    let dir = empty_store();
    let sidecar = write(dir.path(), "records/expenses/index.jsonl", sidecar_text);
    let store = open(&dir);
    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();

    assert_eq!(recs.len(), 2);
    let (first, second) = (&recs[0], &recs[1]);

    // Records come back sorted by path ascending, so `a.md` is first.
    assert_eq!(first.path, PathBuf::from("records/expenses/2026/05/a.md"));
    assert_eq!(first.type_, "expense");
    assert_eq!(first.summary, "lunch");
    assert_eq!(first.tags, vec!["meals".to_string()]);
    assert_eq!(first.links, vec!["records/companies/acme".to_string()]);
    assert!(first.created.is_some());

    // Keys that are not typed columns are flattened into `fields`.
    let vendor = first.fields.get("vendor");
    assert_eq!(vendor, Some(&serde_json::json!("acme")));
    let amount = first.fields.get("amount");
    assert_eq!(amount, Some(&serde_json::json!(42)));

    // Absent `tags` / `links` default to empty lists.
    assert!(second.tags.is_empty());
    assert!(second.links.is_empty());
}
1867
#[test]
fn read_type_index_last_write_wins_and_skips_blanks() {
    // The same path appears on two lines separated by a blank line. The
    // later line must supersede the earlier one, and the blank line must be
    // skipped rather than reported as a parse error.
    let sidecar_text = concat!(
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"old","created":null,"updated":null}"#,
        "\n",
        "\n",
        r#"{"path":"records/contacts/sarah.md","type":"contact","summary":"new","created":null,"updated":null}"#,
        "\n",
    );

    let dir = empty_store();
    let sidecar = write(dir.path(), "records/contacts/index.jsonl", sidecar_text);
    let store = open(&dir);

    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();
    assert_eq!(recs.len(), 1, "duplicate path collapses to one record");
    assert_eq!(recs[0].summary, "new", "later line must win");
}
1885
#[test]
fn read_type_index_errors_on_malformed_line() {
    // A line that is not valid JSON must surface as `BadTypeIndex`.
    let dir = empty_store();
    let sidecar = write(
        dir.path(),
        "records/contacts/index.jsonl",
        "{not valid json}\n",
    );
    let store = open(&dir);

    let outcome = store.read_type_index(&store.abs_path(&sidecar));
    assert!(matches!(outcome, Err(StoreError::BadTypeIndex { .. })));
}
1895
1896 // ── find_by_type / find_by_where ─────────────────────────────────────────
1897
/// Builds one newline-terminated `index.jsonl` line for the tests below.
///
/// `path`, `type_`, and `summary` are spliced in verbatim as JSON string
/// values (no escaping is applied); `created` and `updated` are always
/// `null`. `extra` is inserted raw just before the closing brace, so a
/// non-empty `extra` must start with a comma, e.g. `,"vendor":"acme"`.
fn jsonl_line(path: &str, type_: &str, summary: &str, extra: &str) -> String {
    [
        "{\"path\":\"",
        path,
        "\",\"type\":\"",
        type_,
        "\",\"summary\":\"",
        summary,
        "\",\"created\":null,\"updated\":null",
        extra,
        "}\n",
    ]
    .concat()
}
1903
#[test]
fn find_by_type_reads_canonical_folder_sidecar() {
    let dir = empty_store();
    let root = dir.path();

    // `contact` lives in its canonical folder, records/contacts. The lines
    // are written out of path order on purpose.
    let contacts_sidecar = [
        jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
        jsonl_line("records/contacts/elena.md", "contact", "Elena", ""),
    ]
    .concat();
    write(root, "records/contacts/index.jsonl", &contacts_sidecar);

    // A sidecar for another type; it must not leak into a contact query.
    let companies_sidecar = jsonl_line("records/companies/acme.md", "company", "Acme", "");
    write(root, "records/companies/index.jsonl", &companies_sidecar);

    let store = open(&dir);
    let recs = store.find_by_type("contact").unwrap();

    // Results are path-sorted: elena.md before sarah.md.
    let summaries: Vec<String> = recs.iter().map(|rec| rec.summary.clone()).collect();
    assert_eq!(summaries, vec!["Elena".to_string(), "Sarah".to_string()]);
    assert!(recs.iter().all(|rec| rec.type_ == "contact"));
}
1927
#[test]
fn find_by_type_canonical_absent_falls_back_within_the_layer_only() {
    let dir = empty_store();
    let root = dir.path();

    // A custom `proposal` record filed under the natural plural folder
    // `records/proposals/`. `default_type_folder("proposal")` is the bare
    // `records/proposal` (no pluralization guess), so no canonical sidecar
    // exists and `find_by_type` has to fall back. Because the fallback
    // covers the type's whole layer (records), this same-layer record in a
    // non-canonical folder must still be found.
    let in_layer = jsonl_line("records/proposals/p1.md", "proposal", "Q3 proposal", "");
    write(root, "records/proposals/index.jsonl", &in_layer);

    // A decoy of the SAME type in a DIFFERENT layer (sources/). A
    // whole-store fallback would leak it into the result; the layer-bounded
    // fallback must not.
    let decoy = jsonl_line(
        "sources/proposals/leak.md",
        "proposal",
        "cross-layer decoy",
        "",
    );
    write(root, "sources/proposals/index.jsonl", &decoy);

    let store = open(&dir);
    let recs = store.find_by_type("proposal").unwrap();

    assert_eq!(
        recs.len(),
        1,
        "only the records-layer proposal, not the sources decoy"
    );
    let found = &recs[0];
    assert_eq!(found.summary, "Q3 proposal");
    assert_eq!(found.path, PathBuf::from("records/proposals/p1.md"));
}
1968
#[test]
fn find_by_type_canonical_absent_does_not_read_other_layers() {
    let dir = empty_store();
    let root = dir.path();
    // `email`'s canonical folder is `sources/emails` (layer Sources). No
    // sidecar exists there, so `find_by_type("email")` falls back, and the
    // fallback must stay inside the Sources layer. Nothing under `records/`
    // may be read.
    write(
        root,
        "records/contacts/index.jsonl",
        &jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
    );
    // An empty result alone cannot prove the bound: a whole-store fallback
    // would read the valid contacts sidecar, filter it out by type, and
    // still return nothing. A deliberately corrupt records sidecar turns any
    // out-of-layer read into a parse error, so a successful lookup shows the
    // records layer was never opened.
    write(
        root,
        "records/companies/index.jsonl",
        "{ this is not valid json and would error if read }\n",
    );
    let store = open(&dir);

    let emails = store
        .find_by_type("email")
        .expect("an email lookup must not touch the records-layer sidecars");
    // No email anywhere ⇒ empty.
    assert!(emails.is_empty());
}
1989
#[test]
fn find_by_where_matches_typed_columns_and_flat_fields() {
    let dir = empty_store();
    let root = dir.path();

    // Two expenses: both carry a `vendor`, only the first carries `tags`.
    let expenses = [
        jsonl_line(
            "records/expenses/a.md",
            "expense",
            "lunch",
            r#","vendor":"acme","tags":["meals"]"#,
        ),
        jsonl_line(
            "records/expenses/b.md",
            "expense",
            "taxi",
            r#","vendor":"yellow""#,
        ),
    ]
    .concat();
    write(root, "records/expenses/index.jsonl", &expenses);

    // One contact tagged `customer`.
    let contacts = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        r#","tags":["customer"]"#,
    );
    write(root, "records/contacts/index.jsonl", &contacts);

    let store = open(&dir);

    // A flat (non-typed) key is matched through `fields`.
    let acme_hits = store.find_by_where("vendor", "acme").unwrap();
    assert_eq!(acme_hits.len(), 1);
    assert_eq!(acme_hits[0].path, PathBuf::from("records/expenses/a.md"));

    // The typed `type` column matches both expense records.
    let expense_hits = store.find_by_where("type", "expense").unwrap();
    assert_eq!(expense_hits.len(), 2);

    // The typed list column `tags` matches by membership.
    let customer_hits = store.find_by_where("tags", "customer").unwrap();
    assert_eq!(customer_hits.len(), 1);
    assert_eq!(
        customer_hits[0].path,
        PathBuf::from("records/contacts/sarah.md")
    );

    // A value nobody has yields an empty result.
    let nobody_hits = store.find_by_where("vendor", "nobody").unwrap();
    assert!(nobody_hits.is_empty());
}
2040
#[test]
fn find_by_where_matches_timestamps_across_rfc3339_spellings() {
    let dir = empty_store();
    // One meeting: `created` uses the `Z` UTC spelling that db.md files most
    // commonly carry, and `updated` uses an explicit `-07:00` offset.
    write(
        dir.path(),
        "records/meetings/index.jsonl",
        concat!(
            r#"{"path":"records/meetings/kickoff.md","type":"meeting","#,
            r#""summary":"kickoff","created":"2026-05-01T00:00:00Z","#,
            r#""updated":"2026-05-02T09:30:00-07:00"}"#,
            "\n",
        ),
    );
    let store = open(&dir);

    // The exact `Z` value an agent would read out of the file must match.
    let exact = store
        .find_by_where("created", "2026-05-01T00:00:00Z")
        .unwrap();
    assert_eq!(exact.len(), 1);
    assert_eq!(exact[0].path, PathBuf::from("records/meetings/kickoff.md"));

    // Timestamps compare as instants, not as strings, so every spelling of
    // a stored instant matches the one record.
    let same_instant = [
        // `created`, written with an explicit zero offset instead of `Z`.
        ("created", "2026-05-01T00:00:00+00:00"),
        // `updated`, in its own stored offset spelling…
        ("updated", "2026-05-02T09:30:00-07:00"),
        // …and as the same instant expressed in UTC.
        ("updated", "2026-05-02T16:30:00Z"),
    ];
    for (field, value) in same_instant {
        assert_eq!(store.find_by_where(field, value).unwrap().len(), 1);
    }

    let no_match = [
        // A different instant (one second later).
        ("created", "2026-05-01T00:00:01Z"),
        // A non-RFC3339 value never matches a real timestamp.
        ("created", "2026-05-01"),
    ];
    for (field, value) in no_match {
        assert!(store.find_by_where(field, value).unwrap().is_empty());
    }
}
2100
#[test]
fn find_by_where_in_layer_reads_only_that_layers_sidecars() {
    // A layer-scoped `where` read must walk ONLY the named layer's subtree.
    // The proof is structural: the sources layer holds a *malformed*
    // sidecar, which makes `read_type_index` fail whenever it is read. A
    // records-scoped read that succeeds therefore never touched it.
    let dir = empty_store();
    let root = dir.path();

    let healthy = jsonl_line(
        "records/companies/acme.md",
        "company",
        "Acme",
        r#","domain":"acme.com""#,
    );
    write(root, "records/companies/index.jsonl", &healthy);
    // The corrupt sidecar lives in the other layer.
    write(
        root,
        "sources/emails/index.jsonl",
        "{ this is not valid json and would error if read }\n",
    );
    let store = open(&dir);

    // Records scope: the corrupt sidecar is out of scope, so the read
    // succeeds and yields only the records-layer match.
    let in_records = store
        .find_by_where_in("domain", "acme.com", Some(Layer::Records))
        .expect("a records-scoped read must not touch the sources sidecar");
    let record_paths: Vec<_> = in_records.iter().map(|rec| rec.path.clone()).collect();
    assert_eq!(
        rels(&record_paths),
        vec!["records/companies/acme.md".to_string()]
    );

    // Unscoped: the same query does reach the corrupt sidecar and reports
    // it, which confirms the file really is in the tree and that only the
    // layer scope spared it above.
    let store_wide = store.find_by_where("domain", "acme.com");
    assert!(
        matches!(store_wide, Err(StoreError::BadTypeIndex { .. })),
        "unscoped read walks every layer and hits the corrupt sidecar"
    );

    // Sources scope: the scope now includes the corrupt sidecar, so the
    // read fails. The scope is a real subtree bound, not a silent "skip
    // whatever fails to parse".
    let in_sources = store.find_by_where_in("domain", "acme.com", Some(Layer::Sources));
    assert!(matches!(in_sources, Err(StoreError::BadTypeIndex { .. })));
}
2158
#[test]
fn find_by_where_in_missing_layer_is_empty_not_an_error() {
    // Scoping a read to a layer whose folder was never created must yield
    // an empty result rather than a walk error over a nonexistent path.
    let dir = empty_store();
    let denver_contact = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        r#","city":"denver""#,
    );
    write(dir.path(), "records/contacts/index.jsonl", &denver_contact);
    let store = open(&dir);

    // No `wiki/` directory exists in this store.
    let in_wiki = store
        .find_by_where_in("city", "denver", Some(Layer::Wiki))
        .expect("missing layer subtree is empty, not an error");
    assert!(in_wiki.is_empty());

    // The same query scoped to the populated layer still finds the record.
    let in_records = store
        .find_by_where_in("city", "denver", Some(Layer::Records))
        .unwrap();
    assert_eq!(in_records.len(), 1);
}
2190
2191 // ── abs_path / rel_path ──────────────────────────────────────────────────
2192
#[test]
fn abs_and_rel_path_roundtrip() {
    let store_dir = empty_store();
    let store = open(&store_dir);
    let relative = Path::new("records/contacts/sarah.md");

    // Relative → absolute joins onto the store root…
    let absolute = store.abs_path(relative);
    assert_eq!(absolute, store_dir.path().join(relative));

    // …and absolute → relative recovers the original path.
    let recovered = store.rel_path(&absolute);
    assert_eq!(recovered.as_deref(), Some(relative));

    // `abs_path` returns an already-absolute path unchanged.
    assert_eq!(store.abs_path(&absolute), absolute);

    // A path outside the store has no store-relative form.
    let outside = Path::new("/somewhere/else.md");
    assert_eq!(store.rel_path(outside), None);
}
2208
2209 // ── infer_type_from_path (inverse of default_type_folder) ────────────────
2210
#[test]
fn infer_type_maps_every_recognized_folder_back_to_its_type() {
    // (store-relative file path, type it must infer)
    let cases = [
        // Sources layer.
        ("sources/emails/x.md", "email"),
        ("sources/transcripts/x.md", "transcript"),
        ("sources/docs/x.md", "pdf-source"),
        // Records layer.
        ("records/contacts/x.md", "contact"),
        ("records/companies/x.md", "company"),
        ("records/expenses/x.md", "expense"),
        ("records/meetings/x.md", "meeting"),
        ("records/decisions/x.md", "decision"),
        ("records/invoices/x.md", "invoice"),
        // Wiki layer: every sub-folder is `wiki-page`, whatever its name.
        ("wiki/topics/x.md", "wiki-page"),
        ("wiki/pricing/x.md", "wiki-page"),
    ];

    for (path, expected) in cases {
        let inferred = infer_type_from_path(Path::new(path));
        assert_eq!(
            inferred.as_deref(),
            Some(expected),
            "path {path} should infer type {expected}"
        );
    }
}
2235
#[test]
fn infer_type_round_trips_with_default_type_folder() {
    // Inference must be the inverse of the forward map: sending a
    // recognized type through `default_type_folder` and the resulting file
    // path back through `infer_type_from_path` has to return the same type.
    // `wiki-page` is the many-to-one case (any topic folder infers it), and
    // its forward folder must round-trip like the rest.
    for type_ in [
        "email",
        "transcript",
        "pdf-source",
        "contact",
        "company",
        "expense",
        "meeting",
        "decision",
        "invoice",
        "wiki-page",
    ] {
        let folder = default_type_folder(type_);
        let inferred = infer_type_from_path(&folder.join("x.md"));
        assert_eq!(
            inferred.as_deref(),
            Some(type_),
            "recognized type {type_} (folder {folder:?}) must round-trip"
        );
    }
}
2265
#[test]
fn infer_type_round_trips_custom_types_verbatim_no_singularization() {
    // Regression guard for the CLI/core divergence. For an unrecognized
    // type, `default_type_folder` falls back to the BARE type name
    // (`task → records/task`, `tasks → records/tasks`). Inference must
    // therefore return the folder name untouched. If it singularized,
    // `records/tasks` would infer `task`, colliding with the folder that
    // `default_type_folder("task")` produces.
    let customs = ["task", "tasks", "playbook", "process", "okrs", "ticket"];
    for custom in customs {
        let folder = default_type_folder(custom);
        assert_eq!(folder, Path::new("records").join(custom));

        let inferred = infer_type_from_path(&folder.join("x.md"));
        assert_eq!(
            inferred.as_deref(),
            Some(custom),
            "custom type {custom} must round-trip verbatim (no singularization)"
        );
    }

    // The concrete case from the finding: the plural folder keeps its
    // trailing `s`.
    let plural = infer_type_from_path(Path::new("records/tasks/x.md"));
    assert_eq!(
        plural.as_deref(),
        Some("tasks"),
        "records/tasks must infer `tasks`, not `task`"
    );
}
2292
#[test]
fn infer_type_requires_three_component_layer_folder_file_shape() {
    let uninferable = [
        // Fewer than three components: a file sitting directly under a
        // layer (or at the root) has no type-folder segment.
        "records/x.md",
        "sources/x.md",
        "wiki/x.md",
        "x.md",
        // Three components, but the leading folder is not a known layer.
        "foo/bar/x.md",
    ];
    for path in uninferable {
        assert_eq!(infer_type_from_path(Path::new(path)), None);
    }

    // Deeper paths infer from the first type-folder segment, e.g. a
    // date-sharded record under records/expenses/2026/05/.
    let sharded = infer_type_from_path(Path::new("records/expenses/2026/05/x.md"));
    assert_eq!(sharded.as_deref(), Some("expense"));
}
2309 }
2310}