dbmd_core/store.rs
1//! `store` — walk, locate, and shard a db.md store.
2//!
3//! A db.md store is one directory marked by an uppercase `DB.md` at its root.
4//! [`Store::open`] is the single gate every store-walking subcommand goes
5//! through; a missing `DB.md` is the [`NotAStore`] error (`NOT_A_STORE`). The
6//! toolkit never guesses a store root.
7//!
8//! Scale discipline lives here: [`Store::walk`] and the layer/type-folder
9//! walks are **SWEEP** primitives used only by `validate --all`,
10//! `index rebuild`, and `stats`. The interactive loop instead uses
11//! [`Store::find_links_to`] / [`Store::find_links_to_any`] (embedded ripgrep,
12//! presence-only) and the `index.jsonl` sidecar readers
13//! ([`Store::find_by_type`] / [`Store::find_by_where`] /
14//! [`Store::read_type_index`]) — never a whole-store parse. The batch
15//! [`Store::find_links_to_any`] is what keeps the working-set validate's
16//! incoming-linker discovery a single store scan rather than one scan per
17//! changed object.
18
19use std::collections::BTreeMap;
20use std::path::{Path, PathBuf};
21
22use chrono::{DateTime, Datelike, FixedOffset};
23use grep::regex::RegexMatcher;
24use grep::searcher::sinks::UTF8;
25use grep::searcher::Searcher;
26use ignore::WalkBuilder;
27
28use crate::index::IndexRecord;
29use crate::parser::{parse_db_md, Config, Frontmatter};
30
/// Basenames that are never content files: the `DB.md` config marker and the
/// two curator-maintained catalogs (`index.md`, `log.md`). Consulted by
/// `is_non_content_basename`, so the content walks never return one of these
/// as a record.
const NON_CONTENT_BASENAMES: [&str; 3] = ["DB.md", "index.md", "log.md"];

/// File name of the per-type-folder machine-twin sidecar that backs every
/// structured read (`find_by_type`, `find_by_where`, `read_type_index`).
const TYPE_INDEX_FILE: &str = "index.jsonl";
38
/// Error returned by [`Store::open`] when the inspected path has no `DB.md`
/// marker at its root. Surfaced as the structured code `NOT_A_STORE` with a
/// non-zero exit.
#[derive(Debug, thiserror::Error)]
#[error("not a db.md store: {path} has no DB.md")]
pub struct NotAStore {
    /// The path that was inspected and rejected.
    pub path: PathBuf,
}
47
/// Errors from store-level operations (walk, locate, shard, sidecar read).
#[derive(Debug, thiserror::Error)]
pub enum StoreError {
    /// A sidecar `index.jsonl` could not be read, or one of its non-blank
    /// lines failed to parse as an `IndexRecord`.
    #[error("failed to read type index {path}: {message}")]
    BadTypeIndex {
        /// The sidecar file that failed.
        path: PathBuf,
        /// Human-readable cause (I/O error text, or `line N: <parse error>`).
        message: String,
    },

    /// A sharding type needed a `<YYYY>/<MM>` segment, but neither its primary
    /// date field nor the `created` fallback yielded a usable date.
    #[error("cannot compute shard path for {file}: no usable date field")]
    NoShardDate {
        /// The (unsharded) path of the file being placed.
        file: PathBuf,
    },

    /// A directory walk or an embedded-ripgrep scan failed to start or run.
    #[error("search failed under {root}: {message}")]
    Search {
        /// The root the walk/scan ran under.
        root: PathBuf,
        /// Human-readable cause.
        message: String,
    },

    /// An underlying I/O failure, passed through unchanged.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
81
/// One of the three canonical layers of a db.md store.
///
/// `Ord`/`PartialOrd` are derived so that sibling modules can key ordered
/// collections (e.g. a `BTreeMap`) on `Layer`; the sort order is the
/// declaration order: `Sources` < `Records` < `Wiki`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Layer {
    /// `sources/` — raw evidence; immutable; date-sharded at scale.
    Sources,
    /// `records/` — atomic typed data; entity types flat, event types sharded.
    Records,
    /// `wiki/` — curator-synthesized narrative; flat.
    Wiki,
}

impl Layer {
    /// Folder name of this layer on disk: `"sources"`, `"records"`, or
    /// `"wiki"`.
    pub fn dir_name(self) -> &'static str {
        match self {
            Self::Sources => "sources",
            Self::Records => "records",
            Self::Wiki => "wiki",
        }
    }

    /// Inverse of [`Layer::dir_name`]: the layer whose folder name is exactly
    /// `name` (case-sensitive), or `None` when no layer matches.
    pub fn from_dir_name(name: &str) -> Option<Self> {
        Self::all()
            .iter()
            .copied()
            .find(|layer| layer.dir_name() == name)
    }

    /// All layers, in canonical (declaration) order.
    pub fn all() -> [Layer; 3] {
        [Self::Sources, Self::Records, Self::Wiki]
    }
}
123
/// An opened db.md store: its root path plus the parsed `DB.md` [`Config`].
///
/// Construct via [`Store::open`], which checks the `DB.md` marker before
/// building the value, so downstream code can assume a real store.
#[derive(Debug, Clone)]
pub struct Store {
    /// The store root (the directory containing `DB.md`), exactly as passed to
    /// [`Store::open`].
    pub root: PathBuf,
    /// The parsed `DB.md` config; `Config::default()` when `DB.md` could not
    /// be read or parsed.
    pub config: Config,
}
135
136impl Store {
/// True if `path` is a db.md store root: an entry named exactly `DB.md`
/// (uppercase) exists directly inside `path` and is not a directory.
///
/// The directory is listed and the *stored* filename is compared
/// byte-for-byte. A `path.join("DB.md").exists()` probe would lie on a
/// case-insensitive filesystem (macOS default), where a lowercase `db.md`
/// answers a `DB.md` probe; `read_dir` yields the real on-disk name, so the
/// exact match is correct on both kinds of filesystem.
///
/// Returns `false` when `path` cannot be listed, when no `DB.md` entry
/// exists, when the entry is a directory, or when it is a symlink that is
/// dangling or resolves to a directory.
pub fn is_db_md_store(path: &Path) -> bool {
    let entries = match std::fs::read_dir(path) {
        Ok(entries) => entries,
        Err(_) => return false,
    };
    for entry in entries.flatten() {
        if entry.file_name() != "DB.md" {
            continue;
        }
        return match entry.file_type() {
            // `DirEntry::file_type` does not follow symlinks, so a symlink
            // named `DB.md` that points at a directory (or at nothing) would
            // otherwise be mistaken for the marker. Resolve it explicitly.
            Ok(ft) if ft.is_symlink() => std::fs::metadata(entry.path())
                .map(|meta| !meta.is_dir())
                .unwrap_or(false),
            // A directory literally named `DB.md` is not the marker.
            Ok(ft) => !ft.is_dir(),
            Err(_) => false,
        };
    }
    false
}
163
164 /// Open `path` as a db.md store: confirm the `DB.md` marker (else
165 /// [`NotAStore`]) and parse the `DB.md` config. Every store-walking
166 /// subcommand opens through here.
167 pub fn open(path: &Path) -> Result<Store, NotAStore> {
168 if !Store::is_db_md_store(path) {
169 return Err(NotAStore {
170 path: path.to_path_buf(),
171 });
172 }
173 let db_md = path.join("DB.md");
174 // The marker exists; parse its config. A read or parse failure leaves
175 // the store openable with default config rather than masquerading as
176 // NOT_A_STORE — the marker is present, so this *is* a store; a damaged
177 // DB.md is `dbmd validate`'s job to report, not `open`'s.
178 let config = match std::fs::read_to_string(&db_md) {
179 Ok(text) => parse_db_md(&text, &db_md).unwrap_or_default(),
180 Err(_) => Config::default(),
181 };
182 Ok(Store {
183 root: path.to_path_buf(),
184 config,
185 })
186 }
187
188 /// **SWEEP.** Recursively iterate every `.md` content file across
189 /// `sources/`, `records/`, and `wiki/`, skipping hidden dirs and `log/`.
190 /// Used only by `validate --all`, `index rebuild`, and `stats` — never on
191 /// the interactive loop.
192 pub fn walk(&self) -> Result<Vec<PathBuf>, StoreError> {
193 // Only the three content layers — never root meta files (`DB.md`,
194 // `index.md`, `log.md`) and never `log/`, which live at root and are
195 // outside every layer dir.
196 let mut out = Vec::new();
197 for layer in Layer::all() {
198 out.extend(self.walk_layer(layer)?);
199 }
200 out.sort();
201 Ok(out)
202 }
203
204 /// **SWEEP.** Like [`Store::walk`] but scoped to a single layer.
205 pub fn walk_layer(&self, layer: Layer) -> Result<Vec<PathBuf>, StoreError> {
206 let layer_root = self.root.join(layer.dir_name());
207 if !layer_root.is_dir() {
208 return Ok(Vec::new());
209 }
210 self.walk_content_md(&layer_root)
211 }
212
213 /// Enumerate every `.md` file in a single type-folder, **recursing through
214 /// its date-shards** (`sources/emails/**/*.md`). The unit the index builder
215 /// and per-folder rebuild operate on. SWEEP-class (scoped to one folder).
216 pub fn walk_type_folder(&self, type_folder: &Path) -> Result<Vec<PathBuf>, StoreError> {
217 let abs = self.resolve_under_root(type_folder);
218 if !abs.is_dir() {
219 return Ok(Vec::new());
220 }
221 self.walk_content_md(&abs)
222 }
223
224 /// The ≤`n` most-recent files in a type-folder by frontmatter `updated`
225 /// (descending), ties broken by store-relative path (ascending) — a total
226 /// order, so write-through and rebuild never disagree on #500 vs #501.
227 ///
228 /// Reads `updated` across the folder's shards — a SWEEP cost absorbed into
229 /// `index rebuild`. The write-through path never calls this. The
230 /// cap-selection primitive for the 500-entry `index.md` browse view.
231 pub fn recent_in_type_folder(
232 &self,
233 type_folder: &Path,
234 n: usize,
235 ) -> Result<Vec<PathBuf>, StoreError> {
236 let files = self.walk_type_folder(type_folder)?;
237 // (updated, rel-path) for each file. Files missing/unparseable
238 // `updated` sort *after* dated ones (None last), then by path — so they
239 // are deterministically the lowest-priority candidates for the cap, not
240 // dropped silently. The total order (updated desc, path asc) is what
241 // keeps write-through and rebuild agreeing on #500 vs #501.
242 let mut keyed: Vec<(Option<DateTime<FixedOffset>>, PathBuf)> = files
243 .into_iter()
244 .map(|rel| {
245 let updated = self.read_updated(&self.abs_path(&rel));
246 (updated, rel)
247 })
248 .collect();
249 keyed.sort_by(|a, b| {
250 // `updated` descending: newest first. `None` is treated as the
251 // oldest possible, so dated files always win a cap slot over
252 // undated ones.
253 let by_updated = b.0.cmp(&a.0);
254 by_updated.then_with(|| a.1.cmp(&b.1))
255 });
256 keyed.truncate(n);
257 Ok(keyed.into_iter().map(|(_, rel)| rel).collect())
258 }
259
260 /// The shard/flat predicate: true if the type date-shards, false if it
261 /// stays flat. True for source types and event record types
262 /// (`expense`/`invoice`/`meeting` + custom `order`/`ticket`/`transaction`),
263 /// or when `DB.md ## Schemas` declares `shard: by-date`. False for
264 /// dedup-bounded entity types (`contact`/`company`/`decision`) and `wiki/`.
265 pub fn type_shards(&self, type_: &str) -> bool {
266 // Built-in classification. Sharding is a property of the *type*:
267 // - source types carry a primary date field and shard;
268 // - event record types track business volume and shard;
269 // - dedup-bounded entity types and curation-bounded wiki stay flat.
270 // NOTE: the SPEC's `DB.md ## Schemas` `shard: by-date` override has no
271 // representation in the frozen `Schema`/`FieldSpec` types (no shard
272 // flag), so it cannot be consulted here yet — see the store findings.
273 matches!(
274 type_,
275 // source types
276 "email" | "transcript" | "pdf-source"
277 // event record types (canonical)
278 | "expense" | "invoice" | "meeting"
279 // event record types (recognized custom, per the plan)
280 | "order" | "ticket" | "transaction"
281 )
282 }
283
284 /// Compute the canonical write path for a new file. For a sharding type
285 /// (per [`Store::type_shards`]) insert `<YYYY>/<MM>/` from the type's
286 /// primary date field (`email.date`, `expense.date`, … fallback `created`)
287 /// under the type folder; flat types and `wiki/` get no shard segment.
288 /// Deterministic + stable: same input → same path, so a record never moves
289 /// once written.
290 pub fn shard_path_for(
291 &self,
292 type_: &str,
293 frontmatter: &Frontmatter,
294 name: &str,
295 ) -> Result<PathBuf, StoreError> {
296 self.shard_path_in(&default_type_folder(type_), type_, frontmatter, name)
297 }
298
299 /// Like [`Store::shard_path_for`], but compute the path under an explicit,
300 /// caller-resolved type-folder rather than the canonical default. This lets a
301 /// write surface honour an agent-supplied conforming sub-folder — e.g.
302 /// `wiki/projects/`, `wiki/people/`, `wiki/synthesis/` (the SPEC files a
303 /// `wiki-page` under `wiki/<topic>/`, i.e. ANY topic sub-folder, not only the
304 /// `wiki/topics` default) — while still applying date-sharding for sharding
305 /// types. The folder must be a conforming `<layer>/<type-folder>` (2
306 /// components, recognized layer); the caller is responsible for that (see the
307 /// CLI's `resolve_write_path`), so it is taken as given here.
308 ///
309 /// Sharding is still a property of the *type*: a sharding type gets the
310 /// `<YYYY>/<MM>` segment under `folder`; a flat type lands directly in it.
311 pub fn shard_path_in(
312 &self,
313 folder: &Path,
314 type_: &str,
315 frontmatter: &Frontmatter,
316 name: &str,
317 ) -> Result<PathBuf, StoreError> {
318 let folder = folder.to_path_buf();
319 let filename = ensure_md_extension(name);
320
321 if !self.type_shards(type_) {
322 // Flat type (entity records, wiki, decisions): no shard segment.
323 return Ok(folder.join(filename));
324 }
325
326 // Sharding type: derive <YYYY>/<MM> from the primary date field, with
327 // `created` as the universal fallback. Reading the public `Frontmatter`
328 // fields directly (typed `created`/`updated` + raw `extra`) avoids the
329 // not-yet-implemented `Frontmatter::get`/`parse` and keeps this pure.
330 let (year, month) = self
331 .primary_shard_segment(type_, frontmatter)
332 .ok_or_else(|| StoreError::NoShardDate {
333 file: folder.join(&filename),
334 })?;
335
336 Ok(folder.join(year).join(month).join(filename))
337 }
338
339 /// Find files with an incoming wiki-link to `target`, via **embedded
340 /// ripgrep** for `[[target]]` across all layers. Loop-fast; no whole-graph
341 /// build. Returns store-relative paths.
342 pub fn find_links_to(&self, target: &Path) -> Result<Vec<PathBuf>, StoreError> {
343 // A single target is just the degenerate batch case — one alternation
344 // arm, one store scan. Routing through `find_links_to_any` keeps the
345 // pattern construction and the scan loop in exactly one place. The
346 // batch API takes `&[PathBuf]`, so the one-element slice is owned (a
347 // single alloc on this single-target convenience path; the batch path
348 // validate.rs rides is untouched).
349 self.find_links_to_any(&[target.to_path_buf()])
350 }
351
352 /// Find every file with an incoming wiki-link to **any** of `targets`, in a
353 /// **single embedded-ripgrep pass** over the store (one `.md` walk, one
354 /// presence-only scan per file). This is the batch incoming-linker finder the
355 /// working-set [`crate::validate::validate_working_set`] sits on: it must find
356 /// the linkers for the *whole* changed set without paying a full store read
357 /// per changed object. Cost is therefore one store scan (O(store)), NOT
358 /// `targets.len() × store` — calling [`find_links_to`](Self::find_links_to)
359 /// in a loop would reread every `.md` once per target and is the exact
360 /// `O(changed × store)` blow-up this method exists to prevent. Returns
361 /// store-relative paths (deduped, sorted).
362 ///
363 /// Why content scan and not the sidecar `links` field: the sidecar projects
364 /// only the frontmatter `links:` array, so it misses edges written in the
365 /// body or in typed fields (`company: [[…]]`). Finding an incoming link to an
366 /// arbitrary path therefore requires reading file content — the same reason
367 /// the single-target finder uses ripgrep.
368 pub fn find_links_to_any(&self, targets: &[PathBuf]) -> Result<Vec<PathBuf>, StoreError> {
369 // The wiki-link doctrine: a link is the full store-relative path, no
370 // `.md` extension. A reference to a target therefore appears literally
371 // as `[[<target>]]`, optionally with a `|display` suffix and (warned
372 // but accepted) a trailing `.md`. Build ONE regex that matches all
373 // accepted spellings of an incoming link to ANY target, escaping each
374 // target so path separators / dots stay literal and the alternation
375 // arms keep their boundaries (a link to `sarah` never matches
376 // `sarah-chen`).
377 let mut arms: Vec<String> = Vec::new();
378 for target in targets {
379 let target_str = path_to_link_str(target);
380 if target_str.is_empty() {
381 continue;
382 }
383 // [[ <target> (.md)? ( | display )? ]]
384 arms.push(format!(
385 r"\[\[{}(\.md)?(\|[^\]]*)?\]\]",
386 regex::escape(&target_str)
387 ));
388 }
389 // No usable targets → no possible incoming links, and an empty pattern
390 // would compile to a match-everything regex. Short-circuit instead.
391 if arms.is_empty() {
392 return Ok(Vec::new());
393 }
394 let pattern = arms.join("|");
395
396 let matcher = RegexMatcher::new(&pattern).map_err(|e| StoreError::Search {
397 root: self.root.clone(),
398 message: format!("invalid backlink pattern: {e}"),
399 })?;
400
401 let mut hits = std::collections::BTreeSet::new();
402 // Scan every `.md` file in the store (skip hidden + `log/`), including
403 // `index.md` catalogs — an incoming reference is wherever the literal
404 // link text lives; the caller decides relevance. ONE walk for the whole
405 // target set; per file we stop at the first hit (presence is all we
406 // need), so a file that links to several targets is read once, not once
407 // per target.
408 for rel in self.walk_all_md()? {
409 let abs = self.abs_path(&rel);
410 let mut matched_here = false;
411 let mut searcher = Searcher::new();
412 let res = searcher.search_path(
413 &matcher,
414 &abs,
415 UTF8(|_lnum, _line| {
416 matched_here = true;
417 // Stop at the first hit: presence is all we need.
418 Ok(false)
419 }),
420 );
421 if let Err(e) = res {
422 return Err(StoreError::Search {
423 root: self.root.clone(),
424 message: format!("search failed in {}: {e}", abs.display()),
425 });
426 }
427 if matched_here {
428 hits.insert(rel);
429 }
430 }
431 Ok(hits.into_iter().collect())
432 }
433
434 /// Candidate set for a `type` query: read the relevant type-folder
435 /// `index.jsonl` sidecar(s) and return their records. Complete and
436 /// cold-cache-proof — NOT a walk-and-parse or a frontmatter ripgrep scan,
437 /// and **never a store-wide read**. The common path is one sequential read
438 /// of the canonical type-folder sidecar (O(entities)); when that sidecar is
439 /// absent the read is bounded to the type's single layer subtree
440 /// (O(entities-in-layer)), so a `--type proposal` query before that folder
441 /// has been indexed still stays inside the interactive loop's O(entities)
442 /// contract instead of fanning out across every sidecar in the store.
443 pub fn find_by_type(&self, type_: &str) -> Result<Vec<IndexRecord>, StoreError> {
444 // Read the type's canonical-folder sidecar when it exists (the common,
445 // O(entities) path). Otherwise fall back to the sidecars of the *one
446 // layer* the type belongs to and filter by `type` — complete for records
447 // filed under a non-canonical folder name within that layer (e.g. a
448 // custom `proposal` filed in `records/proposals/` when the canonical
449 // guess is the bare `records/proposal/`), without the whole-store
450 // sidecar fan-out that would break the interactive loop's O(entities)
451 // contract. A type lives in exactly one layer, and `default_type_folder`
452 // always encodes it (recognized → its SPEC layer; unrecognized →
453 // `records/`), so the fallback walk is bounded to that layer's subtree —
454 // O(entities-in-layer), never O(store). Either way: sequential, complete
455 // sidecar reads, never a walk-and-parse of the tree.
456 let canonical_folder = default_type_folder(type_);
457 let canonical = self.root.join(&canonical_folder).join(TYPE_INDEX_FILE);
458 let records = if canonical.is_file() {
459 self.read_type_index(&canonical)?
460 } else {
461 self.read_all_type_indexes_in(layer_of_folder(&canonical_folder))?
462 };
463 Ok(records.into_iter().filter(|r| r.type_ == type_).collect())
464 }
465
466 /// Candidate set for a `key=value` frontmatter query, **store-wide**: read
467 /// every type-folder `index.jsonl` sidecar and filter their records. The
468 /// unscoped pre-write dedup primitive; prefer [`Store::find_by_where_in`]
469 /// with a layer scope to stay O(entities-in-layer) on the interactive loop.
470 pub fn find_by_where(&self, key: &str, value: &str) -> Result<Vec<IndexRecord>, StoreError> {
471 self.find_by_where_in(key, value, None)
472 }
473
474 /// Candidate set for a `key=value` frontmatter query, **scoped to one
475 /// layer** when `layer` is `Some`: the sidecar walk is confined to that
476 /// layer's subtree (`<root>/<layer>/`), so the I/O is O(entities-in-layer),
477 /// not O(store records). `None` keeps the store-wide read.
478 ///
479 /// This is what makes `--in <layer>` an I/O scope, not just a result
480 /// filter: a `--where`-only query (no `--type`) used to read every sidecar
481 /// in the store and narrow by layer in memory, breaking the O(entities)
482 /// contract the interactive loop depends on. With a layer in hand we walk
483 /// only that layer's sidecars.
484 pub fn find_by_where_in(
485 &self,
486 key: &str,
487 value: &str,
488 layer: Option<Layer>,
489 ) -> Result<Vec<IndexRecord>, StoreError> {
490 // A `key=value` query can target any frontmatter field across any type,
491 // so within the chosen subtree we still read every type-folder sidecar
492 // and filter. The layer (when given) bounds *which* subtree, turning a
493 // whole-store walk into a single-layer walk.
494 let records = self.read_all_type_indexes_in(layer)?;
495 Ok(records
496 .into_iter()
497 .filter(|r| record_matches_field(r, key, value))
498 .collect())
499 }
500
501 /// Every record across the type-folder `index.jsonl` sidecars, scoped to one
502 /// layer when `layer` is `Some` (the walk is confined to `<root>/<layer>/`)
503 /// else store-wide. Sequential, complete sidecar reads — never a
504 /// walk-and-parse of the content tree.
505 ///
506 /// This is the unfiltered sidecar-enumeration primitive the relationship
507 /// loop sits on: [`crate::graph::backlinks_filtered`] uses it to bound its
508 /// candidate set to the relevant layer (or the whole store) without opening
509 /// the content tree, then confirms each candidate's edge by parsing the file.
510 pub fn sidecar_records(&self, layer: Option<Layer>) -> Result<Vec<IndexRecord>, StoreError> {
511 self.read_all_type_indexes_in(layer)
512 }
513
514 /// Parse a type-folder's `index.jsonl` into [`IndexRecord`]s, applying
515 /// last-write-wins by `path` over any un-compacted lines. The sidecar-read
516 /// primitive every structured query sits on.
517 pub fn read_type_index(&self, index_jsonl: &Path) -> Result<Vec<IndexRecord>, StoreError> {
518 let text = std::fs::read_to_string(index_jsonl).map_err(|e| StoreError::BadTypeIndex {
519 path: index_jsonl.to_path_buf(),
520 message: e.to_string(),
521 })?;
522
523 // Last-write-wins by `path` over un-compacted lines: a later line for
524 // the same path supersedes an earlier one (the jsonl is append-mostly
525 // and only compacted on rebuild). Blank lines are skipped; a non-blank
526 // line that is not a valid IndexRecord is a hard parse error.
527 let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
528 for (i, line) in text.lines().enumerate() {
529 let trimmed = line.trim();
530 if trimmed.is_empty() {
531 continue;
532 }
533 let record: IndexRecord =
534 serde_json::from_str(trimmed).map_err(|e| StoreError::BadTypeIndex {
535 path: index_jsonl.to_path_buf(),
536 message: format!("line {}: {e}", i + 1),
537 })?;
538 by_path.insert(record.path.clone(), record);
539 }
540 // BTreeMap keyed by path → records emerge sorted by path ascending,
541 // a deterministic order independent of line order in the file.
542 Ok(by_path.into_values().collect())
543 }
544
545 /// Resolve a store-relative path to its absolute on-disk path under
546 /// [`root`](Store::root).
547 pub fn abs_path(&self, store_relative: &Path) -> PathBuf {
548 // `Path::join` returns `store_relative` unchanged if it is already
549 // absolute, so passing an absolute path through is a no-op.
550 self.root.join(store_relative)
551 }
552
553 /// Convert an absolute path under the store into its store-relative form.
554 pub fn rel_path(&self, abs: &Path) -> Option<PathBuf> {
555 abs.strip_prefix(&self.root).ok().map(|p| p.to_path_buf())
556 }
557
558 // ── Private helpers ─────────────────────────────────────────────────────
559
560 /// Resolve a caller-supplied folder path (store-relative or absolute) to an
561 /// absolute path under the store root.
562 fn resolve_under_root(&self, folder: &Path) -> PathBuf {
563 if folder.is_absolute() {
564 folder.to_path_buf()
565 } else {
566 self.root.join(folder)
567 }
568 }
569
570 /// Walk a subtree for content `.md` files (skip hidden dirs, skip `index.md`
571 /// / `DB.md` / `log.md`), returning store-relative paths. Used by the layer
572 /// and type-folder walks.
573 fn walk_content_md(&self, root: &Path) -> Result<Vec<PathBuf>, StoreError> {
574 let mut out = Vec::new();
575 for entry in self.md_walker(root).build() {
576 let entry = entry.map_err(|e| StoreError::Search {
577 root: root.to_path_buf(),
578 message: e.to_string(),
579 })?;
580 if !is_file_entry(&entry) {
581 continue;
582 }
583 let path = entry.path();
584 if !has_md_extension(path) {
585 continue;
586 }
587 if is_non_content_basename(path) {
588 continue;
589 }
590 if let Some(rel) = self.rel_path(path) {
591 out.push(rel);
592 }
593 }
594 out.sort();
595 Ok(out)
596 }
597
598 /// Walk the whole store for **every** `.md` file (including `index.md`),
599 /// skipping hidden dirs and the `log/` archive tree. Used by the backlink
600 /// scan, where the literal link text can live in any markdown file.
601 fn walk_all_md(&self) -> Result<Vec<PathBuf>, StoreError> {
602 let mut out = Vec::new();
603 for entry in self.md_walker(&self.root).build() {
604 let entry = entry.map_err(|e| StoreError::Search {
605 root: self.root.clone(),
606 message: e.to_string(),
607 })?;
608 if !is_file_entry(&entry) {
609 continue;
610 }
611 let path = entry.path();
612 if !has_md_extension(path) {
613 continue;
614 }
615 if self.is_in_log_dir(path) {
616 continue;
617 }
618 if let Some(rel) = self.rel_path(path) {
619 out.push(rel);
620 }
621 }
622 out.sort();
623 Ok(out)
624 }
625
626 /// Read and merge every type-folder `index.jsonl` sidecar under `layer`
627 /// when given, else the whole store (skip hidden + `log/`). Each sidecar is
628 /// read with last-write-wins by path; across sidecars, paths are disjoint by
629 /// construction (one sidecar per folder), so a plain concatenation preserves
630 /// completeness. A layer scope confines the walk to `<root>/<layer>/`, which
631 /// is what keeps `find_by_where_in` O(entities-in-layer).
632 fn read_all_type_indexes_in(
633 &self,
634 layer: Option<Layer>,
635 ) -> Result<Vec<IndexRecord>, StoreError> {
636 let mut out = Vec::new();
637 for sidecar in self.find_type_index_files_in(layer)? {
638 out.extend(self.read_type_index(&self.abs_path(&sidecar))?);
639 }
640 Ok(out)
641 }
642
643 /// Locate every `index.jsonl` sidecar under `layer` (when given) else the
644 /// whole store (skip hidden + `log/`), returning store-relative paths. The
645 /// walk root is `<root>/<layer>/` for a scoped read and `self.root` for the
646 /// store-wide read; a non-existent layer subtree yields no sidecars rather
647 /// than walking a missing path.
648 fn find_type_index_files_in(&self, layer: Option<Layer>) -> Result<Vec<PathBuf>, StoreError> {
649 let walk_root = match layer {
650 Some(l) => self.root.join(l.dir_name()),
651 None => self.root.clone(),
652 };
653 // A scoped walk over a layer folder that does not exist yet must be an
654 // empty result, mirroring `walk_layer`'s missing-dir guard — not a walk
655 // error from `ignore` over a nonexistent path.
656 if !walk_root.is_dir() {
657 return Ok(Vec::new());
658 }
659 let mut out = Vec::new();
660 let mut builder = WalkBuilder::new(&walk_root);
661 builder.standard_filters(false).hidden(true);
662 for entry in builder.build() {
663 let entry = entry.map_err(|e| StoreError::Search {
664 root: walk_root.clone(),
665 message: e.to_string(),
666 })?;
667 if !is_file_entry(&entry) {
668 continue;
669 }
670 let path = entry.path();
671 if path.file_name().and_then(|n| n.to_str()) != Some(TYPE_INDEX_FILE) {
672 continue;
673 }
674 if self.is_in_log_dir(path) {
675 continue;
676 }
677 if let Some(rel) = self.rel_path(path) {
678 out.push(rel);
679 }
680 }
681 out.sort();
682 Ok(out)
683 }
684
685 /// A `WalkBuilder` configured for db.md SWEEPs: gitignore/global-ignore are
686 /// OFF (a SWEEP must see every file even if the store is a git repo with a
687 /// `.gitignore`), but hidden files/dirs are skipped.
688 fn md_walker(&self, root: &Path) -> WalkBuilder {
689 let mut builder = WalkBuilder::new(root);
690 builder.standard_filters(false).hidden(true);
691 builder
692 }
693
694 /// True if an absolute path lives under the store's root-level `log/`
695 /// rotation-archive directory.
696 fn is_in_log_dir(&self, abs: &Path) -> bool {
697 match self.rel_path(abs) {
698 Some(rel) => rel.components().next().map(|c| c.as_os_str()) == Some("log".as_ref()),
699 None => false,
700 }
701 }
702
703 /// Read a file's frontmatter `updated` field as an RFC3339 timestamp,
704 /// returning `None` when absent/unparseable. A self-contained reader (does
705 /// not depend on the not-yet-implemented `parser::read_file`); parses the
706 /// leading `---`-fenced YAML block with the same engine the parser uses.
707 fn read_updated(&self, abs: &Path) -> Option<DateTime<FixedOffset>> {
708 let text = std::fs::read_to_string(abs).ok()?;
709 let yaml = frontmatter_block(&text)?;
710 let value: serde_yml::Value = serde_yml::from_str(yaml).ok()?;
711 let raw = value.get("updated")?;
712 value_to_datetime(raw)
713 }
714
715 /// The `<YYYY>/<MM>` shard segment for a sharding type, from its primary
716 /// date field with a `created` fallback. Reads the public `Frontmatter`
717 /// fields directly. `None` when no usable date is present.
718 fn primary_shard_segment(&self, type_: &str, fm: &Frontmatter) -> Option<(String, String)> {
719 // Try the type's primary date field first.
720 if let Some(field) = primary_date_field(type_) {
721 if let Some(v) = fm.extra.get(field) {
722 if let Some(seg) = value_to_year_month(v) {
723 return Some(seg);
724 }
725 }
726 }
727 // Universal fallback: the typed `created` timestamp.
728 fm.created
729 .map(|dt| (format!("{:04}", dt.year()), format!("{:02}", dt.month())))
730 }
731}
732
733// ── Free helpers (no `self`) ────────────────────────────────────────────────
734
735/// True if a walk entry is a regular file (not a dir / symlink-to-dir).
736fn is_file_entry(entry: &ignore::DirEntry) -> bool {
737 entry.file_type().map(|ft| ft.is_file()).unwrap_or(false)
738}
739
/// True if the path's extension is exactly `md`. The comparison is
/// case-sensitive (db.md files are lowercase `.md`), so `NOTE.MD` does not
/// qualify.
fn has_md_extension(path: &Path) -> bool {
    path.extension().map_or(false, |ext| ext == "md")
}
745
746/// True if the basename is a non-content meta file (`DB.md`, `index.md`,
747/// `log.md`) that the content walks must skip.
748fn is_non_content_basename(path: &Path) -> bool {
749 match path.file_name().and_then(|n| n.to_str()) {
750 Some(name) => NON_CONTENT_BASENAMES.contains(&name),
751 None => false,
752 }
753}
754
/// Return `name` with a `.md` suffix guaranteed: a name already ending in
/// `.md` (case-sensitive) is returned unchanged, anything else gets `.md`
/// appended.
fn ensure_md_extension(name: &str) -> String {
    let mut file_name = String::from(name);
    if !file_name.ends_with(".md") {
        file_name.push_str(".md");
    }
    file_name
}
763
/// Render a store-relative path as a wiki-link target string: normal path
/// components joined with `/` (never `\`), with one trailing `.md` removed.
///
/// Only `Normal` components that are valid UTF-8 contribute; root, prefix,
/// `.` and `..` components are dropped, which is what removes a leading
/// `./`.
fn path_to_link_str(target: &Path) -> String {
    let segments: Vec<&str> = target
        .components()
        .filter_map(|component| match component {
            std::path::Component::Normal(os) => os.to_str(),
            _ => None,
        })
        .collect();

    let mut link = segments.join("/");
    if link.ends_with(".md") {
        link.truncate(link.len() - ".md".len());
    }
    link
}
781
/// Map a `type` to its canonical default folder, per the SPEC type table
/// (`email → sources/emails`, `expense → records/expenses`, …).
///
/// A type that is not in the table falls back to `records/<type>`, using the
/// bare type name with no pluralization guess — see the store findings on the
/// docstring's looser `<type>` phrasing.
fn default_type_folder(type_: &str) -> PathBuf {
    const CANONICAL: [(&str, &str); 10] = [
        // sources
        ("email", "sources/emails"),
        ("transcript", "sources/transcripts"),
        ("pdf-source", "sources/docs"),
        // records — entities
        ("contact", "records/contacts"),
        ("company", "records/companies"),
        // records — events
        ("expense", "records/expenses"),
        ("meeting", "records/meetings"),
        ("decision", "records/decisions"),
        ("invoice", "records/invoices"),
        // wiki — the SPEC type table files a wiki-page under `wiki/<topic>/`,
        // i.e. ALWAYS a sub-folder, never flat under `wiki/`. A 2-component
        // `wiki/<file>` path is non-conforming: `index::type_folder_of` /
        // `validate::type_folder_of` require `<layer>/<type-folder>/<file>` (3
        // components), so a flat wiki page either crashes write-through
        // (`on_write` tries to create `index.md` *inside* a file) or is silently
        // dropped from every catalog by `rebuild_all`. `topic` is the page's
        // canonical bucket; with only the bare type in hand here, `wiki/topics`
        // is the deterministic default folder (matches the dogfood store).
        ("wiki-page", "wiki/topics"),
    ];

    match CANONICAL.iter().find(|entry| entry.0 == type_) {
        Some(entry) => PathBuf::from(entry.1),
        // unrecognized: bare type name under records/
        None => Path::new("records").join(type_),
    }
}
815
816/// The canonical [`Layer`] a `type_` belongs to, derived from its default
817/// type-folder (`email` → `Sources`, `contact` → `Records`, `wiki-page` →
818/// `Wiki`, unrecognized → `Records`). The write path uses this to decide whether
819/// an agent-supplied folder is in the *right* layer for the type before honouring
820/// its sub-folder choice.
821pub fn layer_for_type(type_: &str) -> Layer {
822 layer_of_folder(&default_type_folder(type_)).unwrap_or(Layer::Records)
823}
824
825/// The [`Layer`] a type-folder path lives in, read from its first component
826/// (`sources/` → `Sources`, `records/` → `Records`, `wiki/` → `Wiki`). Used to
827/// bound [`Store::find_by_type`]'s canonical-folder-absent fallback to a single
828/// layer subtree. Returns `None` for a path with no recognized layer prefix;
829/// every value [`default_type_folder`] produces has one, so in practice this is
830/// always `Some` on the call path — `None` degrades to a store-wide read.
831fn layer_of_folder(folder: &Path) -> Option<Layer> {
832 let first = folder.components().next()?.as_os_str().to_str()?;
833 Layer::from_dir_name(first)
834}
835
/// Infer a content file's canonical `type` from its store-relative path — the
/// inverse of [`default_type_folder`] and the single source of truth for
/// path→type inference (the CLI's `fm init` calls this, never re-derives it).
///
/// Requires the canonical `<layer>/<type-folder>/<file>` 3-component shape; a
/// shorter path (a file directly under a layer) or an unknown leading layer
/// yields `None`. A leading `./` is tolerated and ignored, so
/// `./records/contacts/a.md` infers the same type as `records/contacts/a.md`.
/// An absolute path is not store-relative and yields `None`.
///
/// The layer and type-folder components must be valid UTF-8. A non-UTF-8
/// component in either position yields `None` rather than being skipped,
/// because skipping it would shift a later component into its place and
/// infer a type from the wrong folder.
///
/// Recognized `(layer, folder)` pairs map back to their canonical type. For an
/// unrecognized folder the fallback is the **bare folder name verbatim** (no
/// pluralization/singularization) so it round-trips with `default_type_folder`,
/// whose unrecognized fallback is the bare type name (`task` ⇄ `records/task`).
/// Singularizing here would break that round-trip (`records/tasks` → `task`
/// while `default_type_folder("task")` → `records/task`). `wiki/<topic>` always
/// infers `wiki-page`, since every wiki page is filed under a topic folder.
pub fn infer_type_from_path(rel: &Path) -> Option<String> {
    // `Path::components` normalizes interior `.` away but keeps a leading one;
    // drop it so `./records/...` is read like `records/...`. Each remaining
    // component is kept in position even when it is not UTF-8 (as `None`).
    let mut comps = rel
        .components()
        .skip_while(|c| matches!(c, std::path::Component::CurDir))
        .map(|c| c.as_os_str().to_str());

    let layer = comps.next()??;
    if !matches!(layer, "sources" | "records" | "wiki") {
        return None;
    }
    let folder = comps.next()??;
    // The file itself must be a third component (a real type-folder, not the
    // file sitting directly under the layer).
    comps.next()?;

    let mapped = match (layer, folder) {
        ("sources", "emails") => "email",
        ("sources", "transcripts") => "transcript",
        ("sources", "docs") => "pdf-source",
        ("records", "contacts") => "contact",
        ("records", "companies") => "company",
        ("records", "expenses") => "expense",
        ("records", "meetings") => "meeting",
        ("records", "decisions") => "decision",
        ("records", "invoices") => "invoice",
        // Every wiki page is filed under `wiki/<topic>/`; the type is always
        // `wiki-page` regardless of the topic-folder name.
        ("wiki", _) => "wiki-page",
        // Unrecognized folder: the bare name, verbatim. This is the inverse of
        // `default_type_folder`'s unrecognized fallback (`other → records/other`)
        // and the round-trip would break if we pluralized/singularized here.
        (_, other) => other,
    };
    Some(mapped.to_string())
}
882
/// Name of the frontmatter field whose value drives the `<YYYY>/<MM>` shard
/// for a sharding type. `None` means "use the `created` fallback only".
fn primary_date_field(type_: &str) -> Option<&'static str> {
    let field = match type_ {
        "email" | "expense" | "invoice" | "meeting" => "date",
        "transcript" => "recorded_at",
        "pdf-source" => "received_at",
        // recognized custom event types have no canonical date field name;
        // they fall back to `created`.
        _ => return None,
    };
    Some(field)
}
896
897/// Parse a YAML value into an RFC3339 [`DateTime`], accepting both an explicit
898/// string and a YAML-native scalar rendered to string.
899fn value_to_datetime(value: &serde_yml::Value) -> Option<DateTime<FixedOffset>> {
900 let s = yaml_scalar_string(value)?;
901 DateTime::parse_from_rfc3339(s.trim()).ok()
902}
903
904/// Extract `(YYYY, MM)` from a YAML date/timestamp value. Lenient: matches a
905/// leading `YYYY-MM` so a bare `2026-05-22` date and a full
906/// `2026-05-22T10:00:00-07:00` timestamp both work.
907fn value_to_year_month(value: &serde_yml::Value) -> Option<(String, String)> {
908 let s = yaml_scalar_string(value)?;
909 year_month_from_str(s.trim())
910}
911
/// Split the leading `YYYY-MM` of a date string into `(YYYY, MM)`.
///
/// The string must start with four ASCII digits, a `-`, and two ASCII digits
/// forming a month in `01..=12`; anything after those seven bytes is ignored.
/// Returns `None` otherwise.
fn year_month_from_str(s: &str) -> Option<(String, String)> {
    // Parsed by hand, not with a regex, to keep a regex compile off the write
    // path. Fewer than seven bytes can never hold `YYYY-MM`.
    let head = s.as_bytes().get(..7)?;
    let (year, rest) = head.split_at(4);
    if rest[0] != b'-' {
        return None;
    }
    let month = &rest[1..];
    if !year.iter().chain(month).all(u8::is_ascii_digit) {
        return None;
    }
    // Both month bytes are known digits here, so the subtraction cannot wrap.
    let month_number = (month[0] - b'0') * 10 + (month[1] - b'0');
    if month_number == 0 || month_number > 12 {
        return None;
    }
    // The first seven bytes are all ASCII, so these ranges fall on char
    // boundaries.
    Some((s[..4].to_string(), s[5..7].to_string()))
}
937
938/// Render a YAML scalar as a string: a real `String` verbatim, otherwise the
939/// value's compact YAML serialization (covers timestamps that the YAML engine
940/// may surface as a non-string scalar).
941fn yaml_scalar_string(value: &serde_yml::Value) -> Option<String> {
942 if let Some(s) = value.as_str() {
943 return Some(s.to_string());
944 }
945 match value {
946 serde_yml::Value::Null => None,
947 serde_yml::Value::Mapping(_) | serde_yml::Value::Sequence(_) => None,
948 other => serde_yml::to_string(other)
949 .ok()
950 .map(|s| s.trim().to_string()),
951 }
952}
953
954/// The YAML frontmatter block of a file: the text between a leading `---` fence
955/// and the next `---` fence, exclusive. `None` if the file does not open with a
956/// `---` fence on its first line.
957fn frontmatter_block(text: &str) -> Option<&str> {
958 // Tolerate a UTF-8 BOM and CRLF, but the fence must be the very first line.
959 let body = text.strip_prefix('\u{feff}').unwrap_or(text);
960 let mut rest = body;
961 // First line must be exactly `---` (allowing trailing CR).
962 let (first, after_first) = split_first_line(rest);
963 if first.trim_end_matches('\r') != "---" {
964 return None;
965 }
966 rest = after_first;
967 let block_start = rest;
968 let mut scanned = 0usize;
969 loop {
970 let (line, after) = split_first_line(rest);
971 if line.trim_end_matches('\r') == "---" {
972 return Some(&block_start[..scanned]);
973 }
974 if after.is_empty() && line.is_empty() {
975 // Reached end of input without a closing fence.
976 return None;
977 }
978 scanned += line.len() + 1; // +1 for the consumed '\n'
979 if after.is_empty() {
980 return None;
981 }
982 rest = after;
983 }
984}
985
/// Cut `s` at its first `\n`, returning `(line, remainder)` with the newline
/// itself dropped from both halves. When `s` contains no newline the whole
/// string is the line and the remainder is empty.
fn split_first_line(s: &str) -> (&str, &str) {
    s.split_once('\n').unwrap_or((s, ""))
}
995
996/// True if an [`IndexRecord`] has a field `key` equal to `value`, checking the
997/// typed columns first and then the flattened `fields` map.
998fn record_matches_field(record: &IndexRecord, key: &str, value: &str) -> bool {
999 match key {
1000 "type" => record.type_ == value,
1001 "summary" => record.summary == value,
1002 "path" => record.path.to_string_lossy() == value,
1003 "created" => timestamp_matches(record.created, value),
1004 "updated" => timestamp_matches(record.updated, value),
1005 "tags" => record.tags.iter().any(|t| t == value),
1006 "links" => record.links.iter().any(|l| l == value),
1007 other => record
1008 .fields
1009 .get(other)
1010 .map(|v| json_value_matches(v, value))
1011 .unwrap_or(false),
1012 }
1013}
1014
1015/// Compare a record's `created`/`updated` instant against a query `value`.
1016///
1017/// db.md files write timestamps in several equivalent RFC3339 spellings — most
1018/// commonly the `Z` UTC designator (`2026-05-01T00:00:00Z`) but also an explicit
1019/// offset (`...+00:00`, `...-07:00`). A naive `record.created.to_rfc3339() ==
1020/// value` reformats only one side: chrono renders a UTC instant as `+00:00`, so
1021/// the `Z` form an agent reads straight out of the file would never match. We
1022/// instead parse `value` as RFC3339 and compare instants, where `Z` and `+00:00`
1023/// (and any same-instant offset) are equal. A `value` that is not valid RFC3339
1024/// can never equal a real timestamp, so it falls through to `false`.
1025fn timestamp_matches(stored: Option<DateTime<FixedOffset>>, value: &str) -> bool {
1026 match (stored, DateTime::parse_from_rfc3339(value)) {
1027 (Some(stored), Ok(queried)) => stored == queried,
1028 _ => false,
1029 }
1030}
1031
1032/// Compare a JSON field value against a query string. A string matches
1033/// verbatim; scalars match their textual form; an array matches if any element
1034/// matches (so a list-valued frontmatter field is membership-queried).
1035fn json_value_matches(v: &serde_json::Value, value: &str) -> bool {
1036 match v {
1037 serde_json::Value::String(s) => s == value,
1038 serde_json::Value::Bool(b) => b.to_string() == value,
1039 serde_json::Value::Number(n) => n.to_string() == value,
1040 serde_json::Value::Array(items) => items.iter().any(|i| json_value_matches(i, value)),
1041 serde_json::Value::Null => value.is_empty(),
1042 serde_json::Value::Object(_) => false,
1043 }
1044}
1045
1046#[cfg(test)]
1047mod tests {
1048 use super::*;
1049 use std::fs;
1050 use tempfile::{tempdir, TempDir};
1051
1052 // ── Fixtures ────────────────────────────────────────────────────────────
1053
/// Create `<root>/<rel>` holding `contents`, making any missing parent
/// directories first. Hands back `rel` as a `PathBuf` (the store-relative
/// path) for convenient assertions.
fn write(root: &Path, rel: &str, contents: &str) -> PathBuf {
    let target = root.join(rel);
    let parent = target.parent().unwrap();
    fs::create_dir_all(parent).unwrap();
    fs::write(&target, contents).unwrap();
    rel.into()
}
1062
/// Build the text of a minimal `note` content file. The given timestamp is
/// written to both the `created` and the `updated` frontmatter fields.
fn content_md(updated: &str) -> String {
    format!(
        "---\ntype: note\ncreated: {updated}\nupdated: {updated}\nsummary: a note\n---\n\nbody\n",
        updated = updated
    )
}
1069
1070 /// A bare directory with a `DB.md` marker (valid `db-md` frontmatter so the
1071 /// real parser is exercised).
1072 fn empty_store() -> TempDir {
1073 let dir = tempdir().unwrap();
1074 fs::write(
1075 dir.path().join("DB.md"),
1076 "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n",
1077 )
1078 .unwrap();
1079 dir
1080 }
1081
1082 /// Open a store rooted at a TempDir; panics if `open` rejects it.
1083 fn open(dir: &TempDir) -> Store {
1084 Store::open(dir.path()).expect("fixture should be a valid store")
1085 }
1086
/// Render paths as strings with every `\` replaced by `/`, so expected
/// values in assertions can be written with forward slashes.
fn rels(paths: &[PathBuf]) -> Vec<String> {
    let mut rendered = Vec::with_capacity(paths.len());
    for path in paths {
        rendered.push(path.to_string_lossy().replace('\\', "/"));
    }
    rendered
}
1093
1094 // ── Layer ───────────────────────────────────────────────────────────────
1095
#[test]
fn layer_dir_name_and_parse_are_inverse() {
    // Round trip: parsing a layer's own directory name yields that layer.
    for layer in Layer::all() {
        assert_eq!(Layer::from_dir_name(layer.dir_name()), Some(layer));
    }

    // The directory names themselves are pinned.
    for (layer, name) in [
        (Layer::Sources, "sources"),
        (Layer::Records, "records"),
        (Layer::Wiki, "wiki"),
    ] {
        assert_eq!(layer.dir_name(), name);
    }

    // Non-layer names are rejected; `Sources` shows parsing is case-sensitive.
    for not_a_layer in ["log", "Sources"] {
        assert_eq!(Layer::from_dir_name(not_a_layer), None);
    }
}
1107
#[test]
fn layer_order_is_canonical() {
    // stats keys a BTreeMap on Layer, so `Ord` must give
    // sources < records < wiki.
    let mut layers = vec![Layer::Wiki, Layer::Sources, Layer::Records];
    layers.sort();
    assert_eq!(layers, vec![Layer::Sources, Layer::Records, Layer::Wiki]);
}
1115
1116 // ── is_db_md_store / open ────────────────────────────────────────────────
1117
#[test]
fn is_store_true_only_with_uppercase_marker() {
    let dir = tempdir().unwrap();
    let root = dir.path();

    // Before the marker exists the directory is not a store.
    let before = Store::is_db_md_store(root);
    assert!(!before, "no marker → not a store");

    // Dropping in an uppercase `DB.md` flips the answer.
    fs::write(root.join("DB.md"), "---\ntype: db-md\n---\n").unwrap();
    let after = Store::is_db_md_store(root);
    assert!(after, "uppercase DB.md → store");
}
1129
#[test]
fn is_store_false_for_lowercase_db_md() {
    // Case-sensitivity contract: lowercase `db.md` is the spec's name, not a
    // store marker. This must hold even on a case-insensitive filesystem
    // (macOS), where `Path::exists` would lie.
    let dir = tempdir().unwrap();
    let root = dir.path();
    fs::write(root.join("db.md"), "---\ntype: db-md\n---\n").unwrap();

    let detected = Store::is_db_md_store(root);
    assert!(
        !detected,
        "lowercase db.md must NOT be treated as a store marker"
    );
    let opened = Store::open(root);
    assert!(opened.is_err());
}
1143
#[test]
fn is_store_false_when_db_md_is_a_directory() {
    // The marker must be a file; a directory with the same name is not one.
    let dir = tempdir().unwrap();
    let root = dir.path();
    fs::create_dir(root.join("DB.md")).unwrap();

    let detected = Store::is_db_md_store(root);
    assert!(!detected, "a directory named DB.md is not the file marker");
}
1153
#[test]
fn open_rejects_non_store_with_path() {
    // An empty temp dir has no `DB.md`: `open` must fail, and the error must
    // carry the path it was asked to open.
    let dir = tempdir().unwrap();
    let root = dir.path();
    let rejected = Store::open(root).unwrap_err();
    assert_eq!(rejected.path, root);
}
1160
#[test]
fn open_succeeds_and_parses_config() {
    let dir = tempdir().unwrap();
    let root = dir.path();
    // The `## Policies` section declares a frozen page. Seeing it on the
    // opened store proves `open()` really parsed the config rather than
    // substituting a default.
    let marker = "---\ntype: db-md\nscope: company\nowner: Test\n---\n\n# Store\n\n\
                  ## Policies\n\n### Frozen pages\n- records/decisions/q1.md\n";
    fs::write(root.join("DB.md"), marker).unwrap();

    let store = Store::open(root).unwrap();
    assert_eq!(store.root, root);

    let frozen = Path::new("records/decisions/q1.md");
    let listed = store.config.frozen_pages.iter().any(|page| page == frozen);
    assert!(
        listed,
        "open() must surface DB.md ## Policies, got {:?}",
        store.config.frozen_pages
    );
}
1184
1185 // ── walk / walk_layer / walk_type_folder ─────────────────────────────────
1186
#[test]
fn walk_collects_content_across_layers_skipping_meta_and_log() {
    let dir = empty_store();
    let root = dir.path();

    // Real content, one file per layer — exactly what walk() must return.
    for (rel, stamp) in [
        ("sources/emails/2026/05/a.md", "2026-05-01T00:00:00Z"),
        ("records/contacts/sarah.md", "2026-05-02T00:00:00Z"),
        ("wiki/people/sarah.md", "2026-05-03T00:00:00Z"),
    ] {
        write(root, rel, &content_md(stamp));
    }

    // Things walk() must SKIP:
    for (rel, body) in [
        ("sources/emails/index.md", "---\ntype: index\n---\n"), // catalog
        ("index.md", "---\ntype: index\n---\n"),                // root catalog
        ("log.md", "---\ntype: log\n---\n"),                    // log
        ("log/2026-04.md", "---\ntype: log\n---\n"),            // rotated log archive
        ("records/contacts/notes.txt", "not markdown"),         // non-md
    ] {
        write(root, rel, body);
    }
    // ...as well as real content sitting inside a hidden directory.
    write(
        root,
        "sources/.hidden/secret.md",
        &content_md("2026-05-09T00:00:00Z"),
    );

    let store = open(&dir);
    let walked = rels(&store.walk().unwrap());
    assert_eq!(
        walked,
        [
            "records/contacts/sarah.md",
            "sources/emails/2026/05/a.md",
            "wiki/people/sarah.md",
        ]
    );
}
1229
#[test]
fn walk_layer_is_scoped() {
    let dir = empty_store();
    let root = dir.path();
    let email = "sources/emails/2026/05/a.md";
    let contact = "records/contacts/sarah.md";
    write(root, email, &content_md("2026-05-01T00:00:00Z"));
    write(root, contact, &content_md("2026-05-02T00:00:00Z"));
    let store = open(&dir);

    // Each populated layer yields its own file and nothing from the other.
    for (layer, only) in [(Layer::Sources, email), (Layer::Records, contact)] {
        assert_eq!(rels(&store.walk_layer(layer).unwrap()), [only]);
    }

    // A layer with no directory is empty, not an error.
    let wiki = store.walk_layer(Layer::Wiki).unwrap();
    assert!(wiki.is_empty());
}
1257
#[test]
fn walk_type_folder_recurses_shards_and_accepts_abs_or_rel() {
    let dir = empty_store();
    let root = dir.path();

    // Two emails in different month shards of the same type folder.
    let expected = [
        "sources/emails/2026/05/a.md",
        "sources/emails/2026/06/b.md",
    ];
    write(root, expected[0], &content_md("2026-05-01T00:00:00Z"));
    write(root, expected[1], &content_md("2026-06-01T00:00:00Z"));
    // The folder's own catalog is skipped.
    write(root, "sources/emails/index.md", "---\ntype: index\n---\n");
    // A different type folder must not leak in.
    write(
        root,
        "sources/docs/2026/05/c.md",
        &content_md("2026-05-04T00:00:00Z"),
    );
    let store = open(&dir);

    // Relative folder arg.
    let via_relative = store.walk_type_folder(Path::new("sources/emails")).unwrap();
    assert_eq!(rels(&via_relative), expected);

    // Absolute folder arg under the store resolves identically.
    let absolute = root.join("sources/emails");
    let via_absolute = store.walk_type_folder(&absolute).unwrap();
    assert_eq!(rels(&via_absolute), expected);
}
1300
1301 // ── recent_in_type_folder ────────────────────────────────────────────────
1302
#[test]
fn recent_orders_by_updated_desc_then_path_and_caps() {
    let dir = empty_store();
    let root = dir.path();
    let folder = Path::new("records/meetings");

    let newest = "records/meetings/2026/05/c.md";
    // `a` and `b` tie on `updated`, so path ascending decides (a before b).
    let tie_first = "records/meetings/2026/05/a.md";
    let tie_second = "records/meetings/2026/05/b.md";
    let oldest = "records/meetings/2026/04/z.md";
    for (rel, stamp) in [
        (newest, "2026-05-03T00:00:00Z"),
        (tie_first, "2026-05-02T00:00:00Z"),
        (tie_second, "2026-05-02T00:00:00Z"),
        (oldest, "2026-04-01T00:00:00Z"),
    ] {
        write(root, rel, &content_md(stamp));
    }
    let store = open(&dir);

    // A generous cap returns everything: newest first, ties by path.
    let all = rels(&store.recent_in_type_folder(folder, 10).unwrap());
    assert_eq!(all, [newest, tie_first, tie_second, oldest]);

    // Cap takes the n most-recent.
    let top2 = rels(&store.recent_in_type_folder(folder, 2).unwrap());
    assert_eq!(top2, [newest, tie_first]);
}
1361
#[test]
fn recent_sorts_undated_files_last() {
    let dir = empty_store();
    let root = dir.path();
    let dated = "records/contacts/dated.md";
    let undated = "records/contacts/undated.md";
    write(root, dated, &content_md("2026-05-01T00:00:00Z"));
    // No `updated` field at all.
    write(root, undated, "---\ntype: contact\nsummary: x\n---\nbody\n");
    let store = open(&dir);

    let recent = store
        .recent_in_type_folder(Path::new("records/contacts"), 10)
        .unwrap();
    assert_eq!(
        rels(&recent),
        [dated, undated],
        "a file with a real `updated` must outrank one with none"
    );
}
1392
1393 // ── type_shards ──────────────────────────────────────────────────────────
1394
#[test]
fn type_shards_classification() {
    let dir = empty_store();
    let store = open(&dir);

    // Types expected to shard into `<YYYY>/<MM>`.
    let sharded = [
        "email",
        "transcript",
        "pdf-source",
        "expense",
        "invoice",
        "meeting",
        "order",
        "ticket",
        "transaction",
    ];
    // Types expected to stay flat in their type folder.
    let flat = [
        "contact",
        "company",
        "decision",
        "wiki-page",
        "index",
        "log",
        "db-md",
        "proposal",
    ];

    for t in sharded {
        assert!(store.type_shards(t), "{t} should shard");
    }
    for t in flat {
        assert!(!store.type_shards(t), "{t} should stay flat");
    }
}
1425
1426 // ── shard_path_for ───────────────────────────────────────────────────────
1427
1428 fn fm_with_extra(key: &str, value: &str) -> Frontmatter {
1429 let mut fm = Frontmatter::default();
1430 fm.extra
1431 .insert(key.to_string(), serde_yml::Value::String(value.to_string()));
1432 fm
1433 }
1434
1435 fn fm_with_created(rfc3339: &str) -> Frontmatter {
1436 Frontmatter {
1437 created: Some(DateTime::parse_from_rfc3339(rfc3339).unwrap()),
1438 ..Default::default()
1439 }
1440 }
1441
#[test]
fn shard_path_uses_primary_date_field_per_type() {
    let dir = empty_store();
    let store = open(&dir);

    // (type, primary date field, field value, file name, expected path)
    let cases = [
        // expense.date → records/expenses/<YYYY>/<MM>/
        (
            "expense",
            "date",
            "2026-05-22",
            "lunch",
            "records/expenses/2026/05/lunch.md",
        ),
        // email.date → sources/emails/<YYYY>/<MM>/
        (
            "email",
            "date",
            "2026-11-02T09:00:00-07:00",
            "e1",
            "sources/emails/2026/11/e1.md",
        ),
        // transcript.recorded_at → sources/transcripts/<YYYY>/<MM>/
        (
            "transcript",
            "recorded_at",
            "2025-01-15T12:00:00Z",
            "t1",
            "sources/transcripts/2025/01/t1.md",
        ),
    ];

    for (type_, field, value, name, expected) in cases {
        let computed = store
            .shard_path_for(type_, &fm_with_extra(field, value), name)
            .unwrap();
        assert_eq!(computed, PathBuf::from(expected));
    }
}
1473
#[test]
fn shard_path_falls_back_to_created() {
    let dir = empty_store();
    let store = open(&dir);

    // A meeting carrying no `date` field, only a `created` timestamp: the
    // shard segment must come from `created`.
    let fm = fm_with_created("2024-07-09T08:30:00-04:00");
    let computed = store.shard_path_for("meeting", &fm, "sync").unwrap();
    assert_eq!(computed, PathBuf::from("records/meetings/2024/07/sync.md"));
}
1488
#[test]
fn shard_path_primary_field_wins_over_created() {
    let dir = empty_store();
    let store = open(&dir);

    // Both dates present, deliberately in different months and years.
    let mut fm = fm_with_created("2020-01-01T00:00:00Z");
    let primary = serde_yml::Value::String(String::from("2026-05-22"));
    fm.extra.insert(String::from("date"), primary);

    // The primary `date` (2026/05), not `created` (2020/01), drives the shard.
    let computed = store.shard_path_for("expense", &fm, "x").unwrap();
    assert_eq!(computed, PathBuf::from("records/expenses/2026/05/x.md"));
}
1500
#[test]
fn shard_path_flat_types_have_no_shard_segment() {
    let dir = empty_store();
    let store = open(&dir);

    // A contact has a `created` date, but contacts stay flat.
    let dated = fm_with_created("2026-05-22T00:00:00Z");
    let contact = store
        .shard_path_for("contact", &dated, "sarah-chen")
        .unwrap();
    assert_eq!(contact, PathBuf::from("records/contacts/sarah-chen.md"));

    // wiki-page is flat (no date shard) but still files under a type-folder:
    // `wiki/topics/<name>.md`, NEVER flat as `wiki/<name>.md`. A 2-component
    // path is invisible to the index/validate type-folder model.
    let undated = Frontmatter::default();
    let page = store
        .shard_path_for("wiki-page", &undated, "renewal-theme")
        .unwrap();
    assert_eq!(page, PathBuf::from("wiki/topics/renewal-theme.md"));
}
1523
/// Regression: a wiki-page written through the toolkit's own path computation
/// must land at a path the index + validate type-folder model accepts.
///
/// `shard_path_for("wiki-page", …)` used to return a 2-component `wiki/<file>`
/// path. `type_folder_of` (in both `index` and `validate`) treats that as "no
/// type-folder", so the page either crashed `Index::on_write` (it tried to
/// create `index.md` inside a file) or was silently dropped from every catalog
/// by `Index::rebuild_all`. The computed path must have 3 components:
/// `<layer>/<type-folder>/<file>`.
#[test]
fn shard_path_wiki_page_is_indexable_three_component_path() {
    let dir = empty_store();
    let store = open(&dir);
    let p = store
        .shard_path_for("wiki-page", &Frontmatter::default(), "renewal-theme")
        .unwrap();

    // The shape `type_folder_of` requires (`comps.len() >= 3`, `comps[0]` a
    // known layer): a layer, a non-empty type-folder segment, then the file.
    let parts: Vec<&str> = p.iter().filter_map(|segment| segment.to_str()).collect();
    assert_eq!(
        parts.len(),
        3,
        "wiki-page path must be <layer>/<type-folder>/<file>, got {p:?}"
    );
    let (layer, folder, file) = (parts[0], parts[1], parts[2]);
    assert_eq!(layer, "wiki", "first component must be the wiki layer");
    assert!(
        !(folder.is_empty() || folder == "renewal-theme.md"),
        "second component must be a real type-folder, not the file: {p:?}"
    );
    assert!(
        file.ends_with(".md"),
        "third component must be the .md file: {p:?}"
    );
}
1558
#[test]
fn shard_path_preserves_and_adds_md_extension() {
    let dir = empty_store();
    let store = open(&dir);
    let fm = Frontmatter::default();

    // A name that already ends in `.md` and a bare name both resolve to the
    // same single-`.md` file.
    let with = store.shard_path_for("contact", &fm, "sarah.md").unwrap();
    let without = store.shard_path_for("contact", &fm, "sarah").unwrap();

    let expected = PathBuf::from("records/contacts/sarah.md");
    assert_eq!(with, expected);
    assert_eq!(without, expected);
}
1572
#[test]
fn shard_path_errors_when_sharding_type_has_no_date() {
    let dir = empty_store();
    let store = open(&dir);

    // expense shards, but no `date` and no `created` → NoShardDate.
    let err = store
        .shard_path_for("expense", &Frontmatter::default(), "mystery")
        .unwrap_err();
    let file = match err {
        StoreError::NoShardDate { file } => file,
        other => panic!("expected NoShardDate, got {other:?}"),
    };
    // The error names the un-sharded path the file would otherwise have had.
    assert_eq!(file, PathBuf::from("records/expenses/mystery.md"));
}
1588
1589 // ── find_links_to ────────────────────────────────────────────────────────
1590
#[test]
fn find_links_to_matches_all_accepted_spellings() {
    let dir = empty_store();
    let root = dir.path();
    let target = "records/contacts/sarah-chen";

    // Files that DO link to the target, one per accepted spelling.
    let linkers: Vec<(&str, String)> = vec![
        // Plain link.
        (
            "wiki/people/sarah.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\nSee [[{target}]].\n"),
        ),
        // Link with display text.
        (
            "records/meetings/2026/05/m.md",
            format!("---\ntype: meeting\nsummary: s\n---\nWith [[{target}|Sarah]].\n"),
        ),
        // Link with .md extension (accepted, warned by validate).
        (
            "wiki/themes/t.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}.md]]\n"),
        ),
        // A catalog/index file also contains the link literally — included.
        (
            "records/contacts/index.md",
            format!("---\ntype: index\n---\n- [[{target}]] — Sarah\n"),
        ),
    ];

    // Files that must NOT be reported.
    let decoys: Vec<(&str, String)> = vec![
        // No link to the target.
        (
            "wiki/people/elena.md",
            "---\ntype: wiki-page\nsummary: s\n---\nNo links here.\n".to_string(),
        ),
        // Short-form link must NOT match the full-path target.
        (
            "wiki/people/bob.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[sarah-chen]]\n".to_string(),
        ),
        // A longer path that merely starts with the target must NOT match
        // (boundary correctness): target `sarah-chen` vs `sarah-chen-jr`.
        (
            "wiki/people/jr.md",
            format!("---\ntype: wiki-page\nsummary: s\n---\n[[{target}-jr]]\n"),
        ),
    ];

    for (rel, body) in linkers.iter().chain(decoys.iter()) {
        write(root, rel, body);
    }

    let store = open(&dir);
    let found = rels(&store.find_links_to(Path::new(target)).unwrap());
    assert_eq!(
        found,
        [
            "records/contacts/index.md",
            "records/meetings/2026/05/m.md",
            "wiki/people/sarah.md",
            "wiki/themes/t.md",
        ]
    );
}
1653
#[test]
fn find_links_to_distinguishes_sibling_paths() {
    // Two contacts whose paths share a prefix; a link to one must not be
    // reported as a link to the other.
    let dir = empty_store();
    let root = dir.path();

    // (linking file, its contents, the target it links to)
    let cases = [
        (
            "wiki/a.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah]]\n",
            "records/contacts/sarah",
        ),
        (
            "wiki/b.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
            "records/contacts/sarah-chen",
        ),
    ];
    for (linker, body, _) in cases {
        write(root, linker, body);
    }
    let store = open(&dir);

    // Each target is linked from exactly its own file.
    for (linker, _, target) in cases {
        let found = store.find_links_to(Path::new(target)).unwrap();
        assert_eq!(rels(&found), [linker]);
    }
}
1689
1690 // ── find_links_to_any (batch — the O(changed × store) fix) ─────────────────
1691
/// Pins the batch contract behind the working-set validate's single-pass
/// incoming-linker discovery: `find_links_to_any` over the WHOLE changed set
/// must return the union of linkers across every target while keeping each
/// target's boundary exact (no alternation arm may bleed into a
/// prefix-sharing sibling). A revert to a per-object loop would still satisfy
/// the union, so it is the boundary and union-equivalence assertions that
/// guard the *correctness* of folding N scans into one regex.
#[test]
fn find_links_to_any_returns_the_union_with_boundary_correctness() {
    let tmp = empty_store();

    let fixtures = [
        // Two distinct targets, each with its own dedicated linker.
        (
            "wiki/links-sarah.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
        ),
        (
            "wiki/links-acme.md",
            "---\ntype: wiki-page\nsummary: s\n---\nDeal with [[records/companies/acme|Acme]].\n",
        ),
        // A single file linking to BOTH targets: it must show up exactly once,
        // i.e. multiple-target hits in one file collapse to one result row.
        (
            "records/meetings/2026/05/m.md",
            "---\ntype: meeting\nsummary: s\n---\n[[records/contacts/sarah-chen]] re [[records/companies/acme]]\n",
        ),
        // Prefix-sharing sibling: a link to `sarah-chen-jr` is NOT a link to
        // `sarah-chen`, even with `sarah-chen` present as an alternation arm.
        (
            "wiki/links-jr.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen-jr]]\n",
        ),
        // Links to neither requested target.
        (
            "wiki/unrelated.md",
            "---\ntype: wiki-page\nsummary: s\n---\n[[wiki/themes/spend]]\n",
        ),
    ];
    for (rel, body) in fixtures {
        write(tmp.path(), rel, body);
    }

    let store = open(&tmp);
    let targets = [
        PathBuf::from("records/contacts/sarah-chen"),
        PathBuf::from("records/companies/acme"),
    ];

    let batch = store.find_links_to_any(&targets).unwrap();
    let got = rels(&batch);
    assert_eq!(
        got,
        vec![
            "records/meetings/2026/05/m.md".to_string(),
            "wiki/links-acme.md".to_string(),
            "wiki/links-sarah.md".to_string(),
        ],
        "batch finder must return the deduped union of linkers across all targets, excluding the prefix-sibling and the unrelated file"
    );

    // Equivalence check: one batch scan must agree with the union of the
    // single-target finder run once per target. The working-set path relies
    // on exactly this when it folds one-scan-per-object into a single scan.
    let per_target: std::collections::BTreeSet<PathBuf> = targets
        .iter()
        .flat_map(|target| store.find_links_to(target).unwrap())
        .collect();
    assert_eq!(
        rels(&per_target.into_iter().collect::<Vec<_>>()),
        got,
        "find_links_to_any must equal the union of per-target find_links_to"
    );
}
1774
/// With no targets there is nothing to look for, so the result must be empty.
/// The hazard being guarded is an empty pattern compiling to a
/// match-everything regex, which would report every `.md` file as a linker.
/// This is the fast path the `validate` loop takes when nothing changed.
#[test]
fn find_links_to_any_empty_targets_matches_nothing() {
    let tmp = empty_store();
    write(
        tmp.path(),
        "wiki/a.md",
        "---\ntype: wiki-page\nsummary: s\n---\n[[records/contacts/sarah-chen]]\n",
    );
    let store = open(&tmp);

    let for_no_targets = store.find_links_to_any(&[]).unwrap();
    assert!(
        for_no_targets.is_empty(),
        "no targets ⇒ no linkers (an empty pattern must not match every file)"
    );

    // Targets that are empty or render to no link text behave the same way:
    // a no-op, never a match-everything.
    let blank_targets = [PathBuf::from(""), PathBuf::from("./")];
    let for_blank_targets = store.find_links_to_any(&blank_targets).unwrap();
    assert!(
        for_blank_targets.is_empty(),
        "targets that render to empty link text contribute no alternation arm"
    );
}
1804
1805 // ── read_type_index ──────────────────────────────────────────────────────
1806
#[test]
fn read_type_index_parses_records_and_flattens_fields() {
    let tmp = empty_store();
    // Two sidecar lines: the first carries every typed column plus two extra
    // keys, the second omits `tags`/`links` and has null timestamps.
    let sidecar_text = concat!(
        "{\"path\":\"records/expenses/2026/05/a.md\",\"type\":\"expense\",\"summary\":\"lunch\",\"tags\":[\"meals\"],\"links\":[\"records/companies/acme\"],\"created\":\"2026-05-01T00:00:00Z\",\"updated\":\"2026-05-01T00:00:00Z\",\"vendor\":\"acme\",\"amount\":42}\n",
        "{\"path\":\"records/expenses/2026/05/b.md\",\"type\":\"expense\",\"summary\":\"taxi\",\"created\":null,\"updated\":null,\"vendor\":\"yellow\"}\n",
    );
    let sidecar = write(tmp.path(), "records/expenses/index.jsonl", sidecar_text);
    let store = open(&tmp);
    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();

    assert_eq!(recs.len(), 2);
    let (full, sparse) = (&recs[0], &recs[1]);

    // Records come back ordered by path ascending, so `a.md` is first.
    assert_eq!(full.path, PathBuf::from("records/expenses/2026/05/a.md"));
    assert_eq!(full.type_, "expense");
    assert_eq!(full.summary, "lunch");
    assert_eq!(full.tags, vec!["meals".to_string()]);
    assert_eq!(full.links, vec!["records/companies/acme".to_string()]);
    assert!(full.created.is_some());

    // Keys that are not typed columns land in the flat `fields` map.
    assert_eq!(full.fields.get("vendor"), Some(&serde_json::json!("acme")));
    assert_eq!(full.fields.get("amount"), Some(&serde_json::json!(42)));

    // Absent `tags` / `links` default to empty lists.
    assert!(sparse.tags.is_empty());
    assert!(sparse.links.is_empty());
}
1837
#[test]
fn read_type_index_last_write_wins_and_skips_blanks() {
    let tmp = empty_store();
    // The same path appears on two lines separated by a blank line. The blank
    // line must be skipped silently and the later line must replace the
    // earlier one.
    let sidecar_text = concat!(
        "{\"path\":\"records/contacts/sarah.md\",\"type\":\"contact\",\"summary\":\"old\",\"created\":null,\"updated\":null}\n",
        "\n",
        "{\"path\":\"records/contacts/sarah.md\",\"type\":\"contact\",\"summary\":\"new\",\"created\":null,\"updated\":null}\n",
    );
    let sidecar = write(tmp.path(), "records/contacts/index.jsonl", sidecar_text);
    let store = open(&tmp);

    let recs = store.read_type_index(&store.abs_path(&sidecar)).unwrap();
    assert_eq!(recs.len(), 1, "duplicate path collapses to one record");
    assert_eq!(recs[0].summary, "new", "later line must win");
}
1855
#[test]
fn read_type_index_errors_on_malformed_line() {
    // A sidecar line that is not valid JSON must surface as `BadTypeIndex`.
    let tmp = empty_store();
    let sidecar = write(
        tmp.path(),
        "records/contacts/index.jsonl",
        "{not valid json}\n",
    );
    let store = open(&tmp);

    let outcome = store.read_type_index(&store.abs_path(&sidecar));
    let failure = outcome.unwrap_err();
    assert!(matches!(failure, StoreError::BadTypeIndex { .. }));
}
1865
1866 // ── find_by_type / find_by_where ─────────────────────────────────────────
1867
/// Renders one newline-terminated `index.jsonl` line for test fixtures.
///
/// `path`, `type_` and `summary` are spliced in as JSON string values with no
/// escaping; `created` and `updated` are always `null`. `extra` is inserted
/// verbatim just before the closing brace, so a caller adding keys must
/// supply the leading comma itself (e.g. `",\"vendor\":\"acme\""`).
fn jsonl_line(path: &str, type_: &str, summary: &str, extra: &str) -> String {
    [
        "{\"path\":\"",
        path,
        "\",\"type\":\"",
        type_,
        "\",\"summary\":\"",
        summary,
        "\",\"created\":null,\"updated\":null",
        extra,
        "}\n",
    ]
    .concat()
}
1873
#[test]
fn find_by_type_reads_canonical_folder_sidecar() {
    let tmp = empty_store();
    let base = tmp.path();

    // `contact` lives canonically under records/contacts; its sidecar holds
    // two records, deliberately written out of path order.
    let contacts = [
        jsonl_line("records/contacts/sarah.md", "contact", "Sarah", ""),
        jsonl_line("records/contacts/elena.md", "contact", "Elena", ""),
    ]
    .concat();
    write(base, "records/contacts/index.jsonl", &contacts);

    // Another type's sidecar, which a contact query must not pick up.
    let companies = jsonl_line("records/companies/acme.md", "company", "Acme", "");
    write(base, "records/companies/index.jsonl", &companies);

    let store = open(&tmp);
    let recs = store.find_by_type("contact").unwrap();

    let names: Vec<_> = recs.iter().map(|rec| rec.summary.clone()).collect();
    // Results are path-sorted, so elena.md precedes sarah.md.
    assert_eq!(names, vec!["Elena".to_string(), "Sarah".to_string()]);
    assert!(recs.iter().all(|rec| rec.type_ == "contact"));
}
1897
#[test]
fn find_by_type_canonical_absent_falls_back_within_the_layer_only() {
    let tmp = empty_store();
    let base = tmp.path();

    // A custom `proposal` record filed under the natural plural folder
    // `records/proposals/`. The canonical folder for an unrecognized type is
    // the bare type name (`records/proposal`), so no canonical sidecar exists
    // and `find_by_type` has to fall back. The fallback covers the type's own
    // layer (records), so this same-layer, non-canonical record is found.
    let in_layer = jsonl_line("records/proposals/p1.md", "proposal", "Q3 proposal", "");
    write(base, "records/proposals/index.jsonl", &in_layer);

    // A decoy of the SAME type in a DIFFERENT layer (sources/). A whole-store
    // fallback would leak it into the result; the layer-bounded fallback must
    // not, which also keeps the fallback proportional to the layer rather
    // than to the store.
    let decoy = jsonl_line(
        "sources/proposals/leak.md",
        "proposal",
        "cross-layer decoy",
        "",
    );
    write(base, "sources/proposals/index.jsonl", &decoy);

    let store = open(&tmp);
    let found = store.find_by_type("proposal").unwrap();
    assert_eq!(
        found.len(),
        1,
        "only the records-layer proposal, not the sources decoy"
    );
    let only = &found[0];
    assert_eq!(only.summary, "Q3 proposal");
    assert_eq!(only.path, PathBuf::from("records/proposals/p1.md"));
}
1938
#[test]
fn find_by_type_canonical_absent_does_not_read_other_layers() {
    // `email` maps to `sources/emails` (Sources layer). With no sidecar there,
    // `find_by_type("email")` falls back, but only inside the Sources layer.
    // The populated Records-layer sidecar written below sits outside that
    // walk root, so a store-wide fallback reading and filtering it (wasted
    // O(store) I/O) is exactly what this test rules out.
    let tmp = empty_store();
    let contact_line = jsonl_line("records/contacts/sarah.md", "contact", "Sarah", "");
    write(tmp.path(), "records/contacts/index.jsonl", &contact_line);
    let store = open(&tmp);

    // No email exists anywhere, so the answer is empty.
    let emails = store.find_by_type("email").unwrap();
    assert!(emails.is_empty());
}
1959
#[test]
fn find_by_where_matches_typed_columns_and_flat_fields() {
    let tmp = empty_store();
    let base = tmp.path();

    // Two expenses (one tagged, both with a `vendor` extra) and one contact.
    let expenses = format!(
        "{}{}",
        jsonl_line(
            "records/expenses/a.md",
            "expense",
            "lunch",
            ",\"vendor\":\"acme\",\"tags\":[\"meals\"]",
        ),
        jsonl_line(
            "records/expenses/b.md",
            "expense",
            "taxi",
            ",\"vendor\":\"yellow\"",
        ),
    );
    write(base, "records/expenses/index.jsonl", &expenses);

    let contacts = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        ",\"tags\":[\"customer\"]",
    );
    write(base, "records/contacts/index.jsonl", &contacts);

    let store = open(&tmp);

    // A key that only exists in the flat `fields` map.
    let acme_hits = store.find_by_where("vendor", "acme").unwrap();
    assert_eq!(acme_hits.len(), 1);
    assert_eq!(acme_hits[0].path, PathBuf::from("records/expenses/a.md"));

    // The typed `type` column: both expense records qualify.
    let expense_hits = store.find_by_where("type", "expense").unwrap();
    assert_eq!(expense_hits.len(), 2);

    // The typed list column `tags` matches by membership.
    let customer_hits = store.find_by_where("tags", "customer").unwrap();
    assert_eq!(customer_hits.len(), 1);
    assert_eq!(
        customer_hits[0].path,
        PathBuf::from("records/contacts/sarah.md")
    );

    // A value nobody carries yields an empty result.
    let nobody_hits = store.find_by_where("vendor", "nobody").unwrap();
    assert!(nobody_hits.is_empty());
}
2010
#[test]
fn find_by_where_matches_timestamps_across_rfc3339_spellings() {
    let tmp = empty_store();
    // One meeting whose `created` uses the common `Z` UTC spelling and whose
    // `updated` carries an explicit -07:00 offset, stored verbatim in the
    // sidecar.
    write(
        tmp.path(),
        "records/meetings/index.jsonl",
        concat!(
            "{\"path\":\"records/meetings/kickoff.md\",\"type\":\"meeting\",",
            "\"summary\":\"kickoff\",\"created\":\"2026-05-01T00:00:00Z\",",
            "\"updated\":\"2026-05-02T09:30:00-07:00\"}\n",
        ),
    );
    let store = open(&tmp);
    let query = |field: &str, value: &str| store.find_by_where(field, value).unwrap();

    // The exact `Z` form an agent would read out of the file must match.
    let by_z = query("created", "2026-05-01T00:00:00Z");
    assert_eq!(by_z.len(), 1);
    assert_eq!(by_z[0].path, PathBuf::from("records/meetings/kickoff.md"));

    // The same instant spelled with an explicit +00:00 offset matches too.
    assert_eq!(query("created", "2026-05-01T00:00:00+00:00").len(), 1);

    // A stored non-UTC value matches its own offset spelling and the same
    // instant written as `Z`: comparison is by instant, not by string.
    assert_eq!(query("updated", "2026-05-02T09:30:00-07:00").len(), 1);
    assert_eq!(query("updated", "2026-05-02T16:30:00Z").len(), 1);

    // One second later is a different instant and must not match.
    assert!(query("created", "2026-05-01T00:00:01Z").is_empty());
    // A value that is not RFC 3339 never matches a real timestamp.
    assert!(query("created", "2026-05-01").is_empty());
}
2070
#[test]
fn find_by_where_in_layer_reads_only_that_layers_sidecars() {
    // Contract under test: a layer-scoped `where` read walks ONLY the named
    // layer's subtree. The proof is structural. A *malformed* sidecar is
    // planted in another layer; reading it would make the call fail, so a
    // scoped read that succeeds shows the other layer's I/O never happened.
    let tmp = empty_store();
    let base = tmp.path();

    let company_line = jsonl_line(
        "records/companies/acme.md",
        "company",
        "Acme",
        ",\"domain\":\"acme.com\"",
    );
    write(base, "records/companies/index.jsonl", &company_line);

    // The sources-layer sidecar is deliberately corrupt.
    write(
        base,
        "sources/emails/index.jsonl",
        "{ this is not valid json and would error if read }\n",
    );
    let store = open(&tmp);

    // Records scope: the corrupt sources sidecar is out of reach, so the read
    // succeeds and yields just the records-layer match.
    let in_records = store
        .find_by_where_in("domain", "acme.com", Some(Layer::Records))
        .expect("a records-scoped read must not touch the sources sidecar");
    let record_paths = in_records
        .iter()
        .map(|rec| rec.path.clone())
        .collect::<Vec<_>>();
    assert_eq!(
        rels(&record_paths),
        vec!["records/companies/acme.md".to_string()]
    );

    // Unscoped: the walk covers every layer, reaches the corrupt sidecar and
    // reports a parse error. This confirms the corrupt file really is in the
    // tree and that only the layer scope shielded the previous call.
    let store_wide = store.find_by_where("domain", "acme.com");
    assert!(
        matches!(store_wide, Err(StoreError::BadTypeIndex { .. })),
        "unscoped read walks every layer and hits the corrupt sidecar"
    );

    // Sources scope: the corrupt sidecar is inside the scope, so the error
    // still surfaces. The scope is a genuine subtree bound, not a silent
    // "skip anything that fails".
    let in_sources = store.find_by_where_in("domain", "acme.com", Some(Layer::Sources));
    assert!(matches!(in_sources, Err(StoreError::BadTypeIndex { .. })));
}
2128
#[test]
fn find_by_where_in_missing_layer_is_empty_not_an_error() {
    // Scoping a read to a layer whose folder has not been created yet must
    // produce an empty result (the same missing-dir guard `walk_layer` has),
    // not a walk error from `ignore` over a nonexistent path.
    let tmp = empty_store();
    let contact_line = jsonl_line(
        "records/contacts/sarah.md",
        "contact",
        "Sarah",
        ",\"city\":\"denver\"",
    );
    write(tmp.path(), "records/contacts/index.jsonl", &contact_line);
    let store = open(&tmp);

    // Nothing was ever written under `wiki/`.
    let wiki_hits = store
        .find_by_where_in("city", "denver", Some(Layer::Wiki))
        .expect("missing layer subtree is empty, not an error");
    assert!(wiki_hits.is_empty());

    // The identical query against the layer holding the record finds it.
    let record_hits = store
        .find_by_where_in("city", "denver", Some(Layer::Records))
        .unwrap();
    assert_eq!(record_hits.len(), 1);
}
2160
2161 // ── abs_path / rel_path ──────────────────────────────────────────────────
2162
#[test]
fn abs_and_rel_path_roundtrip() {
    let tmp = empty_store();
    let store = open(&tmp);

    // A store-relative path resolves under the store directory, and mapping
    // the result back recovers the original relative path.
    let relative = Path::new("records/contacts/sarah.md");
    let absolute = store.abs_path(relative);
    assert_eq!(absolute, tmp.path().join(relative));
    assert_eq!(store.rel_path(&absolute).as_deref(), Some(relative));

    // Feeding an already-absolute path to `abs_path` returns it unchanged.
    assert_eq!(store.abs_path(&absolute), absolute);

    // A path that is not under the store has no store-relative form.
    let outside = Path::new("/somewhere/else.md");
    assert_eq!(store.rel_path(outside), None);
}
2178
2179 // ── infer_type_from_path (inverse of default_type_folder) ────────────────
2180
#[test]
fn infer_type_maps_every_recognized_folder_back_to_its_type() {
    // (file path, type expected to be inferred from its folder)
    let table: [(&str, &str); 11] = [
        ("sources/emails/x.md", "email"),
        ("sources/transcripts/x.md", "transcript"),
        ("sources/docs/x.md", "pdf-source"),
        ("records/contacts/x.md", "contact"),
        ("records/companies/x.md", "company"),
        ("records/expenses/x.md", "expense"),
        ("records/meetings/x.md", "meeting"),
        ("records/decisions/x.md", "decision"),
        ("records/invoices/x.md", "invoice"),
        // Every wiki sub-folder infers `wiki-page`, whatever the topic name.
        ("wiki/topics/x.md", "wiki-page"),
        ("wiki/pricing/x.md", "wiki-page"),
    ];
    for &(path, expected) in table.iter() {
        let inferred = infer_type_from_path(Path::new(path));
        assert_eq!(
            inferred.as_deref(),
            Some(expected),
            "path {path} should infer type {expected}"
        );
    }
}
2205
#[test]
fn infer_type_round_trips_with_default_type_folder() {
    // Invariant: inference is the inverse of the forward map. Sending each
    // recognized type through `default_type_folder` and then back through
    // `infer_type_from_path` must yield the type we started with.
    // `wiki-page` is the only many-to-one case (every topic folder infers
    // `wiki-page`), so its forward folder round-trips as well.
    for type_ in [
        "email",
        "transcript",
        "pdf-source",
        "contact",
        "company",
        "expense",
        "meeting",
        "decision",
        "invoice",
        "wiki-page",
    ] {
        let folder = default_type_folder(type_);
        let inferred = infer_type_from_path(&folder.join("x.md"));
        assert_eq!(
            inferred.as_deref(),
            Some(type_),
            "recognized type {type_} (folder {folder:?}) must round-trip"
        );
    }
}
2235
#[test]
fn infer_type_round_trips_custom_types_verbatim_no_singularization() {
    // Regression guard for the CLI/core divergence. For an unrecognized type
    // `default_type_folder` uses the BARE type name (`task → records/task`,
    // `tasks → records/tasks`). Inference therefore must NOT singularize:
    // otherwise `records/tasks` would infer `task`, colliding with
    // `default_type_folder("task") → records/task`.
    let customs = ["task", "tasks", "playbook", "process", "okrs", "ticket"];
    for custom in customs {
        let folder = default_type_folder(custom);
        assert_eq!(folder, PathBuf::from("records").join(custom));

        let inferred = infer_type_from_path(&folder.join("x.md"));
        assert_eq!(
            inferred.as_deref(),
            Some(custom),
            "custom type {custom} must round-trip verbatim (no singularization)"
        );
    }

    // The exact case from the finding: a plural custom folder keeps its
    // trailing `s` rather than being reduced to `task`.
    let plural = infer_type_from_path(Path::new("records/tasks/x.md"));
    assert_eq!(
        plural.as_deref(),
        Some("tasks"),
        "records/tasks must infer `tasks`, not `task`"
    );
}
2262
#[test]
fn infer_type_requires_three_component_layer_folder_file_shape() {
    // Paths with fewer than three components have no type-folder segment (a
    // file directly under a layer, or a bare file name), so nothing can be
    // inferred. This matches the old CLI contract.
    for shallow in ["records/x.md", "sources/x.md", "wiki/x.md", "x.md"] {
        assert_eq!(infer_type_from_path(Path::new(shallow)), None);
    }

    // A leading component that is not a known layer is never inferred.
    let unknown_layer = Path::new("foo/bar/x.md");
    assert_eq!(infer_type_from_path(unknown_layer), None);

    // Deeper paths still infer from the first type-folder segment, e.g. a
    // date-sharded record under records/expenses/2026/05/.
    let sharded = Path::new("records/expenses/2026/05/x.md");
    assert_eq!(infer_type_from_path(sharded).as_deref(), Some("expense"),);
}
2280}