Skip to main content

dbmd_core/
index.rs

1//! `index` — the hierarchical content catalog.
2//!
3//! A uniform three-level tree: root + per-layer + per-type-folder. **Two
4//! artifacts per type-folder:** the human `index.md` (capped 500, recency
5//! browse) and the machine `index.jsonl` (complete, structured — one JSON
6//! object per file). Both read `summary` + key frontmatter fields + links
7//! directly from each file — there is no extraction logic here.
8//!
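//! **Maintained write-through** by the write commands ([`Index::on_write`] /
//! [`Index::on_rename`] / [`Index::on_remove`] — the loop path: it reads only
//! `index.jsonl` sidecars and never walks the content tree, though the rollup
//! refresh currently reads every type-folder's sidecar, so it is not strictly
//! O(changed)); [`Index::rebuild_all`] is the from-scratch SWEEP repair.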
12//!
13//! **Key invariant:** write-through must produce a byte-identical `index.md`
14//! and (post-compaction) `index.jsonl` to a full [`Index::rebuild_all`] over
15//! the same end state — the loop path can never drift from the repair path.
16//!
17//! # Implementation notes (deviations the reader should know)
18//!
19//! - **Self-contained, by design.** This module does its own shard-aware folder
20//!   walk, its own minimal frontmatter read, and its own atomic write, using
21//!   only `store.root` (a public field) and the `serde_norway` / `serde_json` /
22//!   `chrono` / `walkdir` crates rather than routing through the sibling
23//!   `store`/`parser` helpers ([`Store::walk_type_folder`],
24//!   [`Store::recent_in_type_folder`], [`parser::read_file`], …). The index has
25//!   to stamp a *deterministic* `updated:` and emit a *canonical, compacted*
26//!   `index.jsonl` (see the two notes below); keeping the read/walk/write local
27//!   is what makes the byte-identity invariant a true byte comparison, free of
28//!   any incidental formatting the shared readers might introduce. The public
29//!   signatures in `lib.rs` are untouched.
30//! - **Deterministic `updated:` on the index files themselves.** An index's own
31//!   `updated` frontmatter is derived as the max `updated` over the files it
32//!   catalogs (max over children for root/layer) — NOT wall-clock-now. This is
33//!   what makes the byte-identity invariant a *true* byte comparison: a
34//!   write-through write and a `rebuild_all` over the same end state stamp the
35//!   same value. (The SPEC's rendered examples show a wall-clock-looking value;
36//!   the conventions list only requires `updated: <RFC3339>`, and the
37//!   property-tested invariant dominates.)
38//! - **`index.jsonl` is always compacted.** Write-through rewrites the affected
39//!   type-folder's jsonl in canonical form (one current line per path, recency
40//!   order) rather than appending superseded/tombstone lines, so the jsonl is
41//!   byte-identical to `rebuild_all` *immediately* (a strictly stronger
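//!   guarantee than the SPEC's "post-compaction"). The per-folder cost is one
//!   sidecar read + one rewrite per touched type-folder — O(folder), the
//!   sanctioned loop primitive, never a whole-`Store::walk`. (The layer/root
//!   rollup refresh that follows reads every type-folder's sidecar, so the
//!   write as a whole is not O(changed).)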
45//! - **Root/layer entry styling** follows plan §index (`(N)` numeric counts;
46//!   layer headings in the root carry the layer's total count) which is more
47//!   specific than the SPEC's illustrative `(42 files)` prose example. Type
48//!   folders are listed alphabetically (a deterministic order a derived artifact
49//!   needs); `scope: type-folder` follows the conventions list, not the one
50//!   SPEC example that wrote `scope: folder`.
51
52use std::collections::BTreeMap;
53use std::fs;
54use std::io::Write as _;
55use std::path::{Path, PathBuf};
56
57use chrono::{DateTime, FixedOffset, SecondsFormat};
58use serde::{Deserialize, Serialize};
59use serde_json::Value;
60
61use crate::parser::FolderMeta;
62use crate::store::{Layer, Store};
63
/// The browse-view cap for a type-folder `index.md`: at most this many entries
/// are rendered; a folder holding more gets a `## More` footer after them.
const MD_CAP: usize = 500;

/// Placeholder summary for a content file that has no `summary` frontmatter.
/// The index never invents a real summary — that is `dbmd fm init`'s job; this
/// marker is what `dbmd validate` keys off (`INDEX`-class issue).
const MISSING_SUMMARY: &str = "(no summary)";

/// The root `index.md` H1 (rendered as `# <ROOT_TITLE>`).
const ROOT_TITLE: &str = "Knowledge base index";
74
/// Which level of the catalog an [`Index`] represents.
///
/// The catalog is a three-level tree: the store root, one node per layer, and
/// one node per type-folder. Only the type-folder level has an `index.jsonl`
/// twin; root and layer are markdown-only rollups.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IndexLevel {
    /// The store-wide root `index.md` (layers + per-type counts).
    Root,
    /// A layer `index.md` (every type-folder under one layer).
    Layer(Layer),
    /// A type-folder `index.md` + `index.jsonl` (every file in the folder).
    /// The payload is the folder's path; [`Index::build_type_folder`] stores
    /// the normalized form of the path it was given.
    TypeFolder(PathBuf),
}
85
/// One record in a type-folder's `index.jsonl` — the complete, structured twin
/// of a single `index.md` browse entry.
///
/// `tags` are the document's flat labels; `links` are its concept/relationship
/// wiki-link targets. Both are copied verbatim from the file — never inferred.
/// `fields` holds the remaining type-specific frontmatter so the structured
/// query path can filter on any key without opening the file.
///
/// The declaration order of the fields below is the key order of each emitted
/// JSON line, so reordering them changes the bytes of every `index.jsonl`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct IndexRecord {
    /// Store-relative path of the file (the upsert key; last-write-wins).
    /// Serialized with forward slashes regardless of OS (see [`path_serde`]) so
    /// the `index.jsonl` catalog is byte-portable across platforms.
    #[serde(with = "path_serde")]
    pub path: PathBuf,
    /// The file's `type`. Named `type_` in Rust because `type` is a keyword;
    /// the JSON key is plain `type`.
    #[serde(rename = "type")]
    pub type_: String,
    /// The file's `summary`.
    pub summary: String,
    /// The file's flat `tags`. A line without this key deserializes to an
    /// empty list.
    #[serde(default)]
    pub tags: Vec<String>,
    /// The file's concept/relationship wiki-link targets (store-relative).
    /// A line without this key deserializes to an empty list.
    #[serde(default)]
    pub links: Vec<String>,
    /// `created` timestamp.
    pub created: Option<DateTime<FixedOffset>>,
    /// `updated` timestamp (the recency key for the `index.md` cap order).
    pub updated: Option<DateTime<FixedOffset>>,
    /// Remaining type-specific frontmatter fields, verbatim. Flattened, so
    /// these keys sit at the top level of the JSON object next to the declared
    /// fields; the `BTreeMap` keeps them in sorted key order.
    #[serde(flatten)]
    pub fields: BTreeMap<String, Value>,
}
119
/// A built (or being-built) catalog for one [`IndexLevel`], with both rendered
/// artifacts available. Pure data until written via [`Index::write_level`].
///
/// Exactly one of `records` / `child_counts` is populated by the builders:
/// type-folder catalogs fill `records`, root/layer rollups fill `child_counts`.
#[derive(Debug, Clone, PartialEq)]
pub struct Index {
    /// Which level this catalog is for.
    pub level: IndexLevel,
    /// The complete record set for this level (type-folder level; empty for
    /// root/layer rollups, which carry only counts).
    pub records: Vec<IndexRecord>,
    /// Per-child counts for root/layer rollups (child path → file count).
    /// The builders only insert children with a non-zero count.
    pub child_counts: BTreeMap<PathBuf, usize>,
}
132
133impl Index {
134    /// Build a type-folder catalog by aggregating across date-shards, producing
135    /// both artifacts. `index.md` selection is recency (updated desc, ties by
136    /// path asc; cap 500 with a `## More` footer over the cap); `index.jsonl`
137    /// holds every file. A file missing `summary` gets a placeholder + a
138    /// validate-detectable issue (the index never invents summaries).
139    pub fn build_type_folder(store: &Store, type_folder: &Path) -> crate::Result<Index> {
140        let rel = normalize_rel(type_folder);
141        let abs = store.root.join(&rel);
142        let mut records = Vec::new();
143        for file_abs in walk_type_folder_files(&abs) {
144            let rel_path =
145                rel_to_store(&store.root, &file_abs).expect("walked file is under the store root");
146            // Abort the build on a malformed file rather than skip it. A skipped
147            // file would still be a content member the validator requires to be
148            // catalogued (`validate::walk_content_files` enumerates by filename,
149            // not by parseability), so silently dropping it would leave the store
150            // in a permanently invalid state (`INDEX_MISSING_ENTRY` /
151            // `INDEX_JSONL_DESYNC` that no rebuild can clear) and would desync the
152            // rollups (`build_layer`/`build_root` count the raw `.md` files). The
153            // loud `?` is the right outcome: `cleanup` now preserves the prior
154            // canonical sidecars (`min_depth(2)`), so an aborted rebuild leaves
155            // the existing catalogs intact and the operator a clear error naming
156            // the file to fix — never a destroyed or silently-wrong index.
157            records.push(record_from_file(&file_abs, rel_path)?);
158        }
159        sort_records(&mut records);
160        Ok(Index {
161            level: IndexLevel::TypeFolder(rel),
162            records,
163            child_counts: BTreeMap::new(),
164        })
165    }
166
167    /// Build a layer catalog: every non-empty type-folder under the layer with
168    /// `(N)` counts and a newest-file `summary` preview (≤ 80 chars).
169    pub fn build_layer(store: &Store, layer: Layer) -> crate::Result<Index> {
170        let mut child_counts = BTreeMap::new();
171        for tf in type_folders_in_layer(store, layer) {
172            let abs = store.root.join(&tf);
173            let n = walk_type_folder_files(&abs).len();
174            if n > 0 {
175                child_counts.insert(tf, n);
176            }
177        }
178        Ok(Index {
179            level: IndexLevel::Layer(layer),
180            records: Vec::new(),
181            child_counts,
182        })
183    }
184
185    /// Build the store-wide root catalog: one heading per non-empty layer with
186    /// total count + bulleted per-type sub-entries with `(N)` counts.
187    pub fn build_root(store: &Store) -> crate::Result<Index> {
188        let mut child_counts = BTreeMap::new();
189        for layer in Layer::all() {
190            for tf in type_folders_in_layer(store, layer) {
191                let abs = store.root.join(&tf);
192                let n = walk_type_folder_files(&abs).len();
193                if n > 0 {
194                    child_counts.insert(tf, n);
195                }
196            }
197        }
198        Ok(Index {
199            level: IndexLevel::Root,
200            records: Vec::new(),
201            child_counts,
202        })
203    }
204
205    /// Render this catalog as a canonical `index.md`.
206    pub fn to_markdown(&self) -> String {
207        match &self.level {
208            IndexLevel::TypeFolder(folder) => self.render_type_folder_md(folder),
209            IndexLevel::Layer(layer) => self.render_layer_md(*layer),
210            IndexLevel::Root => self.render_root_md(),
211        }
212    }
213
214    /// Render this type-folder catalog as the complete `index.jsonl` (one JSON
215    /// object per file, stable key order so diffs stay minimal). Type-folder
216    /// level only — root and layer stay markdown rollups.
217    pub fn to_jsonl(&self) -> String {
218        let mut out = String::new();
219        for rec in &self.records {
220            // The record type derives a deterministic, sorted key order
221            // (declared fields first, then the flattened `fields` BTreeMap).
222            let line = serde_json::to_string(rec).expect("IndexRecord serializes");
223            out.push_str(&line);
224            out.push('\n');
225        }
226        out
227    }
228
229    // ── rendering helpers ────────────────────────────────────────────────
230
231    fn render_type_folder_md(&self, folder: &Path) -> String {
232        let folder_disp = path_to_unix(folder);
233        let updated = max_updated(self.records.iter().map(|r| r.updated.as_ref()));
234        let mut s = String::new();
235        s.push_str("---\n");
236        s.push_str("type: index\n");
237        s.push_str("scope: type-folder\n");
238        s.push_str(&format!("folder: {folder_disp}\n"));
239        if let Some(ts) = updated {
240            s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
241        }
242        s.push_str("---\n\n");
243        s.push_str(&format!("# {folder_disp}\n\n"));
244
245        let shown = self.records.len().min(MD_CAP);
246        for rec in self.records.iter().take(shown) {
247            s.push_str(&format_md_entry(rec));
248            s.push('\n');
249        }
250
251        if self.records.len() > MD_CAP {
252            let type_ = self.records.first().map(|r| r.type_.as_str()).unwrap_or("");
253            let layer = folder
254                .components()
255                .next()
256                .and_then(|c| c.as_os_str().to_str())
257                .unwrap_or("");
258            s.push('\n');
259            s.push_str(&more_footer(self.records.len(), type_, layer));
260        }
261        s
262    }
263
264    /// Store-less layer rollup: counts only, no preview / no derived `updated`
265    /// (a layer index needs each child's on-disk jsonl for those — see
266    /// [`render_layer_md_with_store`], the canonical path every disk write
267    /// uses). This pure-data render is structurally identical sans preview.
268    fn render_layer_md(&self, layer: Layer) -> String {
269        let layer_dir = layer_dir_name(layer);
270        let mut s = String::new();
271        s.push_str("---\n");
272        s.push_str("type: index\n");
273        s.push_str("scope: layer\n");
274        s.push_str(&format!("folder: {layer_dir}\n"));
275        s.push_str("---\n\n");
276        s.push_str(&format!("# {layer_dir}\n\n"));
277        for (tf, n) in &self.child_counts {
278            let tf_unix = path_to_unix(tf);
279            let display = capitalize(folder_basename(tf));
280            s.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n})\n"));
281        }
282        s
283    }
284
285    /// Store-less root rollup: counts only (the canonical disk render adds a
286    /// derived `updated` — see [`render_root_md_with_store`]).
287    fn render_root_md(&self) -> String {
288        let mut s = String::new();
289        s.push_str("---\n");
290        s.push_str("type: index\n");
291        s.push_str("scope: root\n");
292        s.push_str("---\n\n");
293        s.push_str(&format!("# {ROOT_TITLE}\n"));
294        for layer in Layer::all() {
295            let layer_dir = layer_dir_name(layer);
296            let prefix = format!("{layer_dir}/");
297            let children: Vec<(&PathBuf, &usize)> = self
298                .child_counts
299                .iter()
300                .filter(|(tf, _)| path_to_unix(tf).starts_with(&prefix))
301                .collect();
302            if children.is_empty() {
303                continue;
304            }
305            let total: usize = children.iter().map(|(_, n)| **n).sum();
306            s.push('\n');
307            s.push_str(&format!("## {} ({total})\n", capitalize(layer_dir)));
308            for (tf, n) in children {
309                let tf_unix = path_to_unix(tf);
310                let display = capitalize(folder_basename(tf));
311                s.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n})\n"));
312            }
313        }
314        s
315    }
316}
317
318// ─────────────────────────────────────────────────────────────────────────
// Write-through + sweep (associated functions in a second `impl` block).
320// ─────────────────────────────────────────────────────────────────────────
321
322impl Index {
323    /// **Write-through (loop, O(changed)).** Upsert a new/updated content file.
324    /// Reads the affected type-folder's `index.jsonl` (the sanctioned per-folder
325    /// sidecar read — never a whole-store walk), applies the change, and
326    /// atomically rewrites that folder's `index.md` + `index.jsonl` plus the
327    /// parent layer + root rollups so the artifacts equal a `rebuild_all` over
328    /// the same end state.
329    pub fn on_write(store: &Store, file: &Path) -> crate::Result<()> {
330        let file_rel = normalize_rel(file);
331        // The generated catalog files are not content — never upsert one into
332        // itself. `build_type_folder`'s walk already excludes `index.md`
333        // (`walk_type_folder_files`); the loop path must apply the same
334        // exclusion or editing `index.md` via `fm set` inserts a phantom
335        // self-row, inflating every `(N)` count and breaking the
336        // write-through == rebuild byte-identity invariant.
337        if is_index_artifact(&file_rel) {
338            return Ok(());
339        }
340        let file_abs = store.root.join(&file_rel);
341        let folder = type_folder_of(&file_rel)
342            .ok_or_else(|| bad_index(&file_rel, "file is not inside a layer/type-folder"))?;
343        let record = record_from_file(&file_abs, file_rel.clone())?;
344
345        // Serialize the sidecar read-modify-write so concurrent sanctioned
346        // writes to this folder don't clobber each other's rows (lost update).
347        let _lock = FolderLock::acquire(&store.root.join(&folder));
348        let mut records = read_jsonl_records(&store.root.join(&folder).join("index.jsonl"))?;
349        records.retain(|r| r.path != record.path);
350        records.push(record);
351        sort_records(&mut records);
352
353        write_type_folder_artifacts(store, &folder, &records)?;
354        update_parents(store, &folder)?;
355        Ok(())
356    }
357
358    /// **Write-through (loop, O(changed)).** Move a file's entry between
359    /// type-folder indexes (or within, if the same folder) in both `index.md`
360    /// and `index.jsonl`, fixing counts on both sides.
361    pub fn on_rename(store: &Store, old: &Path, new: &Path) -> crate::Result<()> {
362        let old_rel = normalize_rel(old);
363        let new_rel = normalize_rel(new);
364        // Index artifacts are generated, not catalogued — a rename of/into one
365        // is not a content move (same reasoning as `on_write`). Skip rather than
366        // insert a phantom self-row.
367        if is_index_artifact(&old_rel) || is_index_artifact(&new_rel) {
368            return Ok(());
369        }
370        let old_folder = type_folder_of(&old_rel)
371            .ok_or_else(|| bad_index(&old_rel, "source is not inside a layer/type-folder"))?;
372        let new_folder = type_folder_of(&new_rel)
373            .ok_or_else(|| bad_index(&new_rel, "target is not inside a layer/type-folder"))?;
374
375        // Serialize the sidecar read-modify-write(s). For a cross-folder rename,
376        // lock BOTH folders, always in sorted order, so two renames touching the
377        // same pair can't deadlock. Held for the whole operation via RAII.
378        let _locks = lock_folders(store, &old_folder, &new_folder);
379
380        // Drop from the old folder.
381        let mut old_records =
382            read_jsonl_records(&store.root.join(&old_folder).join("index.jsonl"))?;
383        old_records.retain(|r| r.path != old_rel);
384
385        if old_folder == new_folder {
386            // Same folder: re-read the (now-renamed) file and upsert.
387            let record = record_from_file(&store.root.join(&new_rel), new_rel.clone())?;
388            old_records.retain(|r| r.path != record.path);
389            old_records.push(record);
390            sort_records(&mut old_records);
391            write_type_folder_artifacts(store, &old_folder, &old_records)?;
392            update_parents(store, &old_folder)?;
393            return Ok(());
394        }
395
396        // Cross-folder: write the trimmed old folder (or drop its indexes if
397        // now empty), then upsert into the new folder.
398        sort_records(&mut old_records);
399        write_type_folder_artifacts(store, &old_folder, &old_records)?;
400
401        let record = record_from_file(&store.root.join(&new_rel), new_rel.clone())?;
402        let mut new_records =
403            read_jsonl_records(&store.root.join(&new_folder).join("index.jsonl"))?;
404        new_records.retain(|r| r.path != record.path);
405        new_records.push(record);
406        sort_records(&mut new_records);
407        write_type_folder_artifacts(store, &new_folder, &new_records)?;
408
409        update_parents(store, &old_folder)?;
410        update_parents(store, &new_folder)?;
411        Ok(())
412    }
413
414    /// **Write-through (loop, O(changed)).** Drop a file's entry from both
415    /// `index.md` and `index.jsonl`; decrement counts; if the browse view drops
416    /// below the cap, the next-most-recent is already present in the complete
417    /// jsonl record set and re-renders into the md automatically.
418    pub fn on_remove(store: &Store, file: &Path) -> crate::Result<()> {
419        let file_rel = normalize_rel(file);
420        // Removing a generated catalog artifact is not a content removal; it has
421        // no row to drop (it was never catalogued). Skip, mirroring `on_write`.
422        if is_index_artifact(&file_rel) {
423            return Ok(());
424        }
425        let folder = type_folder_of(&file_rel)
426            .ok_or_else(|| bad_index(&file_rel, "file is not inside a layer/type-folder"))?;
427        // Serialize the sidecar read-modify-write (see `on_write`).
428        let _lock = FolderLock::acquire(&store.root.join(&folder));
429        let mut records = read_jsonl_records(&store.root.join(&folder).join("index.jsonl"))?;
430        let before = records.len();
431        records.retain(|r| r.path != file_rel);
432        if records.len() == before {
433            // Nothing to remove; still normalize the folder + parents so the
434            // artifacts stay canonical.
435        }
436        sort_records(&mut records);
437        write_type_folder_artifacts(store, &folder, &records)?;
438        update_parents(store, &folder)?;
439        Ok(())
440    }
441
442    /// **SWEEP repair.** Walk the store once and atomically (re)write root +
443    /// every non-empty layer + every non-empty type-folder `index.md` and
444    /// `index.jsonl` (compacting the jsonl). Also runs [`Index::cleanup`].
445    pub fn rebuild_all(store: &Store) -> crate::Result<()> {
446        Index::cleanup(store)?;
447        for layer in Layer::all() {
448            for tf in type_folders_in_layer(store, layer) {
449                let idx = Index::build_type_folder(store, &tf)?;
450                if idx.records.is_empty() {
451                    continue;
452                }
453                write_type_folder_artifacts(store, &tf, &idx.records)?;
454            }
455            let layer_idx = Index::build_layer(store, layer)?;
456            let layer_index_md = store.root.join(layer_dir_name(layer)).join("index.md");
457            if layer_idx.child_counts.is_empty() {
458                remove_if_exists(&layer_index_md)?;
459            } else {
460                write_atomic(
461                    &layer_index_md,
462                    render_layer_md_with_store(store, &layer_idx),
463                )?;
464            }
465        }
466        let root_idx = Index::build_root(store)?;
467        let root_index_md = store.root.join("index.md");
468        if root_idx.child_counts.is_empty() {
469            remove_if_exists(&root_index_md)?;
470        } else {
471            write_atomic(&root_index_md, render_root_md_with_store(store, &root_idx))?;
472        }
473        Ok(())
474    }
475
476    /// Rebuild ONE type-folder's `index.md`/`index.jsonl` from a fresh walk, then
477    /// cascade the new child count up to the layer and root rollups — so a
478    /// scoped `dbmd index rebuild --folder` leaves the hierarchy consistent,
479    /// exactly like `rebuild_all` and the loop-path `on_write` already do.
480    /// (Writing only the folder, as the CLI used to, left stale layer/root
481    /// counts that `validate` would then flag as an index desync.)
482    pub fn rebuild_folder(store: &Store, folder: &Path) -> crate::Result<()> {
483        Self::write_level(store, &IndexLevel::TypeFolder(folder.to_path_buf()))?;
484        update_parents(store, folder)
485    }
486
487    /// Atomically write a single level's artifact(s) to disk.
488    pub fn write_level(store: &Store, level: &IndexLevel) -> crate::Result<()> {
489        match level {
490            IndexLevel::TypeFolder(folder) => {
491                let idx = Index::build_type_folder(store, folder)?;
492                if idx.records.is_empty() {
493                    remove_if_exists(&store.root.join(folder).join("index.md"))?;
494                    remove_if_exists(&store.root.join(folder).join("index.jsonl"))?;
495                } else {
496                    write_type_folder_artifacts(store, folder, &idx.records)?;
497                }
498            }
499            IndexLevel::Layer(layer) => {
500                let idx = Index::build_layer(store, *layer)?;
501                let p = store.root.join(layer_dir_name(*layer)).join("index.md");
502                if idx.child_counts.is_empty() {
503                    remove_if_exists(&p)?;
504                } else {
505                    write_atomic(&p, render_layer_md_with_store(store, &idx))?;
506                }
507            }
508            IndexLevel::Root => {
509                let idx = Index::build_root(store)?;
510                let p = store.root.join("index.md");
511                if idx.child_counts.is_empty() {
512                    remove_if_exists(&p)?;
513                } else {
514                    write_atomic(&p, render_root_md_with_store(store, &idx))?;
515                }
516            }
517        }
518        Ok(())
519    }
520
521    /// Render the generated indexes to a string with `--- <path> ---`
522    /// separators instead of writing them (`--dry-run`).
523    pub fn render_dry_run(store: &Store, level: &IndexLevel) -> crate::Result<String> {
524        let mut out = String::new();
525        match level {
526            IndexLevel::TypeFolder(folder) => {
527                let idx = Index::build_type_folder(store, folder)?;
528                let md_path = path_to_unix(&folder.join("index.md"));
529                let jsonl_path = path_to_unix(&folder.join("index.jsonl"));
530                out.push_str(&format!("--- {md_path} ---\n"));
531                out.push_str(&idx.to_markdown());
532                out.push_str(&format!("--- {jsonl_path} ---\n"));
533                out.push_str(&idx.to_jsonl());
534            }
535            IndexLevel::Layer(layer) => {
536                let idx = Index::build_layer(store, *layer)?;
537                let md_path = format!("{}/index.md", layer_dir_name(*layer));
538                out.push_str(&format!("--- {md_path} ---\n"));
539                out.push_str(&render_layer_md_with_store(store, &idx));
540            }
541            IndexLevel::Root => {
542                let idx = Index::build_root(store)?;
543                out.push_str("--- index.md ---\n");
544                out.push_str(&render_root_md_with_store(store, &idx));
545            }
546        }
547        Ok(out)
548    }
549
550    /// Cleanup pass (part of [`Index::rebuild_all`]): delete `index.md` /
551    /// `index.jsonl` in non-canonical folders (date-shards that should carry
552    /// none). Symmetric with index creation.
553    ///
554    /// **Only deletes generated catalog artifacts, never user content.** Two
555    /// guards keep this from eating data:
556    /// - `min_depth(2)` so the walk starts *below* the type-folder root — the
557    ///   canonical `<type-folder>/index.md` + `index.jsonl` are never targeted
558    ///   here (they are rewritten by the per-folder builders, or removed only
559    ///   when the folder is genuinely empty, in the dedicated branch below). The
560    ///   old `min_depth(1)` deleted them up front, so a rebuild aborted by one
561    ///   malformed file left every type-folder catalog destroyed.
562    /// - [`is_deletable_catalog_artifact`] confirms a shard-level `index.md` is
563    ///   an actual generated catalog (or stale/garbage leftover), NOT a content
564    ///   file a user wrote at that name (e.g. `dbmd write …/index.md --type
565    ///   email`, plausible when mirroring a website/doc export). Matching by
566    ///   filename alone silently deleted such records on the next rebuild.
567    pub fn cleanup(store: &Store) -> crate::Result<()> {
568        for layer in Layer::all() {
569            let layer_dir = store.root.join(layer_dir_name(layer));
570            if !layer_dir.is_dir() {
571                continue;
572            }
573            for tf in type_folders_in_layer(store, layer) {
574                let tf_abs = store.root.join(&tf);
575                // Any generated index inside a shard (below the type-folder
576                // root) is non-canonical: delete it. Never touch a user content
577                // file that merely happens to be named index.md.
578                for entry in walkdir::WalkDir::new(&tf_abs)
579                    .min_depth(2)
580                    .into_iter()
581                    .filter_map(|e| e.ok())
582                {
583                    let p = entry.path();
584                    if is_index_artifact(p) && is_deletable_catalog_artifact(p) {
585                        remove_if_exists(p)?;
586                    }
587                }
588                // Empty type-folder → no index at its root either. Same content
589                // guard: an `index.md` here that is actually a user record (the
590                // only file in the folder) is preserved, not deleted.
591                if walk_type_folder_files(&tf_abs).is_empty() {
592                    let md = tf_abs.join("index.md");
593                    if is_deletable_catalog_artifact(&md) {
594                        remove_if_exists(&md)?;
595                    }
596                    remove_if_exists(&tf_abs.join("index.jsonl"))?;
597                }
598            }
599        }
600        Ok(())
601    }
602}
603
604// ─────────────────────────────────────────────────────────────────────────
605// Private free helpers — all self-contained, none call back into Store/parser.
606// ─────────────────────────────────────────────────────────────────────────
607
608/// Write both artifacts for a type-folder, or delete them if the folder is now
609/// empty. The single funnel both write-through and rebuild go through, so their
610/// output is byte-identical by construction.
611fn write_type_folder_artifacts(
612    store: &Store,
613    folder: &Path,
614    records: &[IndexRecord],
615) -> crate::Result<()> {
616    let folder_abs = store.root.join(folder);
617    let md_path = folder_abs.join("index.md");
618    let jsonl_path = folder_abs.join("index.jsonl");
619    if records.is_empty() {
620        remove_if_exists(&md_path)?;
621        remove_if_exists(&jsonl_path)?;
622        return Ok(());
623    }
624    let idx = Index {
625        level: IndexLevel::TypeFolder(folder.to_path_buf()),
626        records: records.to_vec(),
627        child_counts: BTreeMap::new(),
628    };
629    write_atomic(&md_path, idx.to_markdown())?;
630    write_atomic(&jsonl_path, idx.to_jsonl())?;
631    Ok(())
632}
633
634/// Re-render the layer + root rollups that sit above `folder` — the
635/// **loop path**, O(changed). Counts + previews come from the type-folders'
636/// on-disk `index.jsonl` sidecars ([`collect_child_stats`]), NOT from a
637/// content-tree walk: a single write reads one sidecar per type-folder (shared
638/// across the layer and root rollups) — never the millions of files under the
639/// shards. `build_layer` / `build_root` (which *do* walk the content tree) are
640/// reserved for the from-scratch sweeps ([`Index::rebuild_all`],
641/// [`Index::write_level`], [`Index::render_dry_run`]). The result is
642/// byte-identical to those builders because in the loop — exactly as in
643/// `rebuild_all` — every touched folder's jsonl is rewritten before its parents
644/// are rolled up, so the per-folder stat (`count` / `newest`) equals what a
645/// from-scratch walk would compute.
646fn update_parents(store: &Store, folder: &Path) -> crate::Result<()> {
647    // Read every type-folder's sidecar EXACTLY ONCE into a stat cache (`count` +
648    // `newest` record), then render both rollups from the cache. This removed the
649    // old 2–3×-per-write reparse (`child_counts_from_jsonl` for a count, plus
650    // `render_layer_md_with_store` / `render_root_md_with_store` each doing a full
651    // `read_jsonl_records` parse + sort just to take `.first()`); the output stays
652    // byte-identical (`count` == `read_jsonl_records().len()`, `newest` == its
653    // `.first()`).
654    //
655    // COST, stated honestly: this is `O(total catalogued records)` per write, NOT
656    // `O(changed)`. `collect_child_stats` reads and line-parses EVERY type-folder
657    // sidecar in the store to recompute the rollups, so a single high-volume
658    // folder (months of ingested emails) makes an unrelated tiny write scan that
659    // whole sidecar (a ~50× slowdown at ~200k records was measured). The crate's
660    // literal `Store::walk` guard holds — this reads `index.jsonl` sidecars, not
661    // the content tree — but the broader `O(changed)` complexity the loop path
662    // advertises is NOT met here. Restoring true `O(changed)` needs a persisted
663    // per-folder stat cache (or an in-place rollup patch for `on_write`); that is
664    // a deliberate change to the catalog hot path, tracked as a follow-up, not
665    // done inline. Until then, do not describe this op as `O(changed)`.
666    let stats = collect_child_stats(store, &Layer::all())?;
667
668    let layer = folder
669        .components()
670        .next()
671        .and_then(|c| c.as_os_str().to_str())
672        .and_then(layer_from_dir_name);
673    if let Some(layer) = layer {
674        let p = store.root.join(layer_dir_name(layer)).join("index.md");
675        if layer_has_children(&stats, layer) {
676            write_atomic(
677                &p,
678                render_layer_md_from_stats(layer, &stats, &store.config.folders),
679            )?;
680        } else {
681            remove_if_exists(&p)?;
682        }
683    }
684    let rp = store.root.join("index.md");
685    if stats.values().any(|s| s.count > 0) {
686        write_atomic(
687            &rp,
688            render_root_md_from_stats(&stats, &store.config.folders),
689        )?;
690    } else {
691        remove_if_exists(&rp)?;
692    }
693    Ok(())
694}
695
696/// True if `layer` has at least one non-empty child type-folder in `stats`.
697fn layer_has_children(stats: &BTreeMap<PathBuf, FolderStat>, layer: Layer) -> bool {
698    let prefix = format!("{}/", layer_dir_name(layer));
699    stats
700        .iter()
701        .any(|(tf, s)| s.count > 0 && path_to_unix(tf).starts_with(&prefix))
702}
703
704/// Render a layer `index.md` from the prebuilt per-folder stat cache — each
705/// child's count + newest summary/updated come from its single cached sidecar
706/// read, so the rollup matches the folder artifacts exactly (write-through and
707/// rebuild alike) without re-reading any sidecar.
708fn render_layer_md_from_stats(
709    layer: Layer,
710    stats: &BTreeMap<PathBuf, FolderStat>,
711    folders: &BTreeMap<String, FolderMeta>,
712) -> String {
713    let layer_dir = layer_dir_name(layer);
714    let prefix = format!("{layer_dir}/");
715    let mut max_upd: Option<DateTime<FixedOffset>> = None;
716    let mut entries = String::new();
717    for (tf, stat) in stats {
718        if stat.count == 0 || !path_to_unix(tf).starts_with(&prefix) {
719            continue;
720        }
721        if let Some(u) = stat.newest.as_ref().and_then(|r| r.updated) {
722            max_upd = Some(match max_upd {
723                Some(cur) if cur >= u => cur,
724                _ => u,
725            });
726        }
727        let tf_unix = path_to_unix(tf);
728        let (display, description) = folder_label(&tf_unix, folder_basename(tf), folders);
729        entries.push_str(&folder_entry(&tf_unix, &display, stat.count, description));
730    }
731    let mut s = String::new();
732    s.push_str("---\n");
733    s.push_str("type: index\n");
734    s.push_str("scope: layer\n");
735    s.push_str(&format!("folder: {layer_dir}\n"));
736    if let Some(ts) = max_upd {
737        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
738    }
739    s.push_str("---\n\n");
740    s.push_str(&format!("# {layer_dir}\n\n"));
741    s.push_str(&entries);
742    s
743}
744
745/// Render the root `index.md` from the prebuilt per-folder stat cache.
746fn render_root_md_from_stats(
747    stats: &BTreeMap<PathBuf, FolderStat>,
748    folders: &BTreeMap<String, FolderMeta>,
749) -> String {
750    let mut max_upd: Option<DateTime<FixedOffset>> = None;
751    for stat in stats.values() {
752        if stat.count == 0 {
753            continue;
754        }
755        if let Some(u) = stat.newest.as_ref().and_then(|r| r.updated) {
756            max_upd = Some(match max_upd {
757                Some(cur) if cur >= u => cur,
758                _ => u,
759            });
760        }
761    }
762    let mut s = String::new();
763    s.push_str("---\n");
764    s.push_str("type: index\n");
765    s.push_str("scope: root\n");
766    if let Some(ts) = max_upd {
767        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
768    }
769    s.push_str("---\n\n");
770    s.push_str(&format!("# {ROOT_TITLE}\n"));
771    for layer in Layer::all() {
772        let layer_dir = layer_dir_name(layer);
773        let prefix = format!("{layer_dir}/");
774        let children: Vec<(&PathBuf, usize)> = stats
775            .iter()
776            .filter(|(tf, s)| s.count > 0 && path_to_unix(tf).starts_with(&prefix))
777            .map(|(tf, s)| (tf, s.count))
778            .collect();
779        if children.is_empty() {
780            continue;
781        }
782        let total: usize = children.iter().map(|(_, n)| *n).sum();
783        s.push('\n');
784        s.push_str(&format!("## {} ({total})\n", capitalize(layer_dir)));
785        for (tf, n) in children {
786            let tf_unix = path_to_unix(tf);
787            let (display, description) = folder_label(&tf_unix, folder_basename(tf), folders);
788            s.push_str(&folder_entry(&tf_unix, &display, n, description));
789        }
790    }
791    s
792}
793
794/// Render a layer `index.md`, reading each child's newest summary + max-updated
795/// straight from its on-disk `index.jsonl` (so the rollup matches the folder
796/// artifacts exactly, write-through and rebuild alike). The **sweep-path**
797/// renderer used by [`Index::rebuild_all`] / [`Index::write_level`] /
798/// [`Index::render_dry_run`]; the loop path uses the cache-based
799/// [`render_layer_md_from_stats`] to avoid re-reading sidecars.
800fn render_layer_md_with_store(store: &Store, idx: &Index) -> String {
801    let layer = match idx.level {
802        IndexLevel::Layer(l) => l,
803        _ => unreachable!("render_layer_md_with_store called on non-layer"),
804    };
805    let layer_dir = layer_dir_name(layer);
806    let mut max_upd: Option<DateTime<FixedOffset>> = None;
807    let mut entries = String::new();
808    for (tf, n) in &idx.child_counts {
809        let recs = read_jsonl_records(&store.root.join(tf).join("index.jsonl")).unwrap_or_default();
810        let newest = recs.first();
811        if let Some(u) = newest.and_then(|r| r.updated) {
812            max_upd = Some(match max_upd {
813                Some(cur) if cur >= u => cur,
814                _ => u,
815            });
816        }
817        let tf_unix = path_to_unix(tf);
818        let (display, description) =
819            folder_label(&tf_unix, folder_basename(tf), &store.config.folders);
820        entries.push_str(&folder_entry(&tf_unix, &display, *n, description));
821    }
822    let mut s = String::new();
823    s.push_str("---\n");
824    s.push_str("type: index\n");
825    s.push_str("scope: layer\n");
826    s.push_str(&format!("folder: {layer_dir}\n"));
827    if let Some(ts) = max_upd {
828        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
829    }
830    s.push_str("---\n\n");
831    s.push_str(&format!("# {layer_dir}\n\n"));
832    s.push_str(&entries);
833    s
834}
835
836/// Render the root `index.md`, taking each child's max-updated from its on-disk
837/// `index.jsonl`. The **sweep-path** renderer (the loop path uses
838/// [`render_root_md_from_stats`]).
839fn render_root_md_with_store(store: &Store, idx: &Index) -> String {
840    let mut max_upd: Option<DateTime<FixedOffset>> = None;
841    for tf in idx.child_counts.keys() {
842        let recs = read_jsonl_records(&store.root.join(tf).join("index.jsonl")).unwrap_or_default();
843        if let Some(u) = recs.first().and_then(|r| r.updated) {
844            max_upd = Some(match max_upd {
845                Some(cur) if cur >= u => cur,
846                _ => u,
847            });
848        }
849    }
850    let mut s = String::new();
851    s.push_str("---\n");
852    s.push_str("type: index\n");
853    s.push_str("scope: root\n");
854    if let Some(ts) = max_upd {
855        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
856    }
857    s.push_str("---\n\n");
858    s.push_str(&format!("# {ROOT_TITLE}\n"));
859    for layer in Layer::all() {
860        let layer_dir = layer_dir_name(layer);
861        let prefix = format!("{layer_dir}/");
862        let children: Vec<(&PathBuf, &usize)> = idx
863            .child_counts
864            .iter()
865            .filter(|(tf, _)| path_to_unix(tf).starts_with(&prefix))
866            .collect();
867        if children.is_empty() {
868            continue;
869        }
870        let total: usize = children.iter().map(|(_, n)| **n).sum();
871        s.push('\n');
872        s.push_str(&format!("## {} ({total})\n", capitalize(layer_dir)));
873        for (tf, n) in children {
874            let tf_unix = path_to_unix(tf);
875            let (display, description) =
876                folder_label(&tf_unix, folder_basename(tf), &store.config.folders);
877            s.push_str(&folder_entry(&tf_unix, &display, *n, description));
878        }
879    }
880    s
881}
882
883/// One `index.md` browse line: `- [[path]] — summary  ·  #tag #tag` (the
884/// `  ·  #…` suffix omitted when the file has no tags). The wiki-link target is
885/// the canonical **bare** store-relative path (no `.md` extension — the
886/// doctrine the writers emit and `validate` enforces via
887/// `WIKI_LINK_HAS_EXTENSION`); the jsonl `path` keeps the real on-disk name.
888fn format_md_entry(rec: &IndexRecord) -> String {
889    let path = wiki_target(&rec.path);
890    // Collapse the summary to a single line before interpolating it into the
891    // one-line browse entry. A hand-written file may legally carry a YAML block
892    // scalar (`summary: |-`) whose value spans multiple lines; rendered verbatim
893    // those embedded newlines break the line-oriented `index.md` format and can
894    // forge a standalone catalog entry (`\n- [[…|Click me]] — injected`). The
895    // CLI writers already collapse whitespace; do the same here so the spec's
896    // primary write path (agents writing files directly) can't corrupt the
897    // catalog.
898    let summary = collapse_whitespace(&rec.summary);
899    let mut line = format!("- [[{path}]] — {summary}");
900    if !rec.tags.is_empty() {
901        let tags = rec
902            .tags
903            .iter()
904            .map(|t| format!("#{t}"))
905            .collect::<Vec<_>>()
906            .join(" ");
907        line.push_str(&format!("  ·  {tags}"));
908    }
909    line
910}
911
912/// The deterministic `## More` footer for an over-cap type-folder.
913fn more_footer(total: usize, type_: &str, layer: &str) -> String {
914    format!(
915        "## More\n\nThis folder has {total} files. The {MD_CAP} most recent are listed above.\nUse `dbmd index query --type {type_} --in {layer}` for the complete catalog.\n"
916    )
917}
918
919/// Canonical total order: `updated` descending (None sorts last), ties broken
920/// by store-relative path ascending. A *total* order, so write-through and
921/// rebuild never disagree on #500 vs #501.
922fn sort_records(records: &mut [IndexRecord]) {
923    records.sort_by(record_recency_cmp);
924}
925
926impl IndexRecord {
927    /// Build the [`IndexRecord`] a freshly-rebuilt `index.jsonl` *should* hold
928    /// for the file at `abs` (catalogued under store-relative `rel`).
929    ///
930    /// This is the single canonical projection from frontmatter → sidecar
931    /// record: [`Index::build_type_folder`] uses the same path to write the
932    /// jsonl, so the validator can rebuild the expected record here and compare
933    /// it field-for-field against the committed line — covering **every**
934    /// queryable/dedup field the query path reads (`summary`, `type`, `tags`,
935    /// `links`, `created`, `updated`, and every type-specific `fields` entry
936    /// like `email` / `domain` / `company` / `amount` / `vendor`) without the
937    /// validator hand-rolling (and drifting from) the projection per field.
938    pub(crate) fn expected_from_file(abs: &Path, rel: PathBuf) -> crate::Result<IndexRecord> {
939        record_from_file(abs, rel)
940    }
941}
942
943/// Build an [`IndexRecord`] from a file on disk. Missing `summary` →
944/// [`MISSING_SUMMARY`] placeholder (the index never invents a summary).
945fn record_from_file(abs: &Path, rel: PathBuf) -> crate::Result<IndexRecord> {
946    let mut meta = read_frontmatter(abs)?;
947    // Records carry an effective `meta-type` in the catalog: the declared value
948    // (already spilled into `fields` by `read_frontmatter`), or the default
949    // `fact` when absent — so `--where meta-type=fact` sees un-annotated records.
950    // Sources are evidence and carry no meta-type.
951    if rel.starts_with("records") {
952        meta.fields
953            .entry("meta-type".to_string())
954            .or_insert_with(|| Value::String("fact".to_string()));
955    }
956    Ok(IndexRecord {
957        path: rel,
958        type_: meta.type_.unwrap_or_default(),
959        summary: meta.summary.unwrap_or_else(|| MISSING_SUMMARY.to_string()),
960        tags: meta.tags,
961        links: meta.links,
962        created: meta.created,
963        updated: meta.updated,
964        fields: meta.fields,
965    })
966}
967
968/// The slice of a frontmatter this module needs.
969struct FileMeta {
970    type_: Option<String>,
971    summary: Option<String>,
972    tags: Vec<String>,
973    links: Vec<String>,
974    created: Option<DateTime<FixedOffset>>,
975    updated: Option<DateTime<FixedOffset>>,
976    fields: BTreeMap<String, Value>,
977}
978
979/// Minimal frontmatter read: split the leading `---`…`---` block and parse it
980/// as YAML, extracting the typed fields and spilling the rest into `fields`.
981/// Self-contained (does not route through the `parser` module).
982///
983/// **Body bytes are never required to be UTF-8.** `sources/` is "preserved
984/// verbatim" per the SPEC and routinely carries non-UTF-8 imports (Latin-1
985/// emails dropped in by `rsync`/`mbsync`/`cp`); the body can hold any byte. We
986/// read the file as raw bytes and lossily decode *only* the leading frontmatter
987/// region, so a stray non-UTF-8 byte in the body can never abort the projection
988/// (the old `fs::read_to_string` failed on the first such byte anywhere in the
989/// file, taking a whole `rebuild_all` / write-through down with it). The
990/// frontmatter itself is expected to be UTF-8; if it isn't, `U+FFFD` markers
991/// surface in the parsed values rather than a hard abort.
992fn read_frontmatter(abs: &Path) -> crate::Result<FileMeta> {
993    let bytes = fs::read(abs)?;
994    let yaml = extract_frontmatter_block_lossy(&bytes).unwrap_or_default();
995    let map: serde_norway::Mapping = if yaml.trim().is_empty() {
996        serde_norway::Mapping::new()
997    } else {
998        serde_norway::from_str(&yaml).map_err(|e| {
999            crate::Error::Store(crate::store::StoreError::BadTypeIndex {
1000                path: abs.to_path_buf(),
1001                message: format!("frontmatter YAML: {e}"),
1002            })
1003        })?
1004    };
1005
1006    let mut type_ = None;
1007    let mut summary = None;
1008    let mut tags = Vec::new();
1009    let mut links = Vec::new();
1010    let mut created = None;
1011    let mut updated = None;
1012    let mut fields = BTreeMap::new();
1013
1014    for (k, v) in map {
1015        let key = match k.as_str() {
1016            Some(s) => s.to_string(),
1017            None => continue,
1018        };
1019        match key.as_str() {
1020            // `type` and `summary` are coerced with the SAME scalar rule the
1021            // validator applies (`validate::scalar_string`: String/Number/Bool →
1022            // string). A bare `v.as_str()` returns `None` for an unquoted numeric
1023            // or boolean scalar (`summary: 2026`, `type: true`), so the index
1024            // would write the `(no summary)` / empty-type placeholder while
1025            // `dbmd validate` reads the file as HAVING that summary/type —
1026            // yielding a permanently-unfixable `INDEX_SUMMARY_MISMATCH` (every
1027            // rebuild reproduces the same mismatched placeholder). Coercing here
1028            // keeps the writer and the validator byte-for-byte in agreement.
1029            "type" => type_ = scalar_string(&v),
1030            "summary" => summary = scalar_string(&v),
1031            "tags" => tags = yaml_string_list(&v),
1032            "links" => links = yaml_string_list(&v),
1033            "created" => created = v.as_str().and_then(parse_ts),
1034            "updated" => updated = v.as_str().and_then(parse_ts),
1035            // `path`, `type`, `summary`, `tags`, `links`, `created`, `updated`
1036            // are the reserved IndexRecord keys; everything else (including
1037            // `id`, `status`, type-specific fields) goes to `fields`.
1038            "path" => {}
1039            _ => {
1040                fields.insert(key, yaml_to_json_value(&v));
1041            }
1042        }
1043    }
1044
1045    Ok(FileMeta {
1046        type_,
1047        summary,
1048        tags,
1049        links,
1050        created,
1051        updated,
1052        fields,
1053    })
1054}
1055
1056/// A YAML scalar (`String`/`Number`/`Bool`) rendered as a string; `None` for
1057/// sequences/mappings/null. **Must stay identical to `validate::scalar_string`**
1058/// so the index writer and the validator coerce `type`/`summary` the same way
1059/// (see [`read_frontmatter`]); an unquoted `summary: 2026` becomes `"2026"` in
1060/// both, not a placeholder here and a real value there.
1061fn scalar_string(v: &serde_norway::Value) -> Option<String> {
1062    match v {
1063        serde_norway::Value::String(s) => Some(s.clone()),
1064        serde_norway::Value::Number(n) => Some(n.to_string()),
1065        serde_norway::Value::Bool(b) => Some(b.to_string()),
1066        _ => None,
1067    }
1068}
1069
1070/// Lossily decode the leading frontmatter region of a file given its raw bytes,
1071/// then pull the YAML between the opening `---` and the next `---`. Only the
1072/// frontmatter region needs to be valid UTF-8 in practice; the body may carry
1073/// arbitrary bytes (a verbatim `sources/` import). Returns `None` when the file
1074/// has no frontmatter fence at its very start.
1075fn extract_frontmatter_block_lossy(bytes: &[u8]) -> Option<String> {
1076    // Decode lossily so a non-UTF-8 body byte never aborts the read. The
1077    // frontmatter is at the very start of the file, so a lossy whole-file decode
1078    // is correct for extracting it (and cheap relative to the YAML parse). A
1079    // leading UTF-8 BOM is stripped by `extract_frontmatter_block`.
1080    let text = String::from_utf8_lossy(bytes);
1081    extract_frontmatter_block(&text)
1082}
1083
/// Extract the YAML that sits between a leading `---` line and the next `---`
/// line of `text`.
///
/// A single leading UTF-8 BOM is ignored. A fence is a line that equals `---`
/// once trailing whitespace is trimmed (leading whitespace disqualifies it).
/// Each captured line is re-terminated with `\n`, so the original line endings
/// are normalised. Returns `None` when the first line is not a fence or when
/// no closing fence follows.
fn extract_frontmatter_block(text: &str) -> Option<String> {
    let is_fence = |line: &str| line.trim_end() == "---";

    let unmarked = text.strip_prefix('\u{feff}').unwrap_or(text);
    let mut rest = unmarked.lines();
    match rest.next() {
        Some(opening) if is_fence(opening) => {}
        _ => return None,
    }

    // Consume lines up to (and including) the closing fence, remembering
    // whether one was actually seen before the input ran out.
    let mut closed = false;
    let yaml: String = rest
        .take_while(|line| {
            closed = is_fence(*line);
            !closed
        })
        .flat_map(|line| [line, "\n"])
        .collect();
    closed.then_some(yaml)
}
1103
1104/// Read a string scalar or a sequence-of-string-scalars into a `Vec<String>`.
1105/// Wiki-link items keep their `[[…]]` form verbatim.
1106fn yaml_string_list(v: &serde_norway::Value) -> Vec<String> {
1107    match v {
1108        serde_norway::Value::String(s) => vec![s.clone()],
1109        serde_norway::Value::Sequence(seq) => seq
1110            .iter()
1111            .filter_map(yaml_string_or_wiki_link_literal)
1112            .collect(),
1113        _ => Vec::new(),
1114    }
1115}
1116
1117fn yaml_string_or_wiki_link_literal(v: &serde_norway::Value) -> Option<String> {
1118    v.as_str()
1119        .map(str::to_string)
1120        .or_else(|| unquoted_wiki_link_literal(v))
1121}
1122
1123fn yaml_to_json_value(v: &serde_norway::Value) -> Value {
1124    if let Some(link) = unquoted_wiki_link_literal(v) {
1125        return Value::String(link);
1126    }
1127    match v {
1128        serde_norway::Value::String(s) => Value::String(s.clone()),
1129        serde_norway::Value::Bool(b) => Value::Bool(*b),
1130        serde_norway::Value::Number(n) => {
1131            serde_json::to_value(n).unwrap_or_else(|_| Value::String(n.to_string()))
1132        }
1133        serde_norway::Value::Sequence(seq) => {
1134            Value::Array(seq.iter().map(yaml_to_json_value).collect())
1135        }
1136        serde_norway::Value::Mapping(_) | serde_norway::Value::Tagged(_) => {
1137            serde_json::to_value(v).unwrap_or(Value::Null)
1138        }
1139        serde_norway::Value::Null => Value::Null,
1140    }
1141}
1142
1143fn unquoted_wiki_link_literal(v: &serde_norway::Value) -> Option<String> {
1144    let serde_norway::Value::Sequence(outer) = v else {
1145        return None;
1146    };
1147    if outer.len() != 1 {
1148        return None;
1149    }
1150    let serde_norway::Value::Sequence(inner) = &outer[0] else {
1151        return None;
1152    };
1153    let [serde_norway::Value::String(target)] = inner.as_slice() else {
1154        return None;
1155    };
1156    Some(format!("[[{target}]]"))
1157}
1158
1159/// Parse an RFC3339 timestamp scalar.
1160fn parse_ts(s: &str) -> Option<DateTime<FixedOffset>> {
1161    DateTime::parse_from_rfc3339(s.trim()).ok()
1162}
1163
1164/// Render a timestamp the same way `serde_json` renders an `IndexRecord`
1165/// timestamp (RFC3339, `Z` for UTC, sub-seconds preserved) so the md
1166/// frontmatter and the jsonl agree byte-for-byte.
1167fn fmt_ts(ts: &DateTime<FixedOffset>) -> String {
1168    ts.to_rfc3339_opts(SecondsFormat::AutoSi, true)
1169}
1170
1171/// Max `updated` over an iterator of optional timestamps.
1172fn max_updated<'a>(
1173    it: impl Iterator<Item = Option<&'a DateTime<FixedOffset>>>,
1174) -> Option<DateTime<FixedOffset>> {
1175    let mut best: Option<DateTime<FixedOffset>> = None;
1176    for ts in it.flatten() {
1177        best = Some(match best {
1178            Some(cur) if cur >= *ts => cur,
1179            _ => *ts,
1180        });
1181    }
1182    best
1183}
1184
1185/// Read a type-folder's `index.jsonl` into records, applying last-write-wins by
1186/// `path` over any un-compacted lines (so a half-compacted jsonl still reads
1187/// cleanly). Missing file → empty set. Returns records in canonical order.
1188fn read_jsonl_records(jsonl: &Path) -> crate::Result<Vec<IndexRecord>> {
1189    let text = match fs::read_to_string(jsonl) {
1190        Ok(t) => t,
1191        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
1192        Err(e) => return Err(e.into()),
1193    };
1194    // Last-write-wins by path; preserve only the final occurrence.
1195    let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
1196    for (i, line) in text.lines().enumerate() {
1197        if line.trim().is_empty() {
1198            continue;
1199        }
1200        let rec: IndexRecord = serde_json::from_str(line).map_err(|e| {
1201            crate::Error::Store(crate::store::StoreError::BadTypeIndex {
1202                path: jsonl.to_path_buf(),
1203                message: format!("line {}: {e}", i + 1),
1204            })
1205        })?;
1206        by_path.insert(rec.path.clone(), rec);
1207    }
1208    let mut records: Vec<IndexRecord> = by_path.into_values().collect();
1209    sort_records(&mut records);
1210    Ok(records)
1211}
1212
1213/// The minimal rollup stat a parent index needs from one type-folder's
1214/// `index.jsonl`: how many distinct files it catalogs (`count`) and the single
1215/// newest record (`newest`, the recency-sorted `.first()` — its `updated` feeds
1216/// the parent's derived `updated`, its `summary` the layer preview). Holding the
1217/// newest record alone, rather than the whole sidecar, is what keeps a rollup
1218/// recompute cheap regardless of how large the sidecar grows.
1219#[derive(Debug, Clone, Default, PartialEq)]
1220struct FolderStat {
1221    count: usize,
1222    newest: Option<IndexRecord>,
1223}
1224
1225/// Read a type-folder's `index.jsonl` ONCE and reduce it to a [`FolderStat`]:
1226/// distinct-`path` count (last-write-wins) plus the recency-newest record. A
1227/// missing sidecar is the default (`count: 0`, `newest: None`). This is the
1228/// **loop-path** rollup primitive — one streaming pass per sidecar, never the
1229/// content tree and never the 2–3× full reparse the old
1230/// `jsonl_record_count` + `read_jsonl_records` pair did. `count` is
1231/// byte-identical to [`read_jsonl_records`]`.len()` and `newest` to its
1232/// `.first()`, so a rollup built from these stats matches the from-scratch
1233/// builders byte-for-byte.
1234fn read_folder_stat(jsonl: &Path) -> crate::Result<FolderStat> {
1235    let text = match fs::read_to_string(jsonl) {
1236        Ok(t) => t,
1237        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(FolderStat::default()),
1238        Err(e) => return Err(e.into()),
1239    };
1240    // Last-write-wins by path, exactly like `read_jsonl_records`, so count and
1241    // newest are computed over the same compacted record set.
1242    let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
1243    for (i, line) in text.lines().enumerate() {
1244        if line.trim().is_empty() {
1245            continue;
1246        }
1247        let rec: IndexRecord = serde_json::from_str(line).map_err(|e| {
1248            crate::Error::Store(crate::store::StoreError::BadTypeIndex {
1249                path: jsonl.to_path_buf(),
1250                message: format!("line {}: {e}", i + 1),
1251            })
1252        })?;
1253        by_path.insert(rec.path.clone(), rec);
1254    }
1255    let count = by_path.len();
1256    // The newest record is the minimum under `sort_records`' order (updated
1257    // desc, None last, ties by path asc) — i.e. what `.first()` returns. Find it
1258    // with a single min-scan instead of sorting the whole set.
1259    let newest = by_path.into_values().min_by(record_recency_cmp);
1260    Ok(FolderStat { count, newest })
1261}
1262
1263/// The total order [`sort_records`] imposes, as a comparator over two records:
1264/// `updated` descending (None last), ties broken by store-relative path
1265/// ascending. Kept in one place so `read_folder_stat`'s min-scan agrees with the
1266/// sort byte-for-byte on which record is "newest".
1267fn record_recency_cmp(a: &IndexRecord, b: &IndexRecord) -> std::cmp::Ordering {
1268    match (b.updated, a.updated) {
1269        (Some(bu), Some(au)) => bu.cmp(&au),
1270        (Some(_), None) => std::cmp::Ordering::Greater, // a is None → after b
1271        (None, Some(_)) => std::cmp::Ordering::Less,    // b is None → after a
1272        (None, None) => std::cmp::Ordering::Equal,
1273    }
1274    .then_with(|| a.path.cmp(&b.path))
1275}
1276
1277/// Per-child rollup stats for `layers`, read from each type-folder's on-disk
1278/// `index.jsonl` (one [`read_folder_stat`] pass each) rather than walked from the
1279/// content tree. The **loop-path** counterpart to the from-scratch counting in
1280/// [`Index::build_layer`] / [`Index::build_root`], reusing one read per sidecar
1281/// across BOTH the layer and root rollups. Empty folders (`count == 0`) are kept
1282/// out of the map.
1283///
1284/// NOTE on cost: this performs one read per type-folder, but each read line-parses
1285/// that folder's entire `index.jsonl`, so the total is `O(total catalogued
1286/// records)`, not `O(type-folders)` — it reads the whole catalog every call. It
1287/// avoids the content-tree walk ([`Store::walk`]), but it is NOT `O(changed)`. See
1288/// [`update_parents`] for the honest bound and the follow-up to fix it.
1289fn collect_child_stats(
1290    store: &Store,
1291    layers: &[Layer],
1292) -> crate::Result<BTreeMap<PathBuf, FolderStat>> {
1293    let mut stats = BTreeMap::new();
1294    for &layer in layers {
1295        for tf in type_folders_in_layer(store, layer) {
1296            let stat = read_folder_stat(&store.root.join(&tf).join("index.jsonl"))?;
1297            if stat.count > 0 {
1298                stats.insert(tf, stat);
1299            }
1300        }
1301    }
1302    Ok(stats)
1303}
1304
1305/// Walk a type-folder's `.md` content files, recursing through date-shards,
1306/// excluding the `index.md` artifact itself and any hidden entries.
1307fn walk_type_folder_files(folder_abs: &Path) -> Vec<PathBuf> {
1308    let mut out = Vec::new();
1309    if !folder_abs.is_dir() {
1310        return out;
1311    }
1312    for entry in walkdir::WalkDir::new(folder_abs)
1313        .into_iter()
1314        .filter_entry(|e| !is_hidden(e.file_name()))
1315        .filter_map(|e| e.ok())
1316    {
1317        if !entry.file_type().is_file() {
1318            continue;
1319        }
1320        let p = entry.path();
1321        if p.extension().and_then(|e| e.to_str()) != Some("md") {
1322            continue;
1323        }
1324        if p.file_name().and_then(|n| n.to_str()) == Some("index.md") {
1325            continue;
1326        }
1327        out.push(p.to_path_buf());
1328    }
1329    out
1330}
1331
1332/// The immediate type-folders under a layer (one directory level below the
1333/// layer dir), as store-relative paths. Hidden dirs and `log/` are skipped.
1334fn type_folders_in_layer(store: &Store, layer: Layer) -> Vec<PathBuf> {
1335    let layer_dir = store.root.join(layer_dir_name(layer));
1336    let mut out = Vec::new();
1337    let rd = match fs::read_dir(&layer_dir) {
1338        Ok(rd) => rd,
1339        Err(_) => return out,
1340    };
1341    for entry in rd.flatten() {
1342        if !entry.path().is_dir() {
1343            continue;
1344        }
1345        let name = entry.file_name();
1346        let name = match name.to_str() {
1347            Some(n) => n,
1348            None => continue,
1349        };
1350        if is_hidden(entry.file_name().as_os_str()) || name == "log" {
1351            continue;
1352        }
1353        out.push(PathBuf::from(layer_dir_name(layer)).join(name));
1354    }
1355    out.sort();
1356    out
1357}
1358
1359/// The type-folder a content file belongs to: `<layer>/<type>` (the first two
1360/// path components), or `None` if the path is not under a known layer with at
1361/// least a type segment.
1362fn type_folder_of(file_rel: &Path) -> Option<PathBuf> {
1363    let mut comps = file_rel.components();
1364    let layer = comps.next()?.as_os_str().to_str()?;
1365    layer_from_dir_name(layer)?;
1366    let type_seg = comps.next()?.as_os_str().to_str()?;
1367    Some(PathBuf::from(layer).join(type_seg))
1368}
1369
/// Strip the store `root` from an absolute path, giving the store-relative
/// remainder; `None` when `abs` does not start with `root`.
fn rel_to_store(root: &Path, abs: &Path) -> Option<PathBuf> {
    match abs.strip_prefix(root) {
        Ok(relative) => Some(relative.to_path_buf()),
        Err(_) => None,
    }
}
1374
1375/// Normalize a possibly-absolute or `./`-prefixed path to a clean
1376/// store-relative form (drops a leading `./`; leaves already-relative paths).
1377fn normalize_rel(p: &Path) -> PathBuf {
1378    let s = path_to_unix(p);
1379    let s = s.strip_prefix("./").unwrap_or(&s);
1380    PathBuf::from(s)
1381}
1382
/// True when the path's final component is exactly `index.md` or
/// `index.jsonl` (the two catalog artifact file names).
fn is_index_artifact(p: &Path) -> bool {
    let Some(name) = p.file_name().and_then(|n| n.to_str()) else {
        return false;
    };
    name == "index.md" || name == "index.jsonl"
}
1389
1390/// True when a file named `index.md` / `index.jsonl` is safe for [`Index::cleanup`]
1391/// to delete — i.e. it is a generated catalog artifact (or a stale/garbage
1392/// leftover from a previous build), NOT a user content file that merely happens
1393/// to be named `index.md`.
1394///
1395/// - `index.jsonl` is always a machine artifact (content files are `.md`), so it
1396///   is always deletable.
1397/// - `index.md` is deletable UNLESS it parses as a content file — frontmatter
1398///   whose `type` is some real record type (anything other than `index`). A
1399///   generated catalog carries `type: index`; a user record carries its own type
1400///   (`email`, `note`, …) and must be preserved (deleting it is silent,
1401///   unrecoverable data loss). A leftover with no/garbage frontmatter (e.g. a
1402///   bare `stale\n`) is treated as a deletable stale artifact.
1403fn is_deletable_catalog_artifact(p: &Path) -> bool {
1404    match p.file_name().and_then(|n| n.to_str()) {
1405        Some("index.jsonl") => true,
1406        Some("index.md") => match read_frontmatter(p) {
1407            // Real content file (non-`index` type) → preserve, never delete.
1408            Ok(meta) => meta.type_.as_deref().is_none_or(|t| t == "index"),
1409            // Unreadable / no frontmatter → a stale or garbage artifact, deletable.
1410            Err(_) => true,
1411        },
1412        _ => false,
1413    }
1414}
1415
/// True when the name is valid UTF-8 and begins with `.`; a non-UTF-8 name is
/// reported as not hidden.
fn is_hidden(name: &std::ffi::OsStr) -> bool {
    match name.to_str() {
        Some(text) => text.starts_with('.'),
        None => false,
    }
}
1419
1420fn layer_dir_name(layer: Layer) -> &'static str {
1421    match layer {
1422        Layer::Sources => "sources",
1423        Layer::Records => "records",
1424    }
1425}
1426
1427/// Local layer-name parse. Mirrors the contract of [`Layer::from_dir_name`];
1428/// kept local to keep this module's walk self-contained (see the module header).
1429fn layer_from_dir_name(name: &str) -> Option<Layer> {
1430    match name {
1431        "sources" => Some(Layer::Sources),
1432        "records" => Some(Layer::Records),
1433        _ => None,
1434    }
1435}
1436
/// The last path component as `&str`; the empty string when the path has no
/// final component or that component is not valid UTF-8.
fn folder_basename(p: &Path) -> &str {
    match p.file_name() {
        Some(name) => name.to_str().unwrap_or(""),
        None => "",
    }
}
1441
1442/// The canonical wiki-link target for a content path: the store-relative path
1443/// with `/` separators and the trailing `.md` stripped (the bare form the
1444/// `index.md` browse view links to).
1445fn wiki_target(p: &Path) -> String {
1446    let unix = path_to_unix(p);
1447    unix.strip_suffix(".md").unwrap_or(&unix).to_string()
1448}
1449
/// Render a path with `/` separators regardless of host OS, so artifacts are
/// identical on every platform.
///
/// A non-UTF-8 path component (reachable on Linux/ext4, db.md's primary
/// deployment target, where `sources/` files arrive verbatim from Latin-1
/// exports) is decoded **lossily** with `U+FFFD` markers rather than silently
/// dropped. Dropping a bad component would serialize
/// `sources/emails/caf\xe9.md` as `sources/emails` — a path pointing at the
/// *directory*, not the file, that also collapses distinct files onto one
/// `index.jsonl` key. Lossy decoding keeps the leaf present and visibly marked.
///
/// The root component of a rooted path is emitted as a single `/`. Joining the
/// raw components with `/` would otherwise render `/a/b` as `//a/b` (and a
/// Windows `C:\a` as `C:/\/a`), breaking the "`/` only" contract for absolute
/// inputs. Relative paths render exactly as before.
fn path_to_unix(p: &Path) -> String {
    let mut out = String::new();
    for comp in p.components() {
        match comp {
            // The root *is* the separator: emit it once, never as a segment.
            std::path::Component::RootDir => out.push('/'),
            other => {
                if !out.is_empty() && !out.ends_with('/') {
                    out.push('/');
                }
                out.push_str(&other.as_os_str().to_string_lossy());
            }
        }
    }
    out
}
1467
1468/// Serde for [`IndexRecord::path`]: always forward-slash on the wire, so the
1469/// `index.jsonl` catalog is identical whether the store was written on POSIX or
1470/// Windows (a git clone across OSes yields the same paths, and the last-write-
1471/// wins upsert key never splits on separator style). On POSIX this matches the
1472/// default `PathBuf` serialization; on Windows it rewrites `\` to `/`.
1473mod path_serde {
1474    use super::path_to_unix;
1475    use serde::{Deserialize, Deserializer, Serializer};
1476    use std::path::{Path, PathBuf};
1477
1478    pub fn serialize<S: Serializer>(p: &Path, s: S) -> Result<S::Ok, S::Error> {
1479        s.serialize_str(&path_to_unix(p))
1480    }
1481
1482    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<PathBuf, D::Error> {
1483        Ok(PathBuf::from(String::deserialize(d)?))
1484    }
1485}
1486
/// Upper-case the first character and leave the rest untouched.
///
/// Uses `char::to_uppercase`, so the mapping is Unicode-aware (not
/// ASCII-only) and may expand one character into several (`ß` → `SS`). An
/// empty input yields an empty string.
fn capitalize(s: &str) -> String {
    let mut rest = s.chars();
    let Some(head) = rest.next() else {
        return String::new();
    };
    let mut out: String = head.to_uppercase().collect();
    out.push_str(rest.as_str());
    out
}
1495
/// Squash every run of whitespace (newlines included) to one space and trim
/// both ends. This is the single-line normalization the `index.md` browse
/// entry ([`format_md_entry`]) relies on so a multi-line block-scalar summary
/// cannot inject a newline into a catalog line.
fn collapse_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
1503
1504/// Derive a folder's display name from its basename: separators (`-`, `_`)
1505/// become spaces and the first character is upper-cased (`hubspot-exports` →
1506/// `Hubspot exports`). A deterministic floor — the curator overrides it via
1507/// `DB.md ## Folders` (`records/x|HubSpot exports`) for casing the tool cannot
1508/// guess. The tool tidies a folder's *name*; it never infers its *meaning*.
1509fn default_display(basename: &str) -> String {
1510    let spaced: String = basename
1511        .chars()
1512        .map(|c| if c == '-' || c == '_' { ' ' } else { c })
1513        .collect();
1514    capitalize(&spaced)
1515}
1516
1517/// The display name + optional description a root/layer rollup shows for a child
1518/// type-folder: the curator's `## Folders` metadata when present, else the
1519/// derived display name and **no description**. This is the whole anti-"tool
1520/// invents the curator's judgment" contract for the rollups — a description is
1521/// surfaced only when the agent authored one; it is never composed from the
1522/// folder's newest member or any other content.
1523fn folder_label<'a>(
1524    tf_unix: &str,
1525    basename: &str,
1526    folders: &'a BTreeMap<String, FolderMeta>,
1527) -> (String, Option<&'a str>) {
1528    let meta = folders.get(tf_unix);
1529    let display = meta
1530        .and_then(|m| m.display.as_deref())
1531        .map(str::to_string)
1532        .unwrap_or_else(|| default_display(basename));
1533    (display, meta.and_then(|m| m.description.as_deref()))
1534}
1535
/// Format one root/layer rollup line:
/// `- [[<tf>/index|<Display>]] (<count>)`, followed by ` — <description>` only
/// when a description is supplied, and always terminated by a newline.
fn folder_entry(tf_unix: &str, display: &str, count: usize, description: Option<&str>) -> String {
    let mut line = format!("- [[{tf_unix}/index|{display}]] ({count})");
    if let Some(text) = description {
        line.push_str(" — ");
        line.push_str(text);
    }
    line.push('\n');
    line
}
1544
1545/// Atomic (rename-based) write for the **derived** catalog (`index.md` /
1546/// `index.jsonl`). Deliberately NOT `fsync`-durable like [`crate::fsx`]: the
1547/// index is rebuildable (`dbmd index rebuild`) and this is the O(changed)
1548/// write-through path, so a per-write `fsync` would be cost without benefit — a
1549/// crash-lost catalog write is recovered by a rebuild, not data loss. (Primary
1550/// data — content records, `log.md` — uses the durable `crate::fsx` path.)
1551fn write_atomic(path: &Path, contents: String) -> crate::Result<()> {
1552    if let Some(parent) = path.parent() {
1553        fs::create_dir_all(parent)?;
1554    }
1555    let dir = path.parent().unwrap_or_else(|| Path::new("."));
1556    let mut tmp = tempfile_in(dir)?;
1557    tmp.write_all(contents.as_bytes())?;
1558    tmp.flush()?;
1559    tmp.persist(path)?;
1560    Ok(())
1561}
1562
1563fn remove_if_exists(path: &Path) -> crate::Result<()> {
1564    match fs::remove_file(path) {
1565        Ok(()) => Ok(()),
1566        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
1567        Err(e) => Err(e.into()),
1568    }
1569}
1570
1571fn bad_index(path: &Path, msg: &str) -> crate::Error {
1572    crate::Error::Store(crate::store::StoreError::BadTypeIndex {
1573        path: path.to_path_buf(),
1574        message: msg.to_string(),
1575    })
1576}
1577
/// Advisory, per-type-folder lock guarding the write-through sidecar update.
///
/// Updating a folder's `index.jsonl` / `index.md` is read-snapshot → modify →
/// rename-over-whole-file. Two writers in the SAME type-folder would otherwise
/// both read the same snapshot, each add only their own row, and the later
/// rename would discard the earlier one's row (a lost update that persists
/// until `dbmd index rebuild`). Holding this lock around the sequence makes
/// concurrent writers take turns.
///
/// Mechanism: a hidden `<type-folder>/.index.lock` file created with
/// `create_new`. The dot-name keeps it out of the content walk
/// (`walk_type_folder_files` skips hidden entries) and out of `cleanup`
/// (`is_index_artifact` matches only `index.md` / `index.jsonl`). The file is
/// removed when the guard is dropped, but only if this guard actually took it.
struct FolderLock {
    /// Location of the `.index.lock` file.
    path: PathBuf,
    /// Whether this guard created the lock file (and so must remove it).
    held: bool,
}

impl FolderLock {
    /// Try to take the lock for `folder_abs`.
    ///
    /// Makes up to `MAX_ATTEMPTS` tries, sleeping `SPIN` between them while
    /// the lock file exists. A lock file whose mtime is older than
    /// `STALE_AFTER` is removed and the attempt retried immediately, so a
    /// crashed writer cannot wedge the folder forever.
    ///
    /// Best-effort by design: on any error other than "already exists", or
    /// once the attempt budget is spent, a guard with `held == false` is
    /// returned and the caller proceeds unlocked instead of failing the write.
    fn acquire(folder_abs: &Path) -> Self {
        use std::io::ErrorKind;
        use std::time::{Duration, SystemTime};

        const MAX_ATTEMPTS: u32 = 600; // ~6s at 10ms/attempt
        const SPIN: Duration = Duration::from_millis(10);
        const STALE_AFTER: Duration = Duration::from_secs(30);

        let path = folder_abs.join(".index.lock");
        // The folder must exist for the lock file to be creatable; a failure
        // here simply surfaces as a failed create below.
        let _ = fs::create_dir_all(folder_abs);

        let mut remaining = MAX_ATTEMPTS;
        while remaining > 0 {
            remaining -= 1;

            let attempt = fs::OpenOptions::new()
                .write(true)
                .create_new(true)
                .open(&path);
            let failure = match attempt {
                Ok(_) => return FolderLock { path, held: true },
                Err(err) => err,
            };
            if failure.kind() != ErrorKind::AlreadyExists {
                // E.g. permissions: give up on locking, do not fail the write.
                return FolderLock { path, held: false };
            }

            // Someone else holds it — decide whether the holder looks dead.
            let looks_stale = fs::metadata(&path)
                .and_then(|meta| meta.modified())
                .ok()
                .and_then(|mtime| SystemTime::now().duration_since(mtime).ok())
                .is_some_and(|age| age > STALE_AFTER);
            if looks_stale {
                let _ = fs::remove_file(&path);
            } else {
                std::thread::sleep(SPIN);
            }
        }

        // Contention budget exhausted: proceed unlocked (best-effort).
        FolderLock { path, held: false }
    }
}

impl Drop for FolderLock {
    /// Release the lock by deleting the lock file — only when this guard owns
    /// it, so an unlocked fallback guard never removes another writer's lock.
    fn drop(&mut self) {
        if !self.held {
            return;
        }
        let _ = fs::remove_file(&self.path);
    }
}
1660
1661/// Acquire the write-through lock for one or two type-folders. When `a == b`
1662/// (same-folder rename) only one lock is taken. For two distinct folders the
1663/// locks are always acquired in sorted order so a pair of concurrent renames
1664/// touching the same two folders can't deadlock by grabbing them in opposite
1665/// orders. Returns the guard(s); drop releases them.
1666fn lock_folders(store: &Store, a: &Path, b: &Path) -> Vec<FolderLock> {
1667    if a == b {
1668        return vec![FolderLock::acquire(&store.root.join(a))];
1669    }
1670    let (first, second) = if a < b { (a, b) } else { (b, a) };
1671    vec![
1672        FolderLock::acquire(&store.root.join(first)),
1673        FolderLock::acquire(&store.root.join(second)),
1674    ]
1675}
1676
// Hand-rolled temp-file-then-rename writer. `tempfile` is only a
// dev-dependency (tests), so the library path carries this small equivalent
// instead of adding it to the non-dev dependency set. The handle lives in an
// `Option` so `persist` can move it out while the `Drop` impl (which only
// cleans up a temp file that was never persisted) still applies.
struct AtomicTemp {
    /// Open handle to the temp file; `None` once `persist` has taken it.
    file: Option<fs::File>,
    /// Where the temp file lives on disk.
    path: PathBuf,
    /// Set after a successful rename so `Drop` leaves the destination alone.
    persisted: bool,
}

impl AtomicTemp {
    /// The open handle. Panics if called after the handle was taken, which
    /// `persist` prevents by consuming `self`.
    fn handle(&mut self) -> &mut fs::File {
        self.file.as_mut().expect("temp file open")
    }

    /// Append `bytes` to the temp file.
    fn write_all(&mut self, bytes: &[u8]) -> std::io::Result<()> {
        self.handle().write_all(bytes)
    }

    /// Flush the temp file handle.
    fn flush(&mut self) -> std::io::Result<()> {
        self.handle().flush()
    }

    /// Close the temp file and rename it onto `dest`. A `sync_all` is
    /// attempted first and its failure ignored. If the rename fails the error
    /// is returned and `Drop` removes the temp file.
    fn persist(mut self, dest: &Path) -> std::io::Result<()> {
        match self.file.take() {
            Some(handle) => {
                let _ = handle.sync_all();
                // Close the handle before the rename.
                drop(handle);
            }
            None => {}
        }
        fs::rename(&self.path, dest)?;
        self.persisted = true;
        Ok(())
    }
}

impl Drop for AtomicTemp {
    /// Best-effort removal of a temp file that never reached its destination
    /// (an error path bailed out before or during `persist`).
    fn drop(&mut self) {
        if self.persisted {
            return;
        }
        let _ = fs::remove_file(&self.path);
    }
}
1714
1715fn tempfile_in(dir: &Path) -> std::io::Result<AtomicTemp> {
1716    use std::time::{SystemTime, UNIX_EPOCH};
1717    let nanos = SystemTime::now()
1718        .duration_since(UNIX_EPOCH)
1719        .map(|d| d.as_nanos())
1720        .unwrap_or(0);
1721    let pid = std::process::id();
1722    // Monotonic-ish unique suffix; the dir is the destination dir so rename is
1723    // same-filesystem and therefore atomic.
1724    let counter = next_temp_counter();
1725    let name = format!(".dbmd-index-{pid}-{nanos}-{counter}.tmp");
1726    let path = dir.join(name);
1727    let file = fs::OpenOptions::new()
1728        .write(true)
1729        .create_new(true)
1730        .open(&path)?;
1731    Ok(AtomicTemp {
1732        file: Some(file),
1733        path,
1734        persisted: false,
1735    })
1736}
1737
/// Hand out the next value of a process-wide counter (starting at 0), used to
/// keep temp-file names distinct within one process.
fn next_temp_counter() -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering::Relaxed};
    static NEXT: AtomicU64 = AtomicU64::new(0);
    NEXT.fetch_add(1, Relaxed)
}
1743
1744#[cfg(test)]
1745mod tests {
1746    use super::*;
1747    use std::collections::BTreeSet;
1748    use std::fs;
1749    use tempfile::TempDir;
1750
1751    // ── fixtures ─────────────────────────────────────────────────────────
1752
1753    /// A temp store with a `DB.md` marker. `store.config` is the parser default
1754    /// (these tests never exercise the config parser).
1755    fn mk_store() -> (TempDir, Store) {
1756        let dir = TempDir::new().unwrap();
1757        fs::write(dir.path().join("DB.md"), "# test store\n").unwrap();
1758        let store = Store {
1759            root: dir.path().to_path_buf(),
1760            config: crate::parser::Config::default(),
1761        };
1762        (dir, store)
1763    }
1764
1765    /// Write a content file at `rel` with the given frontmatter lines + body.
1766    /// `fm` is the raw YAML body between the fences (no `---`).
1767    fn write_raw(store: &Store, rel: &str, fm: &str, body: &str) {
1768        let abs = store.root.join(rel);
1769        fs::create_dir_all(abs.parent().unwrap()).unwrap();
1770        fs::write(&abs, format!("---\n{fm}\n---\n{body}")).unwrap();
1771    }
1772
1773    /// Convenience: write a typed content file with summary/updated/extras.
1774    fn write_doc(
1775        store: &Store,
1776        rel: &str,
1777        type_: &str,
1778        summary: Option<&str>,
1779        updated: Option<&str>,
1780        extra_yaml: &str,
1781    ) {
1782        let mut fm = format!("type: {type_}\n");
1783        if let Some(s) = summary {
1784            fm.push_str(&format!("summary: {s}\n"));
1785        }
1786        if let Some(u) = updated {
1787            fm.push_str(&format!("updated: {u}\n"));
1788        }
1789        fm.push_str(extra_yaml);
1790        write_raw(store, rel, fm.trim_end(), "\nbody text\n");
1791    }
1792
1793    fn read(store: &Store, rel: &str) -> String {
1794        fs::read_to_string(store.root.join(rel)).unwrap()
1795    }
1796
1797    fn exists(store: &Store, rel: &str) -> bool {
1798        store.root.join(rel).exists()
1799    }
1800
1801    /// Collect every `index.md` + `index.jsonl` under the store, mapped to its
1802    /// bytes — the surface the byte-identity invariant compares.
1803    fn snapshot_artifacts(store: &Store) -> BTreeMap<String, String> {
1804        let mut out = BTreeMap::new();
1805        for entry in walkdir::WalkDir::new(&store.root)
1806            .into_iter()
1807            .filter_map(|e| e.ok())
1808        {
1809            let p = entry.path();
1810            if is_index_artifact(p) {
1811                let rel = path_to_unix(&rel_to_store(&store.root, p).unwrap());
1812                out.insert(rel, fs::read_to_string(p).unwrap());
1813            }
1814        }
1815        out
1816    }
1817
1818    // ── build_type_folder + to_markdown ──────────────────────────────────
1819
1820    #[test]
1821    fn type_folder_aggregates_across_shards_in_recency_order() {
1822        let (_d, store) = mk_store();
1823        // Three emails across two month-shards, deliberately written
1824        // out-of-recency-order on disk.
1825        write_doc(
1826            &store,
1827            "sources/emails/2026/05/b-old.md",
1828            "email",
1829            Some("Older mail"),
1830            Some("2026-05-01T09:00:00Z"),
1831            "",
1832        );
1833        write_doc(
1834            &store,
1835            "sources/emails/2026/06/c-new.md",
1836            "email",
1837            Some("Newest mail"),
1838            Some("2026-06-15T12:00:00Z"),
1839            "",
1840        );
1841        write_doc(
1842            &store,
1843            "sources/emails/2026/05/a-mid.md",
1844            "email",
1845            Some("Middle mail"),
1846            Some("2026-05-20T08:00:00Z"),
1847            "",
1848        );
1849
1850        let idx = Index::build_type_folder(&store, Path::new("sources/emails")).unwrap();
1851        let paths: Vec<String> = idx.records.iter().map(|r| path_to_unix(&r.path)).collect();
1852        assert_eq!(
1853            paths,
1854            vec![
1855                "sources/emails/2026/06/c-new.md",
1856                "sources/emails/2026/05/a-mid.md",
1857                "sources/emails/2026/05/b-old.md",
1858            ],
1859            "records must aggregate across shards, newest `updated` first"
1860        );
1861    }
1862
1863    #[test]
1864    fn type_folder_md_format_entries_tags_and_derived_updated() {
1865        let (_d, store) = mk_store();
1866        write_doc(
1867            &store,
1868            "records/contacts/sarah-chen.md",
1869            "contact",
1870            Some("Renewal champion at Acme"),
1871            Some("2026-05-27T10:00:00Z"),
1872            "tags:\n  - renewal\n  - acme\n",
1873        );
1874        write_doc(
1875            &store,
1876            "records/contacts/no-tags.md",
1877            "contact",
1878            Some("Plain contact"),
1879            Some("2026-05-26T10:00:00Z"),
1880            "",
1881        );
1882
1883        let idx = Index::build_type_folder(&store, Path::new("records/contacts")).unwrap();
1884        let md = idx.to_markdown();
1885
1886        // Frontmatter is exact and the index's own `updated` is the MAX member
1887        // updated (the determinism the byte-identity invariant rests on).
1888        assert!(md.starts_with(
1889            "---\ntype: index\nscope: type-folder\nfolder: records/contacts\nupdated: 2026-05-27T10:00:00Z\n---\n\n# records/contacts\n"
1890        ), "frontmatter/heading wrong:\n{md}");
1891
1892        // Entry with tags: `— summary  ·  #tag #tag`.
1893        assert!(
1894            md.contains(
1895                "- [[records/contacts/sarah-chen]] — Renewal champion at Acme  ·  #renewal #acme\n"
1896            ),
1897            "tagged entry wrong:\n{md}"
1898        );
1899        // Entry without tags omits the `  ·  ` suffix entirely.
1900        assert!(
1901            md.contains("- [[records/contacts/no-tags]] — Plain contact\n"),
1902            "untagged entry wrong:\n{md}"
1903        );
1904        assert!(
1905            !md.contains("Plain contact  ·"),
1906            "untagged entry must not emit a tag separator"
1907        );
1908        // No `## More` below the cap.
1909        assert!(!md.contains("## More"), "no footer expected under the cap");
1910    }
1911
1912    #[test]
1913    fn missing_summary_becomes_placeholder_not_invented() {
1914        let (_d, store) = mk_store();
1915        write_doc(
1916            &store,
1917            "records/notes/x.md",
1918            "note",
1919            None,
1920            Some("2026-05-27T10:00:00Z"),
1921            "",
1922        );
1923        let idx = Index::build_type_folder(&store, Path::new("records/notes")).unwrap();
1924        assert_eq!(idx.records[0].summary, MISSING_SUMMARY);
1925        let md = idx.to_markdown();
1926        assert!(
1927            md.contains("- [[records/notes/x]] — (no summary)\n"),
1928            "missing summary must render the placeholder, not invent text:\n{md}"
1929        );
1930    }
1931
1932    // ── to_jsonl ─────────────────────────────────────────────────────────
1933
1934    #[test]
1935    fn jsonl_is_complete_structured_and_round_trips() {
1936        let (_d, store) = mk_store();
1937        write_doc(
1938            &store,
1939            "records/expenses/2026/05/e1.md",
1940            "expense",
1941            Some("Lunch with vendor"),
1942            Some("2026-05-10T10:00:00Z"),
1943            "created: 2026-05-10T09:00:00Z\nstatus: paid\namount: 42\ncompany: [[records/companies/acme]]\nrelated:\n  - [[records/concepts/spend]]\ntags:\n  - food\nlinks:\n  - records/concepts/spend\n  - [[records/concepts/renewal]]\n",
1944        );
1945        write_doc(
1946            &store,
1947            "records/expenses/2026/06/e2.md",
1948            "expense",
1949            Some("Cloud bill"),
1950            Some("2026-06-01T10:00:00Z"),
1951            "amount: 100\n",
1952        );
1953
1954        let idx = Index::build_type_folder(&store, Path::new("records/expenses")).unwrap();
1955        let jsonl = idx.to_jsonl();
1956        let lines: Vec<&str> = jsonl.lines().collect();
1957        assert_eq!(lines.len(), 2, "one JSON object per file, uncapped");
1958
1959        // Newest first (e2), and each line parses back to an equal record.
1960        let r0: IndexRecord = serde_json::from_str(lines[0]).unwrap();
1961        assert_eq!(path_to_unix(&r0.path), "records/expenses/2026/06/e2.md");
1962        assert_eq!(
1963            r0, idx.records[0],
1964            "jsonl line must round-trip to the record"
1965        );
1966
1967        // The first (data) record carries every reserved field + the extras in
1968        // `fields` (status/amount), and links/tags verbatim.
1969        let r1: IndexRecord = serde_json::from_str(lines[1]).unwrap();
1970        assert_eq!(r1.type_, "expense");
1971        assert_eq!(r1.summary, "Lunch with vendor");
1972        assert_eq!(r1.tags, vec!["food".to_string()]);
1973        assert_eq!(
1974            r1.links,
1975            vec![
1976                "records/concepts/spend".to_string(),
1977                "[[records/concepts/renewal]]".to_string()
1978            ]
1979        );
1980        assert_eq!(
1981            r1.created,
1982            Some(DateTime::parse_from_rfc3339("2026-05-10T09:00:00Z").unwrap())
1983        );
1984        assert_eq!(r1.fields.get("status"), Some(&Value::from("paid")));
1985        assert_eq!(r1.fields.get("amount"), Some(&Value::from(42)));
1986        assert_eq!(
1987            r1.fields.get("company"),
1988            Some(&Value::from("[[records/companies/acme]]"))
1989        );
1990        assert_eq!(
1991            r1.fields.get("related"),
1992            Some(&serde_json::json!(["[[records/concepts/spend]]"]))
1993        );
1994        // Reserved keys never leak into `fields`.
1995        for reserved in [
1996            "path", "type", "summary", "tags", "links", "created", "updated",
1997        ] {
1998            assert!(
1999                !r1.fields.contains_key(reserved),
2000                "reserved key {reserved} must not appear in fields"
2001            );
2002        }
2003
2004        // Stable key order: declared fields first, then sorted extras.
2005        assert!(
2006            lines[1].starts_with(
2007                r#"{"path":"records/expenses/2026/05/e1.md","type":"expense","summary":"Lunch with vendor","tags":["food"],"links":["records/concepts/spend","[[records/concepts/renewal]]"],"created":"2026-05-10T09:00:00Z","updated":"2026-05-10T10:00:00Z","#
2008            ),
2009            "jsonl key order not stable:\n{}",
2010            lines[1]
2011        );
2012        // The flattened extras come in BTreeMap (sorted) order. The catalog
2013        // injects `meta-type: fact` into every records-layer file that does not
2014        // declare one, so it appears among the sorted extras (between `company`
2015        // and `related`).
2016        assert!(
2017            lines[1].ends_with(r#""amount":42,"company":"[[records/companies/acme]]","meta-type":"fact","related":["[[records/concepts/spend]]"],"status":"paid"}"#),
2018            "extras must be sorted:\n{}",
2019            lines[1]
2020        );
2021    }
2022
2023    // ── cap + footer ─────────────────────────────────────────────────────
2024
2025    #[test]
2026    fn over_cap_md_shows_500_plus_footer_jsonl_holds_all() {
2027        let (_d, store) = mk_store();
2028        let total = MD_CAP + 7;
2029        for i in 0..total {
2030            // Distinct, monotonically increasing `updated` so order is total.
2031            let day = 1 + (i % 27);
2032            let rel = format!("sources/emails/2026/05/m-{i:04}.md");
2033            let updated = format!("2026-05-{day:02}T00:00:{:02}Z", i % 60);
2034            write_doc(
2035                &store,
2036                &rel,
2037                "email",
2038                Some(&format!("mail {i}")),
2039                Some(&updated),
2040                "",
2041            );
2042        }
2043        let idx = Index::build_type_folder(&store, Path::new("sources/emails")).unwrap();
2044        assert_eq!(idx.records.len(), total, "jsonl/records keep every file");
2045
2046        let md = idx.to_markdown();
2047        let entry_lines = md.lines().filter(|l| l.starts_with("- [[")).count();
2048        assert_eq!(entry_lines, MD_CAP, "md browse view is capped at 500");
2049
2050        assert!(
2051            md.contains("## More\n\n"),
2052            "over-cap md needs a More footer"
2053        );
2054        assert!(
2055            md.contains(&format!(
2056                "This folder has {total} files. The 500 most recent are listed above.\n"
2057            )),
2058            "footer count wrong:\n{md}"
2059        );
2060        assert!(
2061            md.contains(
2062                "Use `dbmd index query --type email --in sources` for the complete catalog.\n"
2063            ),
2064            "footer must infer type=email layer=sources:\n{md}"
2065        );
2066
2067        let jsonl = idx.to_jsonl();
2068        assert_eq!(jsonl.lines().count(), total, "jsonl is uncapped");
2069    }
2070
2071    // ── sort total order ─────────────────────────────────────────────────
2072
2073    #[test]
2074    fn sort_breaks_ties_by_path_and_puts_undated_last() {
2075        let mut recs = vec![
2076            rec("z/a.md", Some("2026-05-01T00:00:00Z")),
2077            rec("a/b.md", Some("2026-05-01T00:00:00Z")), // same updated, path < z/a
2078            rec("m/c.md", None),                         // undated → last
2079            rec("b/d.md", Some("2026-06-01T00:00:00Z")), // newest
2080        ];
2081        sort_records(&mut recs);
2082        let order: Vec<String> = recs.iter().map(|r| path_to_unix(&r.path)).collect();
2083        assert_eq!(order, vec!["b/d.md", "a/b.md", "z/a.md", "m/c.md"]);
2084    }
2085
2086    fn rec(path: &str, updated: Option<&str>) -> IndexRecord {
2087        IndexRecord {
2088            path: PathBuf::from(path),
2089            type_: "t".into(),
2090            summary: "s".into(),
2091            tags: vec![],
2092            links: vec![],
2093            created: None,
2094            updated: updated.map(|u| DateTime::parse_from_rfc3339(u).unwrap()),
2095            fields: BTreeMap::new(),
2096        }
2097    }
2098
2099    // ── build_layer / build_root ─────────────────────────────────────────
2100
2101    #[test]
2102    fn layer_index_lists_type_folders_with_counts() {
2103        let (_d, store) = mk_store();
2104        write_doc(
2105            &store,
2106            "records/contacts/a.md",
2107            "contact",
2108            Some("Contact A older"),
2109            Some("2026-05-01T00:00:00Z"),
2110            "",
2111        );
2112        write_doc(
2113            &store,
2114            "records/contacts/b.md",
2115            "contact",
2116            Some("Contact B newest"),
2117            Some("2026-05-09T00:00:00Z"),
2118            "",
2119        );
2120        write_doc(
2121            &store,
2122            "records/companies/x.md",
2123            "company",
2124            Some("Acme Inc"),
2125            Some("2026-05-05T00:00:00Z"),
2126            "",
2127        );
2128        // build the type-folder artifacts first (layer preview reads their jsonl)
2129        Index::write_level(&store, &IndexLevel::TypeFolder("records/contacts".into())).unwrap();
2130        Index::write_level(&store, &IndexLevel::TypeFolder("records/companies".into())).unwrap();
2131
2132        Index::write_level(&store, &IndexLevel::Layer(Layer::Records)).unwrap();
2133        let md = read(&store, "records/index.md");
2134
2135        assert!(
2136            md.starts_with("---\ntype: index\nscope: layer\nfolder: records\n"),
2137            "layer fm:\n{md}"
2138        );
2139        // Alphabetical type-folder order: companies before contacts.
2140        let companies_at = md.find("companies/index").unwrap();
2141        let contacts_at = md.find("contacts/index").unwrap();
2142        assert!(
2143            companies_at < contacts_at,
2144            "type folders must be alphabetical"
2145        );
2146        // Count + display only — with no `## Folders`, the rollup never invents
2147        // a per-folder description from a member summary.
2148        assert!(
2149            md.contains("- [[records/contacts/index|Contacts]] (2)\n"),
2150            "contacts entry:\n{md}"
2151        );
2152        assert!(
2153            md.contains("- [[records/companies/index|Companies]] (1)\n"),
2154            "companies entry:\n{md}"
2155        );
2156        // Crucially: no member summary leaked into the rollup as a description.
2157        assert!(
2158            !md.contains("Contact B newest") && !md.contains("Acme Inc"),
2159            "layer rollup must not quote a member summary:\n{md}"
2160        );
2161        // Layer `updated` is the max across children (contacts b = 05-09).
2162        assert!(
2163            md.contains("updated: 2026-05-09T00:00:00Z\n"),
2164            "layer updated must be max child:\n{md}"
2165        );
2166    }
2167
2168    #[test]
2169    fn folders_section_supplies_authored_display_and_description() {
2170        // The aligned contract: rollups surface the curator's `## Folders`
2171        // display + description; the tool never invents one. A folder with no
2172        // entry shows counts only — no member summary leaks in as a description.
2173        let (_d, mut store) = mk_store();
2174        store.config.folders.insert(
2175            "records/contacts".into(),
2176            crate::parser::FolderMeta {
2177                display: None,
2178                description: Some("people across customer + prospect accounts".into()),
2179            },
2180        );
2181        store.config.folders.insert(
2182            "sources/hubspot-exports".into(),
2183            crate::parser::FolderMeta {
2184                display: Some("HubSpot exports".into()),
2185                description: Some("deal + pipeline exports".into()),
2186            },
2187        );
2188        write_doc(
2189            &store,
2190            "records/contacts/a.md",
2191            "contact",
2192            Some("Contact A"),
2193            Some("2026-05-01T00:00:00Z"),
2194            "",
2195        );
2196        // companies has NO `## Folders` entry → counts only.
2197        write_doc(
2198            &store,
2199            "records/companies/x.md",
2200            "company",
2201            Some("Acme Inc"),
2202            Some("2026-05-05T00:00:00Z"),
2203            "",
2204        );
2205        write_doc(
2206            &store,
2207            "sources/hubspot-exports/d.md",
2208            "hubspot-export",
2209            Some("a single deal export"),
2210            Some("2026-05-03T00:00:00Z"),
2211            "",
2212        );
2213
2214        Index::rebuild_all(&store).unwrap();
2215
2216        // Authored description surfaced (contacts), with the derived display.
2217        let records_layer = read(&store, "records/index.md");
2218        assert!(
2219            records_layer.contains("- [[records/contacts/index|Contacts]] (1) — people across customer + prospect accounts\n"),
2220            "authored description must surface:\n{records_layer}"
2221        );
2222        // No `## Folders` entry ⇒ counts only; the member summary never leaks in.
2223        assert!(
2224            records_layer.contains("- [[records/companies/index|Companies]] (1)\n")
2225                && !records_layer.contains("Acme Inc"),
2226            "un-described folder is counts-only:\n{records_layer}"
2227        );
2228
2229        // Display override beats the derived "Hubspot exports".
2230        let sources_layer = read(&store, "sources/index.md");
2231        assert!(
2232            sources_layer.contains("- [[sources/hubspot-exports/index|HubSpot exports]] (1) — deal + pipeline exports\n"),
2233            "display override + description must surface:\n{sources_layer}"
2234        );
2235
2236        // Root rollup carries the same authored metadata (display + description).
2237        let root = read(&store, "index.md");
2238        assert!(
2239            root.contains("- [[records/contacts/index|Contacts]] (1) — people across customer + prospect accounts\n"),
2240            "root surfaces authored description:\n{root}"
2241        );
2242        assert!(
2243            root.contains("- [[sources/hubspot-exports/index|HubSpot exports]] (1) — deal + pipeline exports\n"),
2244            "root surfaces display override:\n{root}"
2245        );
2246    }
2247
2248    #[test]
2249    fn default_display_turns_separators_to_spaces_and_caps() {
2250        assert_eq!(default_display("contacts"), "Contacts");
2251        assert_eq!(default_display("hubspot-exports"), "Hubspot exports");
2252        assert_eq!(default_display("usage_exports"), "Usage exports");
2253    }
2254
2255    #[test]
2256    fn root_index_groups_layers_with_totals_and_per_type_counts() {
2257        let (_d, store) = mk_store();
2258        write_doc(
2259            &store,
2260            "sources/emails/2026/05/a.md",
2261            "email",
2262            Some("Mail"),
2263            Some("2026-05-01T00:00:00Z"),
2264            "",
2265        );
2266        write_doc(
2267            &store,
2268            "sources/docs/d.md",
2269            "doc",
2270            Some("Doc"),
2271            Some("2026-05-02T00:00:00Z"),
2272            "",
2273        );
2274        write_doc(
2275            &store,
2276            "records/contacts/c.md",
2277            "contact",
2278            Some("C"),
2279            Some("2026-05-03T00:00:00Z"),
2280            "",
2281        );
2282        // wiki empty → no Wiki section
2283
2284        Index::rebuild_all(&store).unwrap();
2285        let md = read(&store, "index.md");
2286
2287        assert!(
2288            md.starts_with("---\ntype: index\nscope: root\n"),
2289            "root fm:\n{md}"
2290        );
2291        assert!(md.contains("# Knowledge base index\n"), "root title:\n{md}");
2292        // Layer heading with total count; Sources before Records (canonical).
2293        let sources_h = md
2294            .find("## Sources (2)")
2295            .expect("sources heading w/ total 2");
2296        let records_h = md
2297            .find("## Records (1)")
2298            .expect("records heading w/ total 1");
2299        assert!(sources_h < records_h, "Sources must precede Records");
2300        assert!(!md.contains("## Wiki"), "empty layer gets no section");
2301        // Per-type sub-entries with (N), no preview at root.
2302        assert!(
2303            md.contains("- [[sources/docs/index|Docs]] (1)\n"),
2304            "root docs entry:\n{md}"
2305        );
2306        assert!(
2307            md.contains("- [[sources/emails/index|Emails]] (1)\n"),
2308            "root emails entry:\n{md}"
2309        );
2310        assert!(
2311            md.contains("- [[records/contacts/index|Contacts]] (1)\n"),
2312            "root contacts entry:\n{md}"
2313        );
2314        assert!(!md.contains("— "), "root entries carry no preview text");
2315    }
2316
2317    // ── write-through == rebuild (THE invariant) ─────────────────────────
2318
2319    #[test]
2320    fn on_write_matches_rebuild_byte_for_byte() {
2321        // Build a store incrementally via on_write, and a second identical store
2322        // via a single rebuild_all, then assert every index artifact is equal.
2323        let (_d1, wt) = mk_store();
2324        let (_d2, rb) = mk_store();
2325
2326        let docs: &[(&str, &str, &str, &str, &str)] = &[
2327            (
2328                "sources/emails/2026/05/e1.md",
2329                "email",
2330                "First mail",
2331                "2026-05-01T10:00:00Z",
2332                "tags:\n  - inbox\n",
2333            ),
2334            (
2335                "sources/emails/2026/06/e2.md",
2336                "email",
2337                "Second mail",
2338                "2026-06-01T10:00:00Z",
2339                "",
2340            ),
2341            (
2342                "records/contacts/sarah.md",
2343                "contact",
2344                "Sarah",
2345                "2026-05-15T10:00:00Z",
2346                "links:\n  - records/profiles/sarah\n",
2347            ),
2348            (
2349                "records/contacts/elena.md",
2350                "contact",
2351                "Elena",
2352                "2026-05-20T10:00:00Z",
2353                "status: active\n",
2354            ),
2355            (
2356                "records/profiles/sarah.md",
2357                "profile",
2358                "Sarah bio",
2359                "2026-05-21T10:00:00Z",
2360                "",
2361            ),
2362        ];
2363
2364        for (rel, t, sum, upd, extra) in docs {
2365            write_doc(&wt, rel, t, Some(sum), Some(upd), extra);
2366            write_doc(&rb, rel, t, Some(sum), Some(upd), extra);
2367            Index::on_write(&wt, Path::new(rel)).unwrap();
2368        }
2369        Index::rebuild_all(&rb).unwrap();
2370
2371        let a = snapshot_artifacts(&wt);
2372        let b = snapshot_artifacts(&rb);
2373        assert_eq!(
2374            a.keys().collect::<Vec<_>>(),
2375            b.keys().collect::<Vec<_>>(),
2376            "same set of index artifacts must exist"
2377        );
2378        for (k, v) in &a {
2379            assert_eq!(v, &b[k], "artifact {k} differs between write-through and rebuild:\n--- write-through ---\n{v}\n--- rebuild ---\n{}", b[k]);
2380        }
2381        // Sanity: artifacts actually exist (not a vacuous comparison of empties).
2382        assert!(a.contains_key("index.md"));
2383        assert!(a.contains_key("sources/emails/index.jsonl"));
2384        assert!(a.contains_key("records/contacts/index.md"));
2385    }
2386
2387    /// Regression (O(changed) bound, not just correctness): a loop op must
2388    /// recompute its parent rollups from the type-folder `index.jsonl` sidecars
2389    /// — never by walking the content tree of *sibling* folders it wasn't asked
2390    /// about. The byte-identity property test (which always indexes every folder
2391    /// before comparing) can't catch a violation, because a full-store walk
2392    /// produces the *correct* counts too; it just does so in `O(store files)`.
2393    ///
2394    /// The behavioral fingerprint of the old `update_parents → build_layer /
2395    /// build_root` (which called `walk_type_folder_files` on every type-folder in
2396    /// the store): a single `on_write` to `records/contacts/sarah.md` would
2397    /// surface, in the layer + root rollups, the file count of
2398    /// `records/companies` — a sibling that has content on disk but was NEVER
2399    /// passed to a write/index op, so it has no `index.jsonl`. An O(changed) loop
2400    /// op cannot "see" that un-indexed folder; a whole-store walk can. So this
2401    /// asserts the rollups reflect ONLY the sidecar-indexed folder, proving no
2402    /// content-tree walk happened.
2403    #[test]
2404    fn loop_op_does_not_walk_sibling_content_tree() {
2405        let (_d, store) = mk_store();
2406
2407        // A sibling type-folder with real content on disk, but deliberately
2408        // never indexed (no on_write / write_level / rebuild over it) ⇒ no
2409        // `records/companies/index.jsonl` exists.
2410        write_doc(
2411            &store,
2412            "records/companies/acme.md",
2413            "company",
2414            Some("Acme Inc"),
2415            Some("2026-05-05T00:00:00Z"),
2416            "",
2417        );
2418        write_doc(
2419            &store,
2420            "records/companies/globex.md",
2421            "company",
2422            Some("Globex"),
2423            Some("2026-05-06T00:00:00Z"),
2424            "",
2425        );
2426        assert!(
2427            !exists(&store, "records/companies/index.jsonl"),
2428            "precondition: companies must be un-indexed"
2429        );
2430
2431        // The ONLY loop op: a single write to a different type-folder.
2432        write_doc(
2433            &store,
2434            "records/contacts/sarah.md",
2435            "contact",
2436            Some("Sarah"),
2437            Some("2026-05-15T00:00:00Z"),
2438            "",
2439        );
2440        Index::on_write(&store, Path::new("records/contacts/sarah.md")).unwrap();
2441
2442        // The written folder is reflected in both rollups...
2443        let layer_md = read(&store, "records/index.md");
2444        let root_md = read(&store, "index.md");
2445        // (both rollups show counts only — no `## Folders` here, so no preview)
2446        assert!(
2447            layer_md.contains("- [[records/contacts/index|Contacts]] (1)\n")
2448                && !layer_md.contains("Sarah"),
2449            "layer must reflect the written folder, counts only:\n{layer_md}"
2450        );
2451        assert!(
2452            root_md.contains("- [[records/contacts/index|Contacts]] (1)\n"),
2453            "root must reflect the written folder:\n{root_md}"
2454        );
2455
2456        // ...but the un-indexed sibling must be INVISIBLE to a loop op. If the
2457        // rollups mention `records/companies` at all, `on_write` walked the whole
2458        // content tree — the O(store) regression.
2459        assert!(
2460            !layer_md.contains("companies"),
2461            "loop op walked the sibling content tree: layer rollup counts un-indexed records/companies\n{layer_md}"
2462        );
2463        assert!(
2464            !root_md.contains("companies"),
2465            "loop op walked the sibling content tree: root rollup counts un-indexed records/companies\n{root_md}"
2466        );
2467        // The layer's only child is contacts ⇒ its total is exactly 1, not 3.
2468        assert!(
2469            root_md.contains("## Records (1)"),
2470            "root layer total must count only the sidecar-indexed folder (1), not walked siblings (would be 3):\n{root_md}"
2471        );
2472
2473        // And the sidecar-derived count IS what a full walk WOULD yield once the
2474        // sibling is indexed too — i.e. the fix changes cost, not the eventual
2475        // result. Index companies, then confirm the rollups now (and only now)
2476        // include it, byte-identical to a from-scratch rebuild.
2477        let (_d2, rb) = mk_store();
2478        for (rel, t, s, u) in [
2479            (
2480                "records/companies/acme.md",
2481                "company",
2482                "Acme Inc",
2483                "2026-05-05T00:00:00Z",
2484            ),
2485            (
2486                "records/companies/globex.md",
2487                "company",
2488                "Globex",
2489                "2026-05-06T00:00:00Z",
2490            ),
2491            (
2492                "records/contacts/sarah.md",
2493                "contact",
2494                "Sarah",
2495                "2026-05-15T00:00:00Z",
2496            ),
2497        ] {
2498            write_doc(&rb, rel, t, Some(s), Some(u), "");
2499        }
2500        Index::on_write(&store, Path::new("records/companies/acme.md")).unwrap();
2501        Index::on_write(&store, Path::new("records/companies/globex.md")).unwrap();
2502        Index::rebuild_all(&rb).unwrap();
2503        let a = snapshot_artifacts(&store);
2504        let b = snapshot_artifacts(&rb);
2505        assert_eq!(
2506            a.keys().collect::<BTreeSet<_>>(),
2507            b.keys().collect::<BTreeSet<_>>(),
2508            "same artifact set after indexing both folders"
2509        );
2510        for (k, v) in &a {
2511            assert_eq!(
2512                v, &b[k],
2513                "after indexing the sibling too, loop result must equal rebuild for {k}"
2514            );
2515        }
2516        assert!(
2517            read(&store, "index.md").contains("## Records (3)"),
2518            "now that both folders are indexed, the root total is 3"
2519        );
2520    }
2521
2522    /// Regression: a type filed at the path the toolkit ITSELF computes
2523    /// (`Store::shard_path_for`) must be indexable end-to-end. The class of bug
2524    /// is a 2-component `<layer>/<file>` path, which `type_folder_of` treats as
2525    /// having no type-folder — making the producer (path computation) disagree
2526    /// with the consumer (index): the loop path crashes (`on_write` → `Err`, it
2527    /// tries to write `index.md` *inside* a file) while the sweep path silently
2528    /// drops the page from every catalog. A conclusion `profile` is a custom
2529    /// (non-built-in) type, so `shard_path_for` files it under the records-layer
2530    /// fallback `records/profile/<file>` — a conforming 3-component path. This test
2531    /// drives both paths through the real `shard_path_for` output and asserts
2532    /// (1) `on_write` succeeds, (2) the page appears in the rebuilt catalog, and
2533    /// (3) write-through == rebuild.
2534    #[test]
2535    fn custom_type_at_shard_path_for_is_indexable_end_to_end() {
2536        let (_d1, wt) = mk_store();
2537        let (_d2, rb) = mk_store();
2538
2539        // The toolkit's own canonical write path for a custom-type record.
2540        let rel = wt
2541            .shard_path_for(
2542                "profile",
2543                &crate::parser::Frontmatter::default(),
2544                "renewal-theme",
2545            )
2546            .unwrap();
2547        let rel_str = path_to_unix(&rel);
2548        // Guard the precondition the consumer requires: 3+ components so
2549        // `type_folder_of` resolves a real `<layer>/<type-folder>`.
2550        assert!(
2551            type_folder_of(&rel).is_some(),
2552            "shard_path_for produced a path the index cannot file: {rel_str}"
2553        );
2554
2555        write_doc(
2556            &wt,
2557            &rel_str,
2558            "profile",
2559            Some("Renewal theme"),
2560            Some("2026-05-21T10:00:00Z"),
2561            "",
2562        );
2563        write_doc(
2564            &rb,
2565            &rel_str,
2566            "profile",
2567            Some("Renewal theme"),
2568            Some("2026-05-21T10:00:00Z"),
2569            "",
2570        );
2571
2572        // (1) Loop path must NOT error (a 2-component `<layer>/<file>` shape
2573        // returned Err(Io(NotADirectory))).
2574        Index::on_write(&wt, &rel)
2575            .expect("on_write must succeed for a toolkit-computed custom-type path");
2576        Index::rebuild_all(&rb).unwrap();
2577
2578        // (2) The page is present in the rebuilt catalog (the old flat-path bug
2579        // silently omitted it from every artifact). The individual page link
2580        // lives in the *type-folder* index; the *layer* index rolls the
2581        // type-folder up — assert both, since the bug erased both. A custom
2582        // type's canonical folder is the records-layer fallback `records/profile`.
2583        let page_link = wiki_target(&rel); // records/profile/renewal-theme
2584        let tf_md = read(&rb, "records/profile/index.md");
2585        assert!(
2586            tf_md.contains(&format!("[[{page_link}]]")),
2587            "type-folder index must list the page link, got:\n{tf_md}"
2588        );
2589        assert!(
2590            exists(&rb, "records/profile/index.jsonl"),
2591            "type-folder jsonl must exist"
2592        );
2593        assert!(
2594            read(&rb, "records/profile/index.jsonl").contains(&rel_str),
2595            "type-folder jsonl must contain the page row"
2596        );
2597        // The layer index rolls the type-folder up (proves the page's folder is
2598        // visible to the layer catalog, not dropped).
2599        let layer_md = read(&rb, "records/index.md");
2600        assert!(
2601            layer_md.contains("records/profile/index"),
2602            "layer index must roll up the records/profile type-folder, got:\n{layer_md}"
2603        );
2604
2605        // (3) Write-through equals rebuild byte-for-byte — loop and sweep agree.
2606        let a = snapshot_artifacts(&wt);
2607        let b = snapshot_artifacts(&rb);
2608        assert_eq!(
2609            a.keys().collect::<Vec<_>>(),
2610            b.keys().collect::<Vec<_>>(),
2611            "loop and sweep must produce the same artifact set"
2612        );
2613        for (k, v) in &a {
2614            assert_eq!(
2615                v, &b[k],
2616                "custom-type artifact {k} differs between on_write and rebuild"
2617            );
2618        }
2619    }
2620
2621    #[test]
2622    fn on_remove_then_rebuild_match_and_pull_in_next_over_cap() {
2623        let (_d1, wt) = mk_store();
2624        let (_d2, rb) = mk_store();
2625        let total = MD_CAP + 3; // 503 files; removing one keeps md full at 500
2626        let mut all_rels = Vec::new();
2627        for i in 0..total {
2628            let rel = format!("sources/emails/2026/05/m-{i:04}.md");
2629            // `updated` strictly increasing across i by varying both minute and second
2630            let updated = format!("2026-05-10T00:{:02}:{:02}Z", i / 60, i % 60);
2631            write_doc(
2632                &wt,
2633                &rel,
2634                "email",
2635                Some(&format!("mail {i}")),
2636                Some(&updated),
2637                "",
2638            );
2639            write_doc(
2640                &rb,
2641                &rel,
2642                "email",
2643                Some(&format!("mail {i}")),
2644                Some(&updated),
2645                "",
2646            );
2647            all_rels.push(rel);
2648        }
2649        // Build write-through index, then remove the single newest file.
2650        Index::rebuild_all(&wt).unwrap();
2651        let newest = &all_rels[total - 1]; // highest i = newest updated
2652        fs::remove_file(wt.root.join(newest)).unwrap();
2653        Index::on_remove(&wt, Path::new(newest)).unwrap();
2654
2655        // Rebuild side: same end state (file physically absent).
2656        fs::remove_file(rb.root.join(newest)).unwrap();
2657        Index::rebuild_all(&rb).unwrap();
2658
2659        let a = snapshot_artifacts(&wt);
2660        let b = snapshot_artifacts(&rb);
2661        for (k, v) in &a {
2662            assert_eq!(v, &b[k], "after remove, artifact {k} drifted from rebuild");
2663        }
2664
2665        // The md must still hold exactly 500 entries (the 501st got pulled in)
2666        // and the removed file must be gone from both artifacts.
2667        let md = read(&wt, "sources/emails/index.md");
2668        assert_eq!(md.lines().filter(|l| l.starts_with("- [[")).count(), MD_CAP);
2669        // Removed (newest) file is gone from the bare-path md and the .md jsonl.
2670        assert!(
2671            !md.contains(&format!("[[{}]]", wiki_target(Path::new(newest)))),
2672            "removed file must not be listed in md"
2673        );
2674        // The file previously at rank 501 (excluded under the cap) is `all_rels[2]`
2675        // — `updated` increases with index, so newest-first rank 500 = index 2.
2676        // After dropping the newest it shifts into the visible 500.
2677        let pulled_in = &all_rels[2];
2678        assert!(
2679            md.contains(&format!("[[{}]]", wiki_target(Path::new(pulled_in)))),
2680            "the 501st-most-recent must be pulled into the browse view after a removal"
2681        );
2682        assert!(
2683            md.contains(&format!("This folder has {} files.", total - 1)),
2684            "footer count must decrement:\n{}",
2685            md.lines().rev().take(4).collect::<Vec<_>>().join("\n")
2686        );
2687        let jsonl = read(&wt, "sources/emails/index.jsonl");
2688        assert_eq!(
2689            jsonl.lines().count(),
2690            total - 1,
2691            "jsonl loses exactly the removed file"
2692        );
2693        assert!(
2694            !jsonl.contains(&path_to_unix(Path::new(newest))),
2695            "removed file must be gone from the jsonl too"
2696        );
2697    }
2698
2699    #[test]
2700    fn on_rename_cross_folder_matches_rebuild() {
2701        let (_d1, wt) = mk_store();
2702        let (_d2, rb) = mk_store();
2703        // Seed both stores identically.
2704        let seed: &[(&str, &str, &str, &str)] = &[
2705            (
2706                "records/contacts/a.md",
2707                "contact",
2708                "A",
2709                "2026-05-01T00:00:00Z",
2710            ),
2711            (
2712                "records/contacts/b.md",
2713                "contact",
2714                "B",
2715                "2026-05-02T00:00:00Z",
2716            ),
2717            (
2718                "records/companies/x.md",
2719                "company",
2720                "X",
2721                "2026-05-03T00:00:00Z",
2722            ),
2723        ];
2724        for (rel, t, s, u) in seed {
2725            write_doc(&wt, rel, t, Some(s), Some(u), "");
2726            write_doc(&rb, rel, t, Some(s), Some(u), "");
2727        }
2728        Index::rebuild_all(&wt).unwrap();
2729
2730        // Rename contacts/b.md -> companies/b.md (cross type-folder). The file's
2731        // `type` changes to match its new folder, as a real `dbmd rename` would.
2732        let old = "records/contacts/b.md";
2733        let new = "records/companies/b.md";
2734        fs::create_dir_all(wt.root.join("records/companies")).unwrap();
2735        fs::rename(wt.root.join(old), wt.root.join(new)).unwrap();
2736        // (type stays "contact" here; index copies frontmatter verbatim — the
2737        // test only asserts placement + parity with rebuild.)
2738        Index::on_rename(&wt, Path::new(old), Path::new(new)).unwrap();
2739
2740        // Rebuild side: same end state.
2741        fs::create_dir_all(rb.root.join("records/companies")).unwrap();
2742        fs::rename(rb.root.join(old), rb.root.join(new)).unwrap();
2743        Index::rebuild_all(&rb).unwrap();
2744
2745        let a = snapshot_artifacts(&wt);
2746        let b = snapshot_artifacts(&rb);
2747        assert_eq!(a.keys().collect::<Vec<_>>(), b.keys().collect::<Vec<_>>());
2748        for (k, v) in &a {
2749            assert_eq!(v, &b[k], "rename: artifact {k} drifted from rebuild");
2750        }
2751        // Concretely: b is gone from contacts, present in companies.
2752        let contacts = read(&wt, "records/contacts/index.md");
2753        assert!(!contacts.contains("records/contacts/b]]"));
2754        let companies = read(&wt, "records/companies/index.md");
2755        assert!(companies.contains("[[records/companies/b]]"));
2756    }
2757
2758    #[test]
2759    fn on_write_updates_existing_entry_in_place() {
2760        let (_d, store) = mk_store();
2761        write_doc(
2762            &store,
2763            "records/contacts/a.md",
2764            "contact",
2765            Some("Original"),
2766            Some("2026-05-01T00:00:00Z"),
2767            "",
2768        );
2769        Index::on_write(&store, Path::new("records/contacts/a.md")).unwrap();
2770        // Edit the same file: new summary + newer updated.
2771        write_doc(
2772            &store,
2773            "records/contacts/a.md",
2774            "contact",
2775            Some("Revised"),
2776            Some("2026-05-09T00:00:00Z"),
2777            "",
2778        );
2779        Index::on_write(&store, Path::new("records/contacts/a.md")).unwrap();
2780
2781        let jsonl = read(&store, "records/contacts/index.jsonl");
2782        assert_eq!(
2783            jsonl.lines().count(),
2784            1,
2785            "upsert must not duplicate the line"
2786        );
2787        assert!(jsonl.contains("Revised"), "jsonl must reflect the update");
2788        assert!(
2789            !jsonl.contains("Original"),
2790            "stale line must be gone (compacted)"
2791        );
2792        let md = read(&store, "records/contacts/index.md");
2793        assert!(md.contains("- [[records/contacts/a]] — Revised\n"));
2794        assert!(
2795            md.contains("updated: 2026-05-09T00:00:00Z\n"),
2796            "index updated must track the newer member"
2797        );
2798    }
2799
2800    // ── dry-run + cleanup ────────────────────────────────────────────────
2801
2802    #[test]
2803    fn dry_run_emits_separators_and_writes_nothing() {
2804        let (_d, store) = mk_store();
2805        write_doc(
2806            &store,
2807            "sources/emails/2026/05/a.md",
2808            "email",
2809            Some("Mail"),
2810            Some("2026-05-01T00:00:00Z"),
2811            "",
2812        );
2813        let out = Index::render_dry_run(&store, &IndexLevel::TypeFolder("sources/emails".into()))
2814            .unwrap();
2815        assert!(
2816            out.contains("--- sources/emails/index.md ---\n"),
2817            "md separator:\n{out}"
2818        );
2819        assert!(
2820            out.contains("--- sources/emails/index.jsonl ---\n"),
2821            "jsonl separator:\n{out}"
2822        );
2823        assert!(
2824            out.contains("- [[sources/emails/2026/05/a]] — Mail"),
2825            "md body present"
2826        );
2827        // Nothing was written to disk.
2828        assert!(
2829            !exists(&store, "sources/emails/index.md"),
2830            "dry-run must not write"
2831        );
2832        assert!(
2833            !exists(&store, "sources/emails/index.jsonl"),
2834            "dry-run must not write"
2835        );
2836    }
2837
2838    #[test]
        /// `Index::cleanup` must delete index artifacts that sit in non-canonical
        /// places: an `index.md` / `index.jsonl` pair written inside a date shard,
        /// and an `index.md` in a type-folder (`records/empty`) that holds no
        /// content files. The one real content file must be left in place.
2839    fn cleanup_removes_noncanonical_and_empty_indexes() {
2840        let (_d, store) = mk_store();
            // One real content file, so `sources/emails` is a non-empty type-folder.
2841        write_doc(
2842            &store,
2843            "sources/emails/2026/05/a.md",
2844            "email",
2845            Some("Mail"),
2846            Some("2026-05-01T00:00:00Z"),
2847            "",
2848        );
2849        // A stray index inside a date-shard (non-canonical) ...
            // (both stray files contain only the text `stale\n`, i.e. no frontmatter)
2850        fs::write(
2851            store.root.join("sources/emails/2026/05/index.md"),
2852            "stale\n",
2853        )
2854        .unwrap();
2855        fs::write(
2856            store.root.join("sources/emails/2026/05/index.jsonl"),
2857            "stale\n",
2858        )
2859        .unwrap();
2860        // ... and an index in an empty type-folder.
            // Only `index.md` is planted here; no `index.jsonl` and no content file.
2861        fs::create_dir_all(store.root.join("records/empty")).unwrap();
2862        fs::write(store.root.join("records/empty/index.md"), "stale\n").unwrap();
2863
2864        Index::cleanup(&store).unwrap();
2865
2866        assert!(
2867            !exists(&store, "sources/emails/2026/05/index.md"),
2868            "shard index must be deleted"
2869        );
2870        assert!(
2871            !exists(&store, "sources/emails/2026/05/index.jsonl"),
2872            "shard jsonl must be deleted"
2873        );
2874        assert!(
2875            !exists(&store, "records/empty/index.md"),
2876            "empty-folder index must be deleted"
2877        );
2878        // The content file inside the shard is untouched by cleanup.
2879        assert!(exists(&store, "sources/emails/2026/05/a.md"));
2880    }
2881
2882    #[test]
        /// After the only content file of a type-folder is removed, a second
        /// `Index::rebuild_all` must leave no `index.md` at the type-folder, layer,
        /// or root level — stale catalogs from the first rebuild are deleted.
2883    fn rebuild_deletes_stale_indexes_for_emptied_folders() {
2884        let (_d, store) = mk_store();
2885        write_doc(
2886            &store,
2887            "records/contacts/a.md",
2888            "contact",
2889            Some("A"),
2890            Some("2026-05-01T00:00:00Z"),
2891            "",
2892        );
            // First rebuild with one file present: all three index levels exist.
2893        Index::rebuild_all(&store).unwrap();
2894        assert!(exists(&store, "records/contacts/index.md"));
2895        assert!(exists(&store, "records/index.md"));
2896        assert!(exists(&store, "index.md"));
2897
2898        // Empty the folder entirely, then rebuild: all three levels vanish.
2899        fs::remove_file(store.root.join("records/contacts/a.md")).unwrap();
2900        Index::rebuild_all(&store).unwrap();
2901        assert!(
2902            !exists(&store, "records/contacts/index.md"),
2903            "emptied type-folder index gone"
2904        );
2905        assert!(
2906            !exists(&store, "records/index.md"),
2907            "now-empty layer index gone"
2908        );
2909        assert!(!exists(&store, "index.md"), "now-empty root index gone");
2910    }
2911
2912    // ── randomized parity (property-style) ───────────────────────────────
2913
2914    #[test]
        /// Property-style parity check between the two index maintenance paths.
        ///
        /// Two stores are driven through the same 120-step sequence of file
        /// operations (create/update, remove, rename). On `wt` every operation is
        /// followed by the matching write-through hook (`on_write` / `on_remove` /
        /// `on_rename`); on `rb` only the files are changed and a single
        /// `rebuild_all` runs at the end. The artifact snapshots of both stores
        /// must then have the same keys and the same contents.
        ///
        /// The op sequence comes from a fixed-seed LCG, so the run is reproducible.
2915    fn property_writethrough_equals_rebuild_under_mixed_ops() {
2916        // Deterministic pseudo-random op sequence (no rand crate): a small LCG.
2917        let (_d1, wt) = mk_store();
2918        let (_d2, rb) = mk_store();
2919        let mut seed: u64 = 0x9E3779B97F4A7C15;
            // Each call advances the state with wrapping arithmetic and returns the
            // bits above bit 32 of the new state as a u32.
2920        let mut next = || {
2921            seed = seed
2922                .wrapping_mul(6364136223846793005)
2923                .wrapping_add(1442695040888963407);
2924            (seed >> 33) as u32
2925        };
2926
            // `folders[i]` is paired with `types[i]` when a file is created.
2927        let folders = ["sources/emails", "records/contacts", "records/profiles"];
2928        let types = ["email", "contact", "profile"];
2929        let mut live: Vec<String> = Vec::new(); // store-relative paths that exist
2930
2931        for step in 0..120u32 {
2932            let r = next();
                // op 0-5: create/update, 6-7: remove, 8-9: rename. With no live file
                // there is nothing to remove or rename, so create is forced.
2933            let op = r % 10;
2934            if op < 6 || live.is_empty() {
2935                // CREATE/UPDATE
2936                let fi = (next() as usize) % folders.len();
2937                let folder = folders[fi];
                    // ids 0..40 — a repeated id rewrites an existing path (UPDATE).
2938                let id = next() % 40;
2939                let rel = if folder == "sources/emails" {
2940                    let month = 5 + (id % 2); // shard across two months
2941                    format!("{folder}/2026/{month:02}/f-{id:02}.md")
2942                } else {
2943                    format!("{folder}/f-{id:02}.md")
2944                };
2945                // recency varies with step so order is meaningful + total
2946                let updated = format!(
2947                    "2026-05-{:02}T{:02}:{:02}:00Z",
2948                    1 + (step % 27),
2949                    step % 24,
2950                    id % 60
2951                );
                    // Every third id also carries a two-item `tags` list.
2952                let extra = if id % 3 == 0 {
2953                    "tags:\n  - x\n  - y\n"
2954                } else {
2955                    ""
2956                };
                    // Identical file contents go to both stores ...
2957                write_doc(
2958                    &wt,
2959                    &rel,
2960                    types[fi],
2961                    Some(&format!("sum {step}")),
2962                    Some(&updated),
2963                    extra,
2964                );
2965                write_doc(
2966                    &rb,
2967                    &rel,
2968                    types[fi],
2969                    Some(&format!("sum {step}")),
2970                    Some(&updated),
2971                    extra,
2972                );
                    // ... but only the write-through store gets the index hook.
2973                Index::on_write(&wt, Path::new(&rel)).unwrap();
2974                if !live.contains(&rel) {
2975                    live.push(rel);
2976                }
2977            } else if op < 8 {
2978                // REMOVE a live file
2979                let idx = (next() as usize) % live.len();
2980                let rel = live.remove(idx);
2981                fs::remove_file(wt.root.join(&rel)).unwrap();
                    // `rb` mirrors every file operation applied to `wt`, so the file
                    // must exist here too; fail loudly if the mirror ever diverges
                    // instead of letting the final comparison report a bogus drift.
2982                fs::remove_file(rb.root.join(&rel)).unwrap();
2983                Index::on_remove(&wt, Path::new(&rel)).unwrap();
2984            } else {
2985                // RENAME a live file (new id; destination may be another type-folder or layer)
2986                let idx = (next() as usize) % live.len();
2987                let old = live[idx].clone();
2988                // pick the destination from the full folder set (both layers)
2989                let fi = (next() as usize) % folders.len();
2990                let folder = folders[fi];
                    // ids 50..90 never collide with the 0..40 ids used by CREATE.
2991                let id = 50 + (next() % 40);
2992                let new = if folder == "sources/emails" {
2993                    format!("{folder}/2026/05/f-{id:02}.md")
2994                } else {
2995                    format!("{folder}/f-{id:02}.md")
2996                };
                    // Skip a no-op rename and any rename onto another live file.
2997                if new == old || live.contains(&new) {
2998                    continue;
2999                }
3000                fs::create_dir_all(wt.root.join(&new).parent().unwrap()).unwrap();
3001                fs::create_dir_all(rb.root.join(&new).parent().unwrap()).unwrap();
3002                fs::rename(wt.root.join(&old), wt.root.join(&new)).unwrap();
3003                fs::rename(rb.root.join(&old), rb.root.join(&new)).unwrap();
3004                Index::on_rename(&wt, Path::new(&old), Path::new(&new)).unwrap();
3005                live[idx] = new;
3006            }
3007        }
3008
3009        // Now rebuild the rb side from the shared end state and compare.
3010        Index::rebuild_all(&rb).unwrap();
3011        let a = snapshot_artifacts(&wt);
3012        let b = snapshot_artifacts(&rb);
            // Compare the key sets first so the `b[k]` lookups below cannot panic on
            // a key that exists on only one side.
3013        assert_eq!(
3014            a.keys().collect::<BTreeSet<_>>(),
3015            b.keys().collect::<BTreeSet<_>>(),
3016            "write-through and rebuild must produce the same set of artifacts"
3017        );
3018        for (k, v) in &a {
3019            assert_eq!(
3020                v, &b[k],
3021                "INVARIANT VIOLATED: artifact {k} differs after mixed ops\n--- write-through ---\n{v}\n--- rebuild ---\n{}",
3022                b[k]
3023            );
3024        }
            // Guard against a vacuous pass where neither side produced anything.
3025        assert!(
3026            !a.is_empty(),
3027            "the run must have produced at least one artifact"
3028        );
3029    }
3030
3031    // ── regressions: cleanup must not delete user content ─────────────────
3032
3033    /// CRITICAL regression: a user content file named `index.md` inside a date
3034    /// shard (e.g. from a website/doc-export mirror) must SURVIVE `cleanup` /
3035    /// `rebuild_all`. The old filename-only match silently deleted it.
        ///
        /// The file is checked three times: it exists after `cleanup`, it exists
        /// after `rebuild_all`, and its content still contains the original summary.
3036    #[test]
3037    fn cleanup_preserves_user_content_named_index_md_in_shard() {
3038        let (_d, store) = mk_store();
3039        // A real content record that merely happens to be named index.md.
3040        write_doc(
3041            &store,
3042            "sources/emails/2026/06/index.md",
3043            "email",
3044            Some("Important imported mail"),
3045            Some("2026-06-11T04:23:25Z"),
3046            "",
3047        );
3048        Index::cleanup(&store).unwrap();
3049        assert!(
3050            exists(&store, "sources/emails/2026/06/index.md"),
3051            "cleanup must not delete a user content file named index.md"
3052        );
3053        // A full rebuild (which runs cleanup first) must also preserve it.
3054        Index::rebuild_all(&store).unwrap();
3055        assert!(
3056            exists(&store, "sources/emails/2026/06/index.md"),
3057            "rebuild_all must not delete a user content file named index.md"
3058        );
            // Existence alone is not enough: the file must not have been overwritten.
3059        let kept = read(&store, "sources/emails/2026/06/index.md");
3060        assert!(
3061            kept.contains("Important imported mail"),
3062            "the user's record content must be intact"
3063        );
3064    }
3065
3066    /// HIGH regression: `cleanup` uses `min_depth(2)`, so the canonical
3067    /// type-folder-root `index.md`/`index.jsonl` are NOT deleted up front. A
3068    /// genuine generated catalog at the type-folder root survives a cleanup pass
3069    /// (it is only ever rewritten, or removed when the folder is truly empty).
3070    #[test]
3071    fn cleanup_keeps_canonical_type_folder_root_sidecars() {
3072        let (_d, store) = mk_store();
3073        write_doc(
3074            &store,
3075            "records/contacts/alice.md",
3076            "contact",
3077            Some("Alice"),
3078            Some("2026-05-01T00:00:00Z"),
3079            "",
3080        );
            // Generate the catalog for this one type-folder and confirm both
            // sidecars exist before cleanup runs.
3081        Index::write_level(&store, &IndexLevel::TypeFolder("records/contacts".into())).unwrap();
3082        assert!(exists(&store, "records/contacts/index.md"));
3083        assert!(exists(&store, "records/contacts/index.jsonl"));
3084        Index::cleanup(&store).unwrap();
            // The folder still holds `alice.md`, so both sidecars must remain.
3085        assert!(
3086            exists(&store, "records/contacts/index.md"),
3087            "cleanup must keep the canonical type-folder index.md (non-empty folder)"
3088        );
3089        assert!(
3090            exists(&store, "records/contacts/index.jsonl"),
3091            "cleanup must keep the canonical type-folder index.jsonl (non-empty folder)"
3092        );
3093    }
3094
3095    // ── regression: write-through must not catalog index artifacts ────────
3096
3097    /// HIGH regression: routing a generated `index.md` through `on_write` (as
3098    /// `dbmd fm set records/contacts/index.md …` would) must NOT insert a phantom
3099    /// self-row — counts and bytes stay equal to a rebuild.
3100    #[test]
3101    fn on_write_ignores_index_artifact_no_phantom_row() {
3102        let (_d, store) = mk_store();
3103        write_doc(
3104            &store,
3105            "records/contacts/alice.md",
3106            "contact",
3107            Some("Alice"),
3108            Some("2026-05-01T00:00:00Z"),
3109            "",
3110        );
            // Baseline: one content file written through => exactly one jsonl line.
3111        Index::on_write(&store, Path::new("records/contacts/alice.md")).unwrap();
3112        let jsonl_before = read(&store, "records/contacts/index.jsonl");
3113        assert_eq!(jsonl_before.lines().count(), 1);
3114
3115        // Tamper: route the catalog file itself through on_write.
3116        Index::on_write(&store, Path::new("records/contacts/index.md")).unwrap();
3117
            // The jsonl must still hold one line and no row typed `index`.
3118        let jsonl_after = read(&store, "records/contacts/index.jsonl");
3119        assert_eq!(
3120            jsonl_after.lines().count(),
3121            1,
3122            "on_write on index.md must not add a phantom self-row"
3123        );
3124        assert!(
3125            !jsonl_after.contains("\"type\":\"index\""),
3126            "the catalog artifact must never appear as a catalogued row"
3127        );
3128        // Root rollup count stays 1 (not inflated to 2).
3129        let root = read(&store, "index.md");
3130        assert!(
3131            root.contains("[[records/contacts/index|Contacts]] (1)"),
3132            "count must not inflate:\n{root}"
3133        );
3134    }
3135
3136    // ── regression: multi-line summary cannot inject a catalog line ───────
3137
3138    /// HIGH regression: a block-scalar summary spanning multiple lines must be
3139    /// collapsed to one line in the browse entry, so it cannot forge a standalone
3140    /// `- [[…]]` catalog line.
3141    #[test]
3142    fn multiline_summary_is_single_lined_in_index_md() {
3143        let (_d, store) = mk_store();
3144        // A YAML block scalar whose value embeds a forged-looking entry line.
            // The second summary line starts with `- [[`, the same prefix the
            // entry-line filter below counts.
3145        write_raw(
3146            &store,
3147            "records/notes/evil.md",
3148            "type: note\nupdated: 2026-06-10T00:00:00Z\nsummary: |-\n  legit first line\n  - [[records/secrets/fake|Click me]] — injected entry",
3149            "\nbody\n",
3150        );
            // Build in memory and render; nothing is read back from disk here.
3151        let idx = Index::build_type_folder(&store, Path::new("records/notes")).unwrap();
3152        let md = idx.to_markdown();
3153        // Exactly one browse entry line, and no embedded newline forging a second.
3154        let entry_lines = md.lines().filter(|l| l.starts_with("- [[")).count();
3155        assert_eq!(
3156            entry_lines, 1,
3157            "a multi-line summary must not produce extra entry lines:\n{md}"
3158        );
            // The forged text is kept, but inline: the summary's newline has become
            // a single space within the one legitimate entry line.
3159        assert!(
3160            md.contains(
3161                "- [[records/notes/evil]] — legit first line - [[records/secrets/fake|Click me]] — injected entry\n"
3162            ),
3163            "summary newlines must collapse to spaces inline:\n{md}"
3164        );
3165    }
3166
3167    // ── regression: writer/validator scalar coercion agreement ────────────
3168
3169    /// HIGH regression: an unquoted non-string scalar `summary`/`type`
3170    /// (`summary: 2026`, `type: true`) must be coerced to a string by the index
3171    /// writer exactly as `validate::scalar_string` does — so the index entry holds
3172    /// the real value (`2026`), not the `(no summary)` placeholder that produced a
3173    /// permanently-unfixable INDEX_SUMMARY_MISMATCH.
3174    #[test]
3175    fn non_string_scalar_summary_and_type_are_coerced_like_validator() {
3176        let (_d, store) = mk_store();
            // File `a.md`: a numeric summary (`2026`) with an ordinary string type.
3177        write_raw(
3178            &store,
3179            "records/contacts/a.md",
3180            "type: contact\nupdated: 2026-05-01T00:00:00Z\nsummary: 2026",
3181            "\nbody\n",
3182        );
            // Project the file directly: absolute path to read, store-relative path
            // passed alongside it.
3183        let rec = record_from_file(
3184            &store.root.join("records/contacts/a.md"),
3185            PathBuf::from("records/contacts/a.md"),
3186        )
3187        .unwrap();
3188        // `summary: 2026` (YAML number) coerces to the string "2026", matching
3189        // the validator's `scalar_string` (Number -> n.to_string()).
3190        assert_eq!(rec.summary, "2026");
3191        assert_eq!(rec.type_, "contact");
3192
3193        // And the rendered index entry quotes the real value, not the placeholder.
3194        let idx = Index::build_type_folder(&store, Path::new("records/contacts")).unwrap();
3195        let md = idx.to_markdown();
3196        assert!(
3197            md.contains("- [[records/contacts/a]] — 2026\n"),
3198            "index entry must hold the coerced scalar, not the placeholder:\n{md}"
3199        );
3200
3201        // A boolean scalar type coerces to "true" (mirrors scalar_string(Bool)).
            // File `b.md` is only projected, not rendered: just `type_` is checked.
3202        write_raw(
3203            &store,
3204            "records/contacts/b.md",
3205            "type: true\nupdated: 2026-05-02T00:00:00Z\nsummary: hi",
3206            "\nbody\n",
3207        );
3208        let rec_b = record_from_file(
3209            &store.root.join("records/contacts/b.md"),
3210            PathBuf::from("records/contacts/b.md"),
3211        )
3212        .unwrap();
3213        assert_eq!(rec_b.type_, "true");
3214    }
3215
3216    // ── regression: non-UTF-8 body must not abort the projection ──────────
3217
3218    /// HIGH regression: a content file with valid-UTF-8 frontmatter but a
3219    /// non-UTF-8 byte in the BODY (a verbatim Latin-1 `sources/` import) must
3220    /// still project to an IndexRecord — `record_from_file` reads frontmatter
3221    /// without requiring the whole file to be UTF-8, so a stray byte can't abort
3222    /// `rebuild_all` / write-through for the entire store.
3223    #[test]
3224    fn non_utf8_body_does_not_abort_record_projection() {
3225        let (_d, store) = mk_store();
3226        let rel = "sources/emails/2026/06/x.md";
3227        let abs = store.root.join(rel);
            // The file is assembled from raw bytes, so the shard directory has to be
            // created by hand first.
3228        fs::create_dir_all(abs.parent().unwrap()).unwrap();
3229        // Valid-UTF-8 frontmatter; a raw 0xE9 (Latin-1 'é') in the body.
3230        let mut bytes: Vec<u8> =
3231            b"---\ntype: email\nupdated: 2026-06-11T00:00:00Z\nsummary: An imported email\n---\n\nCaf"
3232                .to_vec();
            // A lone 0xE9 followed by an ASCII space is not a valid UTF-8 sequence.
3233        bytes.push(0xE9);
3234        bytes.extend_from_slice(b" meeting notes\n");
3235        fs::write(&abs, bytes).unwrap();
3236
            // Direct projection of the single file must succeed ...
3237        let rec = record_from_file(&abs, PathBuf::from(rel))
3238            .expect("non-UTF-8 body must not abort the frontmatter read");
3239        assert_eq!(rec.summary, "An imported email");
3240        assert_eq!(rec.type_, "email");
3241
3242        // The full sweep indexes the folder rather than aborting the whole store.
3243        Index::rebuild_all(&store).unwrap();
3244        assert!(
3245            exists(&store, "sources/emails/index.jsonl"),
3246            "rebuild must produce the catalog despite a non-UTF-8 body byte"
3247        );
3248        assert!(
3249            read(&store, "sources/emails/index.jsonl").contains("An imported email"),
3250            "the record must be catalogued"
3251        );
3252    }
3253
3254    /// HIGH regression: a single malformed-YAML file must abort the rebuild
3255    /// loudly (not be silently skipped) — skipping it would leave the store in a
3256    /// permanently invalid state (`INDEX_MISSING_ENTRY` / `INDEX_JSONL_DESYNC`
3257    /// that no rebuild clears, since the validator enumerates members by
3258    /// filename, not by parseability) and would desync the rollups. The abort is
3259    /// safe because `cleanup` preserves the prior canonical catalogs
3260    /// (`min_depth(2)`), so an aborted rebuild leaves the existing sidecars
3261    /// intact and surfaces a clear error naming the file to fix.
3262    #[test]
3263    fn rebuild_aborts_on_malformed_file_and_keeps_prior_catalogs() {
3264        let (_d, store) = mk_store();
            // Two well-formed files in two sibling type-folders of the same layer.
3265        write_doc(
3266            &store,
3267            "records/contacts/alice.md",
3268            "contact",
3269            Some("Alice"),
3270            Some("2026-05-01T00:00:00Z"),
3271            "",
3272        );
3273        write_doc(
3274            &store,
3275            "records/companies/acme.md",
3276            "company",
3277            Some("Acme"),
3278            Some("2026-05-02T00:00:00Z"),
3279            "",
3280        );
3281
3282        // A clean first rebuild establishes the canonical catalogs.
3283        Index::rebuild_all(&store).expect("clean rebuild succeeds");
3284        assert!(exists(&store, "records/contacts/index.jsonl"));
3285        assert!(exists(&store, "records/companies/index.jsonl"));
3286
3287        // Routine malformed file: unterminated quoted scalar.
            // Written with plain `fs::write`, so the broken frontmatter lands on
            // disk exactly as given.
3288        let bad = store.root.join("records/contacts/broken.md");
3289        fs::write(
3290            &bad,
3291            "---\ntype: contact\nsummary: \"unterminated\n---\nbody\n",
3292        )
3293        .unwrap();
3294
3295        // Must abort loudly — a silent skip leaves a file the validator requires
3296        // to be catalogued out of the index forever.
3297        Index::rebuild_all(&store)
3298            .expect_err("rebuild must abort, not silently skip, on a malformed file");
3299
3300        // The prior canonical catalogs survive the aborted rebuild: `cleanup`'s
3301        // `min_depth(2)` never deletes a type-folder's root-level sidecars, so a
3302        // mid-sweep abort leaves the existing indexes intact rather than wiped.
3303        assert!(
3304            exists(&store, "records/companies/index.jsonl"),
3305            "an aborted rebuild must not destroy a clean sibling folder's catalog"
3306        );
3307        assert!(
3308            exists(&store, "records/contacts/index.jsonl"),
3309            "an aborted rebuild must not destroy the affected folder's prior catalog"
3310        );
            // The surviving contacts catalog still lists the well-formed file.
3311        let contacts_jsonl = read(&store, "records/contacts/index.jsonl");
3312        assert!(contacts_jsonl.contains("records/contacts/alice.md"));
3313    }
3314
3315    /// HIGH regression (problem B): `rebuild_all`'s rollup `(N)` counts must
3316    /// equal the catalogued `index.jsonl` record counts — never a raw `.md` walk
3317    /// that disagrees with the sidecar. The over-corrected skip-with-diagnostic
3318    /// build excluded a malformed file from `index.jsonl` while `build_layer` /
3319    /// `build_root` kept counting it via `walk_type_folder_files`, so a folder
3320    /// would show `Contacts (2)` in the root/layer rollups while its `index.jsonl`
3321    /// held only 1 record — and a single subsequent write-through (which derives
3322    /// `(N)` from the jsonl) rewrote it to `Contacts (1)`, making `rebuild_all`
3323    /// and write-through emit different bytes for the same state. With the loud
3324    /// abort, the only successful-rebuild states are fully consistent: every
3325    /// rollup `(N)` equals the catalogued record count AND equals what a
3326    /// write-through over the same files produces.
        ///
        /// Note: this test itself writes only well-formed files. It pins the
        /// consistent state (jsonl lines == rollup `(N)` == write-through output);
        /// the malformed-file abort is exercised by
        /// `rebuild_aborts_on_malformed_file_and_keeps_prior_catalogs`.
3327    #[test]
3328    fn rebuild_rollup_counts_equal_jsonl_records_and_write_through() {
3329        let (_d, store) = mk_store();
3330        // Two well-formed contacts: the rollups must read (2), matching the two
3331        // jsonl records — this is the count the skip-version inflated to a phantom
3332        // extra when a malformed sibling was present-but-uncatalogued.
3333        write_doc(
3334            &store,
3335            "records/contacts/alice.md",
3336            "contact",
3337            Some("Alice"),
3338            Some("2026-05-01T00:00:00Z"),
3339            "",
3340        );
3341        write_doc(
3342            &store,
3343            "records/contacts/bob.md",
3344            "contact",
3345            Some("Bob"),
3346            Some("2026-05-02T00:00:00Z"),
3347            "",
3348        );
3349        Index::rebuild_all(&store).expect("clean rebuild succeeds");
3350
3351        // The catalogued record set (index.jsonl) and the rollup (N) must agree.
            // Blank lines are filtered out so only record lines are counted.
3352        let jsonl_lines = read(&store, "records/contacts/index.jsonl")
3353            .lines()
3354            .filter(|l| !l.trim().is_empty())
3355            .count();
3356        assert_eq!(jsonl_lines, 2, "two well-formed files ⇒ two jsonl records");
3357        let layer_md = read(&store, "records/index.md");
3358        let root_md = read(&store, "index.md");
3359        assert!(
3360            layer_md.contains("- [[records/contacts/index|Contacts]] (2)"),
3361            "layer rollup (N) must equal the jsonl record count (2), not a raw .md walk:\n{layer_md}"
3362        );
            // The root must show both the per-folder count and the layer total.
3363        assert!(
3364            root_md.contains("- [[records/contacts/index|Contacts]] (2)\n")
3365                && root_md.contains("## Records (2)"),
3366            "root rollup (N)/layer total must equal the jsonl record count (2):\n{root_md}"
3367        );
3368
3369        // The decisive write-through == rebuild_all byte-identity check on the
3370        // SAME end state: a single on_write must not rewrite the rollups to a
3371        // different (N). Under the skip-version, rebuild_all's rollup walked the
3372        // raw .md tree while on_write derived (N) from the jsonl, so the two
3373        // diverged; the loud abort keeps both deriving (N) from the catalogued
3374        // records, so the bytes match exactly.
            // A second, independent store receives the same two files but is
            // indexed only through `on_write`.
3375        let (_d2, wt) = mk_store();
3376        write_doc(
3377            &wt,
3378            "records/contacts/alice.md",
3379            "contact",
3380            Some("Alice"),
3381            Some("2026-05-01T00:00:00Z"),
3382            "",
3383        );
3384        write_doc(
3385            &wt,
3386            "records/contacts/bob.md",
3387            "contact",
3388            Some("Bob"),
3389            Some("2026-05-02T00:00:00Z"),
3390            "",
3391        );
3392        Index::on_write(&wt, Path::new("records/contacts/alice.md")).unwrap();
3393        Index::on_write(&wt, Path::new("records/contacts/bob.md")).unwrap();
3394
            // `a` = write-through store, `b` = rebuilt store. Key sets are compared
            // first so the `b[k]` lookups below cannot panic on a missing key.
3395        let a = snapshot_artifacts(&wt);
3396        let b = snapshot_artifacts(&store);
3397        assert_eq!(
3398            a.keys().collect::<BTreeSet<_>>(),
3399            b.keys().collect::<BTreeSet<_>>(),
3400            "write-through and rebuild_all must produce the same artifact set"
3401        );
3402        for (k, v) in &a {
3403            assert_eq!(
3404                v, &b[k],
3405                "rollup bytes diverged between write-through and rebuild_all for {k} \
3406                 (a skip-version inflates rebuild_all's (N) above the jsonl record \
3407                 count, which write-through then rewrites):\n--- write-through ---\n{v}\n--- rebuild ---\n{}",
3408                b[k]
3409            );
3410        }
3411    }
3412
3413    /// MEDIUM regression: a non-UTF-8 path component must be lossily decoded
3414    /// (kept, with U+FFFD), not silently dropped — so the index key points at the
3415    /// file, not its parent directory. Unix-only (ext4 allows the filename; APFS
3416    /// rejects it at the VFS layer).
        ///
        /// The path is built purely in memory and handed to `path_to_unix`; no file
        /// is created on disk by this test.
3417    #[cfg(unix)]
3418    #[test]
3419    fn non_utf8_path_component_is_kept_not_dropped() {
            // `OsStrExt::from_bytes` is what allows an `OsStr` to be built from
            // arbitrary bytes; it is a Unix-only extension trait.
3420        use std::ffi::OsStr;
3421        use std::os::unix::ffi::OsStrExt;
3422        // sources/emails/caf\xE9.md — the leaf has a non-UTF-8 byte.
3423        let mut leaf = b"caf".to_vec();
3424        leaf.push(0xE9);
3425        leaf.extend_from_slice(b".md");
3426        let p = Path::new("sources/emails").join(OsStr::from_bytes(&leaf));
3427        let unix = path_to_unix(&p);
3428        // The leaf is preserved (lossy), so the path is NOT collapsed to the
3429        // parent directory "sources/emails".
3430        assert_ne!(
3431            unix, "sources/emails",
3432            "non-UTF-8 leaf must not be dropped, collapsing the path to its parent dir"
3433        );
            // Only the valid ASCII prefix of the leaf is pinned; the exact
            // replacement for the 0xE9 byte is not asserted here.
3434        assert!(
3435            unix.starts_with("sources/emails/caf"),
3436            "the lossy leaf must remain under its folder: {unix}"
3437        );
3438    }
3439}