Skip to main content

dbmd_core/
index.rs

1//! `index` — the hierarchical content catalog.
2//!
3//! A uniform three-level tree: root + per-layer + per-type-folder. **Two
4//! artifacts per type-folder:** the human `index.md` (capped 500, recency
5//! browse) and the machine `index.jsonl` (complete, structured — one JSON
6//! object per file). Both read `summary` + key frontmatter fields + links
7//! directly from each file — there is no extraction logic here.
8//!
9//! **Maintained write-through** by the write commands ([`Index::on_write`] /
10//! [`Index::on_rename`] / [`Index::on_remove`] — the loop path, O(changed), no
11//! store walk); [`Index::rebuild_all`] is the from-scratch SWEEP repair.
12//!
13//! **Key invariant:** write-through must produce a byte-identical `index.md`
14//! and (post-compaction) `index.jsonl` to a full [`Index::rebuild_all`] over
15//! the same end state — the loop path can never drift from the repair path.
16//!
17//! # Implementation notes (deviations the reader should know)
18//!
19//! - **Self-contained, by design.** This module does its own shard-aware folder
20//!   walk, its own minimal frontmatter read, and its own atomic write, using
21//!   only `store.root` (a public field) and the `serde_norway` / `serde_json` /
22//!   `chrono` / `walkdir` crates rather than routing through the sibling
23//!   `store`/`parser` helpers ([`Store::walk_type_folder`],
24//!   [`Store::recent_in_type_folder`], [`parser::read_file`], …). The index has
25//!   to stamp a *deterministic* `updated:` and emit a *canonical, compacted*
26//!   `index.jsonl` (see the two notes below); keeping the read/walk/write local
27//!   is what makes the byte-identity invariant a true byte comparison, free of
28//!   any incidental formatting the shared readers might introduce. The public
29//!   signatures in `lib.rs` are untouched.
30//! - **Deterministic `updated:` on the index files themselves.** An index's own
31//!   `updated` frontmatter is derived as the max `updated` over the files it
32//!   catalogs (max over children for root/layer) — NOT wall-clock-now. This is
33//!   what makes the byte-identity invariant a *true* byte comparison: a
34//!   write-through write and a `rebuild_all` over the same end state stamp the
35//!   same value. (The SPEC's rendered examples show a wall-clock-looking value;
36//!   the conventions list only requires `updated: <RFC3339>`, and the
37//!   property-tested invariant dominates.)
38//! - **`index.jsonl` is always compacted.** Write-through rewrites the affected
39//!   type-folder's jsonl in canonical form (one current line per path, recency
40//!   order) rather than appending superseded/tombstone lines, so the jsonl is
41//!   byte-identical to `rebuild_all` *immediately* (a strictly stronger
42//!   guarantee than the SPEC's "post-compaction"). This keeps the loop cost at
43//!   one sidecar read + one rewrite per touched type-folder — O(folder), the
44//!   sanctioned loop primitive, never a whole-`Store::walk`.
45//! - **Root/layer entry styling** follows plan §index (`(N)` numeric counts;
46//!   layer headings in the root carry the layer's total count) which is more
47//!   specific than the SPEC's illustrative `(42 files)` prose example. Type
48//!   folders are listed alphabetically (a deterministic order a derived artifact
49//!   needs); `scope: type-folder` follows the conventions list, not the one
50//!   SPEC example that wrote `scope: folder`.
51
52use std::collections::BTreeMap;
53use std::fs;
54use std::io::Write as _;
55use std::path::{Path, PathBuf};
56
57use chrono::{DateTime, FixedOffset, SecondsFormat};
58use serde::{Deserialize, Serialize};
59use serde_json::Value;
60
61use crate::store::{Layer, Store};
62
/// Maximum number of entries rendered into a type-folder `index.md` (the
/// browse view); the full set always lives in `index.jsonl`.
const MD_CAP: usize = 500;

/// Stand-in summary used when a content file carries no `summary`
/// frontmatter. The index never fabricates a real summary — that is
/// `dbmd fm init`'s job; `dbmd validate` keys off this marker
/// (`INDEX`-class issue).
const MISSING_SUMMARY: &str = "(no summary)";

/// H1 heading of the root `index.md`.
const ROOT_TITLE: &str = "Knowledge base index";
73
74/// Which level of the catalog an [`Index`] represents.
75#[derive(Debug, Clone, PartialEq, Eq)]
76pub enum IndexLevel {
77    /// The store-wide root `index.md` (layers + per-type counts).
78    Root,
79    /// A layer `index.md` (every type-folder under one layer).
80    Layer(Layer),
81    /// A type-folder `index.md` + `index.jsonl` (every file in the folder).
82    TypeFolder(PathBuf),
83}
84
85/// One record in a type-folder's `index.jsonl` — the complete, structured twin
86/// of a single `index.md` browse entry.
87///
88/// `tags` are the document's flat labels; `links` are its concept/relationship
89/// wiki-link targets. Both are copied verbatim from the file — never inferred.
90/// `fields` holds the remaining type-specific frontmatter so the structured
91/// query path can filter on any key without opening the file.
92#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
93pub struct IndexRecord {
94    /// Store-relative path of the file (the upsert key; last-write-wins).
95    /// Serialized with forward slashes regardless of OS (see [`path_serde`]) so
96    /// the `index.jsonl` catalog is byte-portable across platforms.
97    #[serde(with = "path_serde")]
98    pub path: PathBuf,
99    /// The file's `type`.
100    #[serde(rename = "type")]
101    pub type_: String,
102    /// The file's `summary`.
103    pub summary: String,
104    /// The file's flat `tags`.
105    #[serde(default)]
106    pub tags: Vec<String>,
107    /// The file's concept/relationship wiki-link targets (store-relative).
108    #[serde(default)]
109    pub links: Vec<String>,
110    /// `created` timestamp.
111    pub created: Option<DateTime<FixedOffset>>,
112    /// `updated` timestamp (the recency key for the `index.md` cap order).
113    pub updated: Option<DateTime<FixedOffset>>,
114    /// Remaining type-specific frontmatter fields, verbatim.
115    #[serde(flatten)]
116    pub fields: BTreeMap<String, Value>,
117}
118
119/// A built (or being-built) catalog for one [`IndexLevel`], with both rendered
120/// artifacts available. Pure data until written via [`Index::write_level`].
121#[derive(Debug, Clone, PartialEq)]
122pub struct Index {
123    /// Which level this catalog is for.
124    pub level: IndexLevel,
125    /// The complete record set for this level (type-folder level; empty for
126    /// root/layer rollups, which carry only counts).
127    pub records: Vec<IndexRecord>,
128    /// Per-child counts for root/layer rollups (child path → file count).
129    pub child_counts: BTreeMap<PathBuf, usize>,
130}
131
132impl Index {
133    /// Build a type-folder catalog by aggregating across date-shards, producing
134    /// both artifacts. `index.md` selection is recency (updated desc, ties by
135    /// path asc; cap 500 with a `## More` footer over the cap); `index.jsonl`
136    /// holds every file. A file missing `summary` gets a placeholder + a
137    /// validate-detectable issue (the index never invents summaries).
138    pub fn build_type_folder(store: &Store, type_folder: &Path) -> crate::Result<Index> {
139        let rel = normalize_rel(type_folder);
140        let abs = store.root.join(&rel);
141        let mut records = Vec::new();
142        for file_abs in walk_type_folder_files(&abs) {
143            let rel_path =
144                rel_to_store(&store.root, &file_abs).expect("walked file is under the store root");
145            // Abort the build on a malformed file rather than skip it. A skipped
146            // file would still be a content member the validator requires to be
147            // catalogued (`validate::walk_content_files` enumerates by filename,
148            // not by parseability), so silently dropping it would leave the store
149            // in a permanently invalid state (`INDEX_MISSING_ENTRY` /
150            // `INDEX_JSONL_DESYNC` that no rebuild can clear) and would desync the
151            // rollups (`build_layer`/`build_root` count the raw `.md` files). The
152            // loud `?` is the right outcome: `cleanup` now preserves the prior
153            // canonical sidecars (`min_depth(2)`), so an aborted rebuild leaves
154            // the existing catalogs intact and the operator a clear error naming
155            // the file to fix — never a destroyed or silently-wrong index.
156            records.push(record_from_file(&file_abs, rel_path)?);
157        }
158        sort_records(&mut records);
159        Ok(Index {
160            level: IndexLevel::TypeFolder(rel),
161            records,
162            child_counts: BTreeMap::new(),
163        })
164    }
165
166    /// Build a layer catalog: every non-empty type-folder under the layer with
167    /// `(N)` counts and a newest-file `summary` preview (≤ 80 chars).
168    pub fn build_layer(store: &Store, layer: Layer) -> crate::Result<Index> {
169        let mut child_counts = BTreeMap::new();
170        for tf in type_folders_in_layer(store, layer) {
171            let abs = store.root.join(&tf);
172            let n = walk_type_folder_files(&abs).len();
173            if n > 0 {
174                child_counts.insert(tf, n);
175            }
176        }
177        Ok(Index {
178            level: IndexLevel::Layer(layer),
179            records: Vec::new(),
180            child_counts,
181        })
182    }
183
184    /// Build the store-wide root catalog: one heading per non-empty layer with
185    /// total count + bulleted per-type sub-entries with `(N)` counts.
186    pub fn build_root(store: &Store) -> crate::Result<Index> {
187        let mut child_counts = BTreeMap::new();
188        for layer in Layer::all() {
189            for tf in type_folders_in_layer(store, layer) {
190                let abs = store.root.join(&tf);
191                let n = walk_type_folder_files(&abs).len();
192                if n > 0 {
193                    child_counts.insert(tf, n);
194                }
195            }
196        }
197        Ok(Index {
198            level: IndexLevel::Root,
199            records: Vec::new(),
200            child_counts,
201        })
202    }
203
204    /// Render this catalog as a canonical `index.md`.
205    pub fn to_markdown(&self) -> String {
206        match &self.level {
207            IndexLevel::TypeFolder(folder) => self.render_type_folder_md(folder),
208            IndexLevel::Layer(layer) => self.render_layer_md(*layer),
209            IndexLevel::Root => self.render_root_md(),
210        }
211    }
212
213    /// Render this type-folder catalog as the complete `index.jsonl` (one JSON
214    /// object per file, stable key order so diffs stay minimal). Type-folder
215    /// level only — root and layer stay markdown rollups.
216    pub fn to_jsonl(&self) -> String {
217        let mut out = String::new();
218        for rec in &self.records {
219            // The record type derives a deterministic, sorted key order
220            // (declared fields first, then the flattened `fields` BTreeMap).
221            let line = serde_json::to_string(rec).expect("IndexRecord serializes");
222            out.push_str(&line);
223            out.push('\n');
224        }
225        out
226    }
227
228    // ── rendering helpers ────────────────────────────────────────────────
229
230    fn render_type_folder_md(&self, folder: &Path) -> String {
231        let folder_disp = path_to_unix(folder);
232        let updated = max_updated(self.records.iter().map(|r| r.updated.as_ref()));
233        let mut s = String::new();
234        s.push_str("---\n");
235        s.push_str("type: index\n");
236        s.push_str("scope: type-folder\n");
237        s.push_str(&format!("folder: {folder_disp}\n"));
238        if let Some(ts) = updated {
239            s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
240        }
241        s.push_str("---\n\n");
242        s.push_str(&format!("# {folder_disp}\n\n"));
243
244        let shown = self.records.len().min(MD_CAP);
245        for rec in self.records.iter().take(shown) {
246            s.push_str(&format_md_entry(rec));
247            s.push('\n');
248        }
249
250        if self.records.len() > MD_CAP {
251            let type_ = self.records.first().map(|r| r.type_.as_str()).unwrap_or("");
252            let layer = folder
253                .components()
254                .next()
255                .and_then(|c| c.as_os_str().to_str())
256                .unwrap_or("");
257            s.push('\n');
258            s.push_str(&more_footer(self.records.len(), type_, layer));
259        }
260        s
261    }
262
263    /// Store-less layer rollup: counts only, no preview / no derived `updated`
264    /// (a layer index needs each child's on-disk jsonl for those — see
265    /// [`render_layer_md_with_store`], the canonical path every disk write
266    /// uses). This pure-data render is structurally identical sans preview.
267    fn render_layer_md(&self, layer: Layer) -> String {
268        let layer_dir = layer_dir_name(layer);
269        let mut s = String::new();
270        s.push_str("---\n");
271        s.push_str("type: index\n");
272        s.push_str("scope: layer\n");
273        s.push_str(&format!("folder: {layer_dir}\n"));
274        s.push_str("---\n\n");
275        s.push_str(&format!("# {layer_dir}\n\n"));
276        for (tf, n) in &self.child_counts {
277            let tf_unix = path_to_unix(tf);
278            let display = capitalize(folder_basename(tf));
279            s.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n})\n"));
280        }
281        s
282    }
283
284    /// Store-less root rollup: counts only (the canonical disk render adds a
285    /// derived `updated` — see [`render_root_md_with_store`]).
286    fn render_root_md(&self) -> String {
287        let mut s = String::new();
288        s.push_str("---\n");
289        s.push_str("type: index\n");
290        s.push_str("scope: root\n");
291        s.push_str("---\n\n");
292        s.push_str(&format!("# {ROOT_TITLE}\n"));
293        for layer in Layer::all() {
294            let layer_dir = layer_dir_name(layer);
295            let prefix = format!("{layer_dir}/");
296            let children: Vec<(&PathBuf, &usize)> = self
297                .child_counts
298                .iter()
299                .filter(|(tf, _)| path_to_unix(tf).starts_with(&prefix))
300                .collect();
301            if children.is_empty() {
302                continue;
303            }
304            let total: usize = children.iter().map(|(_, n)| **n).sum();
305            s.push('\n');
306            s.push_str(&format!("## {} ({total})\n", capitalize(layer_dir)));
307            for (tf, n) in children {
308                let tf_unix = path_to_unix(tf);
309                let display = capitalize(folder_basename(tf));
310                s.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n})\n"));
311            }
312        }
313        s
314    }
315}
316
317// ─────────────────────────────────────────────────────────────────────────
// Write-through + sweep (associated functions in a second `impl Index` block).
319// ─────────────────────────────────────────────────────────────────────────
320
321impl Index {
322    /// **Write-through (loop, O(changed)).** Upsert a new/updated content file.
323    /// Reads the affected type-folder's `index.jsonl` (the sanctioned per-folder
324    /// sidecar read — never a whole-store walk), applies the change, and
325    /// atomically rewrites that folder's `index.md` + `index.jsonl` plus the
326    /// parent layer + root rollups so the artifacts equal a `rebuild_all` over
327    /// the same end state.
328    pub fn on_write(store: &Store, file: &Path) -> crate::Result<()> {
329        let file_rel = normalize_rel(file);
330        // The generated catalog files are not content — never upsert one into
331        // itself. `build_type_folder`'s walk already excludes `index.md`
332        // (`walk_type_folder_files`); the loop path must apply the same
333        // exclusion or editing `index.md` via `fm set` inserts a phantom
334        // self-row, inflating every `(N)` count and breaking the
335        // write-through == rebuild byte-identity invariant.
336        if is_index_artifact(&file_rel) {
337            return Ok(());
338        }
339        let file_abs = store.root.join(&file_rel);
340        let folder = type_folder_of(&file_rel)
341            .ok_or_else(|| bad_index(&file_rel, "file is not inside a layer/type-folder"))?;
342        let record = record_from_file(&file_abs, file_rel.clone())?;
343
344        // Serialize the sidecar read-modify-write so concurrent sanctioned
345        // writes to this folder don't clobber each other's rows (lost update).
346        let _lock = FolderLock::acquire(&store.root.join(&folder));
347        let mut records = read_jsonl_records(&store.root.join(&folder).join("index.jsonl"))?;
348        records.retain(|r| r.path != record.path);
349        records.push(record);
350        sort_records(&mut records);
351
352        write_type_folder_artifacts(store, &folder, &records)?;
353        update_parents(store, &folder)?;
354        Ok(())
355    }
356
357    /// **Write-through (loop, O(changed)).** Move a file's entry between
358    /// type-folder indexes (or within, if the same folder) in both `index.md`
359    /// and `index.jsonl`, fixing counts on both sides.
360    pub fn on_rename(store: &Store, old: &Path, new: &Path) -> crate::Result<()> {
361        let old_rel = normalize_rel(old);
362        let new_rel = normalize_rel(new);
363        // Index artifacts are generated, not catalogued — a rename of/into one
364        // is not a content move (same reasoning as `on_write`). Skip rather than
365        // insert a phantom self-row.
366        if is_index_artifact(&old_rel) || is_index_artifact(&new_rel) {
367            return Ok(());
368        }
369        let old_folder = type_folder_of(&old_rel)
370            .ok_or_else(|| bad_index(&old_rel, "source is not inside a layer/type-folder"))?;
371        let new_folder = type_folder_of(&new_rel)
372            .ok_or_else(|| bad_index(&new_rel, "target is not inside a layer/type-folder"))?;
373
374        // Serialize the sidecar read-modify-write(s). For a cross-folder rename,
375        // lock BOTH folders, always in sorted order, so two renames touching the
376        // same pair can't deadlock. Held for the whole operation via RAII.
377        let _locks = lock_folders(store, &old_folder, &new_folder);
378
379        // Drop from the old folder.
380        let mut old_records =
381            read_jsonl_records(&store.root.join(&old_folder).join("index.jsonl"))?;
382        old_records.retain(|r| r.path != old_rel);
383
384        if old_folder == new_folder {
385            // Same folder: re-read the (now-renamed) file and upsert.
386            let record = record_from_file(&store.root.join(&new_rel), new_rel.clone())?;
387            old_records.retain(|r| r.path != record.path);
388            old_records.push(record);
389            sort_records(&mut old_records);
390            write_type_folder_artifacts(store, &old_folder, &old_records)?;
391            update_parents(store, &old_folder)?;
392            return Ok(());
393        }
394
395        // Cross-folder: write the trimmed old folder (or drop its indexes if
396        // now empty), then upsert into the new folder.
397        sort_records(&mut old_records);
398        write_type_folder_artifacts(store, &old_folder, &old_records)?;
399
400        let record = record_from_file(&store.root.join(&new_rel), new_rel.clone())?;
401        let mut new_records =
402            read_jsonl_records(&store.root.join(&new_folder).join("index.jsonl"))?;
403        new_records.retain(|r| r.path != record.path);
404        new_records.push(record);
405        sort_records(&mut new_records);
406        write_type_folder_artifacts(store, &new_folder, &new_records)?;
407
408        update_parents(store, &old_folder)?;
409        update_parents(store, &new_folder)?;
410        Ok(())
411    }
412
413    /// **Write-through (loop, O(changed)).** Drop a file's entry from both
414    /// `index.md` and `index.jsonl`; decrement counts; if the browse view drops
415    /// below the cap, the next-most-recent is already present in the complete
416    /// jsonl record set and re-renders into the md automatically.
417    pub fn on_remove(store: &Store, file: &Path) -> crate::Result<()> {
418        let file_rel = normalize_rel(file);
419        // Removing a generated catalog artifact is not a content removal; it has
420        // no row to drop (it was never catalogued). Skip, mirroring `on_write`.
421        if is_index_artifact(&file_rel) {
422            return Ok(());
423        }
424        let folder = type_folder_of(&file_rel)
425            .ok_or_else(|| bad_index(&file_rel, "file is not inside a layer/type-folder"))?;
426        // Serialize the sidecar read-modify-write (see `on_write`).
427        let _lock = FolderLock::acquire(&store.root.join(&folder));
428        let mut records = read_jsonl_records(&store.root.join(&folder).join("index.jsonl"))?;
429        let before = records.len();
430        records.retain(|r| r.path != file_rel);
431        if records.len() == before {
432            // Nothing to remove; still normalize the folder + parents so the
433            // artifacts stay canonical.
434        }
435        sort_records(&mut records);
436        write_type_folder_artifacts(store, &folder, &records)?;
437        update_parents(store, &folder)?;
438        Ok(())
439    }
440
441    /// **SWEEP repair.** Walk the store once and atomically (re)write root +
442    /// every non-empty layer + every non-empty type-folder `index.md` and
443    /// `index.jsonl` (compacting the jsonl). Also runs [`Index::cleanup`].
444    pub fn rebuild_all(store: &Store) -> crate::Result<()> {
445        Index::cleanup(store)?;
446        for layer in Layer::all() {
447            for tf in type_folders_in_layer(store, layer) {
448                let idx = Index::build_type_folder(store, &tf)?;
449                if idx.records.is_empty() {
450                    continue;
451                }
452                write_type_folder_artifacts(store, &tf, &idx.records)?;
453            }
454            let layer_idx = Index::build_layer(store, layer)?;
455            let layer_index_md = store.root.join(layer_dir_name(layer)).join("index.md");
456            if layer_idx.child_counts.is_empty() {
457                remove_if_exists(&layer_index_md)?;
458            } else {
459                write_atomic(
460                    &layer_index_md,
461                    render_layer_md_with_store(store, &layer_idx),
462                )?;
463            }
464        }
465        let root_idx = Index::build_root(store)?;
466        let root_index_md = store.root.join("index.md");
467        if root_idx.child_counts.is_empty() {
468            remove_if_exists(&root_index_md)?;
469        } else {
470            write_atomic(&root_index_md, render_root_md_with_store(store, &root_idx))?;
471        }
472        Ok(())
473    }
474
475    /// Rebuild ONE type-folder's `index.md`/`index.jsonl` from a fresh walk, then
476    /// cascade the new child count up to the layer and root rollups — so a
477    /// scoped `dbmd index rebuild --folder` leaves the hierarchy consistent,
478    /// exactly like `rebuild_all` and the loop-path `on_write` already do.
479    /// (Writing only the folder, as the CLI used to, left stale layer/root
480    /// counts that `validate` would then flag as an index desync.)
481    pub fn rebuild_folder(store: &Store, folder: &Path) -> crate::Result<()> {
482        Self::write_level(store, &IndexLevel::TypeFolder(folder.to_path_buf()))?;
483        update_parents(store, folder)
484    }
485
486    /// Atomically write a single level's artifact(s) to disk.
487    pub fn write_level(store: &Store, level: &IndexLevel) -> crate::Result<()> {
488        match level {
489            IndexLevel::TypeFolder(folder) => {
490                let idx = Index::build_type_folder(store, folder)?;
491                if idx.records.is_empty() {
492                    remove_if_exists(&store.root.join(folder).join("index.md"))?;
493                    remove_if_exists(&store.root.join(folder).join("index.jsonl"))?;
494                } else {
495                    write_type_folder_artifacts(store, folder, &idx.records)?;
496                }
497            }
498            IndexLevel::Layer(layer) => {
499                let idx = Index::build_layer(store, *layer)?;
500                let p = store.root.join(layer_dir_name(*layer)).join("index.md");
501                if idx.child_counts.is_empty() {
502                    remove_if_exists(&p)?;
503                } else {
504                    write_atomic(&p, render_layer_md_with_store(store, &idx))?;
505                }
506            }
507            IndexLevel::Root => {
508                let idx = Index::build_root(store)?;
509                let p = store.root.join("index.md");
510                if idx.child_counts.is_empty() {
511                    remove_if_exists(&p)?;
512                } else {
513                    write_atomic(&p, render_root_md_with_store(store, &idx))?;
514                }
515            }
516        }
517        Ok(())
518    }
519
520    /// Render the generated indexes to a string with `--- <path> ---`
521    /// separators instead of writing them (`--dry-run`).
522    pub fn render_dry_run(store: &Store, level: &IndexLevel) -> crate::Result<String> {
523        let mut out = String::new();
524        match level {
525            IndexLevel::TypeFolder(folder) => {
526                let idx = Index::build_type_folder(store, folder)?;
527                let md_path = path_to_unix(&folder.join("index.md"));
528                let jsonl_path = path_to_unix(&folder.join("index.jsonl"));
529                out.push_str(&format!("--- {md_path} ---\n"));
530                out.push_str(&idx.to_markdown());
531                out.push_str(&format!("--- {jsonl_path} ---\n"));
532                out.push_str(&idx.to_jsonl());
533            }
534            IndexLevel::Layer(layer) => {
535                let idx = Index::build_layer(store, *layer)?;
536                let md_path = format!("{}/index.md", layer_dir_name(*layer));
537                out.push_str(&format!("--- {md_path} ---\n"));
538                out.push_str(&render_layer_md_with_store(store, &idx));
539            }
540            IndexLevel::Root => {
541                let idx = Index::build_root(store)?;
542                out.push_str("--- index.md ---\n");
543                out.push_str(&render_root_md_with_store(store, &idx));
544            }
545        }
546        Ok(out)
547    }
548
549    /// Cleanup pass (part of [`Index::rebuild_all`]): delete `index.md` /
550    /// `index.jsonl` in non-canonical folders (date-shards that should carry
551    /// none). Symmetric with index creation.
552    ///
553    /// **Only deletes generated catalog artifacts, never user content.** Two
554    /// guards keep this from eating data:
555    /// - `min_depth(2)` so the walk starts *below* the type-folder root — the
556    ///   canonical `<type-folder>/index.md` + `index.jsonl` are never targeted
557    ///   here (they are rewritten by the per-folder builders, or removed only
558    ///   when the folder is genuinely empty, in the dedicated branch below). The
559    ///   old `min_depth(1)` deleted them up front, so a rebuild aborted by one
560    ///   malformed file left every type-folder catalog destroyed.
561    /// - [`is_deletable_catalog_artifact`] confirms a shard-level `index.md` is
562    ///   an actual generated catalog (or stale/garbage leftover), NOT a content
563    ///   file a user wrote at that name (e.g. `dbmd write …/index.md --type
564    ///   email`, plausible when mirroring a website/doc export). Matching by
565    ///   filename alone silently deleted such records on the next rebuild.
566    pub fn cleanup(store: &Store) -> crate::Result<()> {
567        for layer in Layer::all() {
568            let layer_dir = store.root.join(layer_dir_name(layer));
569            if !layer_dir.is_dir() {
570                continue;
571            }
572            for tf in type_folders_in_layer(store, layer) {
573                let tf_abs = store.root.join(&tf);
574                // Any generated index inside a shard (below the type-folder
575                // root) is non-canonical: delete it. Never touch a user content
576                // file that merely happens to be named index.md.
577                for entry in walkdir::WalkDir::new(&tf_abs)
578                    .min_depth(2)
579                    .into_iter()
580                    .filter_map(|e| e.ok())
581                {
582                    let p = entry.path();
583                    if is_index_artifact(p) && is_deletable_catalog_artifact(p) {
584                        remove_if_exists(p)?;
585                    }
586                }
587                // Empty type-folder → no index at its root either. Same content
588                // guard: an `index.md` here that is actually a user record (the
589                // only file in the folder) is preserved, not deleted.
590                if walk_type_folder_files(&tf_abs).is_empty() {
591                    let md = tf_abs.join("index.md");
592                    if is_deletable_catalog_artifact(&md) {
593                        remove_if_exists(&md)?;
594                    }
595                    remove_if_exists(&tf_abs.join("index.jsonl"))?;
596                }
597            }
598        }
599        Ok(())
600    }
601}
602
603// ─────────────────────────────────────────────────────────────────────────
604// Private free helpers — all self-contained, none call back into Store/parser.
605// ─────────────────────────────────────────────────────────────────────────
606
607/// Write both artifacts for a type-folder, or delete them if the folder is now
608/// empty. The single funnel both write-through and rebuild go through, so their
609/// output is byte-identical by construction.
610fn write_type_folder_artifacts(
611    store: &Store,
612    folder: &Path,
613    records: &[IndexRecord],
614) -> crate::Result<()> {
615    let folder_abs = store.root.join(folder);
616    let md_path = folder_abs.join("index.md");
617    let jsonl_path = folder_abs.join("index.jsonl");
618    if records.is_empty() {
619        remove_if_exists(&md_path)?;
620        remove_if_exists(&jsonl_path)?;
621        return Ok(());
622    }
623    let idx = Index {
624        level: IndexLevel::TypeFolder(folder.to_path_buf()),
625        records: records.to_vec(),
626        child_counts: BTreeMap::new(),
627    };
628    write_atomic(&md_path, idx.to_markdown())?;
629    write_atomic(&jsonl_path, idx.to_jsonl())?;
630    Ok(())
631}
632
633/// Re-render the layer + root rollups that sit above `folder` — the
634/// **loop path**, O(changed). Counts + previews come from the type-folders'
635/// on-disk `index.jsonl` sidecars ([`collect_child_stats`]), NOT from a
636/// content-tree walk: a single write reads one sidecar per type-folder (shared
637/// across the layer and root rollups) — never the millions of files under the
638/// shards. `build_layer` / `build_root` (which *do* walk the content tree) are
639/// reserved for the from-scratch sweeps ([`Index::rebuild_all`],
640/// [`Index::write_level`], [`Index::render_dry_run`]). The result is
641/// byte-identical to those builders because in the loop — exactly as in
642/// `rebuild_all` — every touched folder's jsonl is rewritten before its parents
643/// are rolled up, so the per-folder stat (`count` / `newest`) equals what a
644/// from-scratch walk would compute.
645fn update_parents(store: &Store, folder: &Path) -> crate::Result<()> {
646    // Read every relevant type-folder's sidecar EXACTLY ONCE into a stat cache
647    // (`count` + `newest` record), then render both rollups from the cache. The
648    // old path read each sidecar 2–3× per write — `child_counts_from_jsonl`
649    // (full parse just for a count) plus `render_layer_md_with_store` and
650    // `render_root_md_with_store` (each a full `read_jsonl_records` parse + sort
651    // just to take `.first()`). Because `index.jsonl` sidecars are uncapped and
652    // append-mostly, a single high-volume folder (months of ingested emails) made
653    // an UNRELATED tiny write reparse a multi-MB sidecar several times, turning
654    // the loop op into O(total store records) and violating the crate's O(changed)
655    // invariant (lib.rs). One streaming pass per sidecar, shared across both
656    // rollups, restores O(changed)-per-sidecar cost (and keeps the output
657    // byte-identical: `count` == `read_jsonl_records().len()` and `newest` is the
658    // same record `.first()` would yield).
659    let stats = collect_child_stats(store, &Layer::all())?;
660
661    let layer = folder
662        .components()
663        .next()
664        .and_then(|c| c.as_os_str().to_str())
665        .and_then(layer_from_dir_name);
666    if let Some(layer) = layer {
667        let p = store.root.join(layer_dir_name(layer)).join("index.md");
668        if layer_has_children(&stats, layer) {
669            write_atomic(&p, render_layer_md_from_stats(layer, &stats))?;
670        } else {
671            remove_if_exists(&p)?;
672        }
673    }
674    let rp = store.root.join("index.md");
675    if stats.values().any(|s| s.count > 0) {
676        write_atomic(&rp, render_root_md_from_stats(&stats))?;
677    } else {
678        remove_if_exists(&rp)?;
679    }
680    Ok(())
681}
682
683/// True if `layer` has at least one non-empty child type-folder in `stats`.
684fn layer_has_children(stats: &BTreeMap<PathBuf, FolderStat>, layer: Layer) -> bool {
685    let prefix = format!("{}/", layer_dir_name(layer));
686    stats
687        .iter()
688        .any(|(tf, s)| s.count > 0 && path_to_unix(tf).starts_with(&prefix))
689}
690
691/// Render a layer `index.md` from the prebuilt per-folder stat cache — each
692/// child's count + newest summary/updated come from its single cached sidecar
693/// read, so the rollup matches the folder artifacts exactly (write-through and
694/// rebuild alike) without re-reading any sidecar.
695fn render_layer_md_from_stats(layer: Layer, stats: &BTreeMap<PathBuf, FolderStat>) -> String {
696    let layer_dir = layer_dir_name(layer);
697    let prefix = format!("{layer_dir}/");
698    let mut max_upd: Option<DateTime<FixedOffset>> = None;
699    let mut entries = String::new();
700    for (tf, stat) in stats {
701        if stat.count == 0 || !path_to_unix(tf).starts_with(&prefix) {
702            continue;
703        }
704        let newest = stat.newest.as_ref();
705        if let Some(u) = newest.and_then(|r| r.updated) {
706            max_upd = Some(match max_upd {
707                Some(cur) if cur >= u => cur,
708                _ => u,
709            });
710        }
711        let tf_unix = path_to_unix(tf);
712        let display = capitalize(folder_basename(tf));
713        let preview = newest
714            .map(|r| truncate(&r.summary, 80))
715            .filter(|p| !p.is_empty() && p != MISSING_SUMMARY);
716        match preview {
717            Some(p) => entries.push_str(&format!(
718                "- [[{tf_unix}/index|{display}]] ({}) — {p}\n",
719                stat.count
720            )),
721            None => entries.push_str(&format!(
722                "- [[{tf_unix}/index|{display}]] ({})\n",
723                stat.count
724            )),
725        }
726    }
727    let mut s = String::new();
728    s.push_str("---\n");
729    s.push_str("type: index\n");
730    s.push_str("scope: layer\n");
731    s.push_str(&format!("folder: {layer_dir}\n"));
732    if let Some(ts) = max_upd {
733        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
734    }
735    s.push_str("---\n\n");
736    s.push_str(&format!("# {layer_dir}\n\n"));
737    s.push_str(&entries);
738    s
739}
740
741/// Render the root `index.md` from the prebuilt per-folder stat cache.
742fn render_root_md_from_stats(stats: &BTreeMap<PathBuf, FolderStat>) -> String {
743    let mut max_upd: Option<DateTime<FixedOffset>> = None;
744    for stat in stats.values() {
745        if stat.count == 0 {
746            continue;
747        }
748        if let Some(u) = stat.newest.as_ref().and_then(|r| r.updated) {
749            max_upd = Some(match max_upd {
750                Some(cur) if cur >= u => cur,
751                _ => u,
752            });
753        }
754    }
755    let mut s = String::new();
756    s.push_str("---\n");
757    s.push_str("type: index\n");
758    s.push_str("scope: root\n");
759    if let Some(ts) = max_upd {
760        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
761    }
762    s.push_str("---\n\n");
763    s.push_str(&format!("# {ROOT_TITLE}\n"));
764    for layer in Layer::all() {
765        let layer_dir = layer_dir_name(layer);
766        let prefix = format!("{layer_dir}/");
767        let children: Vec<(&PathBuf, usize)> = stats
768            .iter()
769            .filter(|(tf, s)| s.count > 0 && path_to_unix(tf).starts_with(&prefix))
770            .map(|(tf, s)| (tf, s.count))
771            .collect();
772        if children.is_empty() {
773            continue;
774        }
775        let total: usize = children.iter().map(|(_, n)| *n).sum();
776        s.push('\n');
777        s.push_str(&format!("## {} ({total})\n", capitalize(layer_dir)));
778        for (tf, n) in children {
779            let tf_unix = path_to_unix(tf);
780            let display = capitalize(folder_basename(tf));
781            s.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n})\n"));
782        }
783    }
784    s
785}
786
787/// Render a layer `index.md`, reading each child's newest summary + max-updated
788/// straight from its on-disk `index.jsonl` (so the rollup matches the folder
789/// artifacts exactly, write-through and rebuild alike). The **sweep-path**
790/// renderer used by [`Index::rebuild_all`] / [`Index::write_level`] /
791/// [`Index::render_dry_run`]; the loop path uses the cache-based
792/// [`render_layer_md_from_stats`] to avoid re-reading sidecars.
793fn render_layer_md_with_store(store: &Store, idx: &Index) -> String {
794    let layer = match idx.level {
795        IndexLevel::Layer(l) => l,
796        _ => unreachable!("render_layer_md_with_store called on non-layer"),
797    };
798    let layer_dir = layer_dir_name(layer);
799    let mut max_upd: Option<DateTime<FixedOffset>> = None;
800    let mut entries = String::new();
801    for (tf, n) in &idx.child_counts {
802        let recs = read_jsonl_records(&store.root.join(tf).join("index.jsonl")).unwrap_or_default();
803        let newest = recs.first();
804        if let Some(u) = newest.and_then(|r| r.updated) {
805            max_upd = Some(match max_upd {
806                Some(cur) if cur >= u => cur,
807                _ => u,
808            });
809        }
810        let tf_unix = path_to_unix(tf);
811        let display = capitalize(folder_basename(tf));
812        let preview = newest
813            .map(|r| truncate(&r.summary, 80))
814            .filter(|p| !p.is_empty() && p != MISSING_SUMMARY);
815        match preview {
816            Some(p) => entries.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n}) — {p}\n")),
817            None => entries.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n})\n")),
818        }
819    }
820    let mut s = String::new();
821    s.push_str("---\n");
822    s.push_str("type: index\n");
823    s.push_str("scope: layer\n");
824    s.push_str(&format!("folder: {layer_dir}\n"));
825    if let Some(ts) = max_upd {
826        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
827    }
828    s.push_str("---\n\n");
829    s.push_str(&format!("# {layer_dir}\n\n"));
830    s.push_str(&entries);
831    s
832}
833
834/// Render the root `index.md`, taking each child's max-updated from its on-disk
835/// `index.jsonl`. The **sweep-path** renderer (the loop path uses
836/// [`render_root_md_from_stats`]).
837fn render_root_md_with_store(store: &Store, idx: &Index) -> String {
838    let mut max_upd: Option<DateTime<FixedOffset>> = None;
839    for tf in idx.child_counts.keys() {
840        let recs = read_jsonl_records(&store.root.join(tf).join("index.jsonl")).unwrap_or_default();
841        if let Some(u) = recs.first().and_then(|r| r.updated) {
842            max_upd = Some(match max_upd {
843                Some(cur) if cur >= u => cur,
844                _ => u,
845            });
846        }
847    }
848    let mut s = String::new();
849    s.push_str("---\n");
850    s.push_str("type: index\n");
851    s.push_str("scope: root\n");
852    if let Some(ts) = max_upd {
853        s.push_str(&format!("updated: {}\n", fmt_ts(&ts)));
854    }
855    s.push_str("---\n\n");
856    s.push_str(&format!("# {ROOT_TITLE}\n"));
857    for layer in Layer::all() {
858        let layer_dir = layer_dir_name(layer);
859        let prefix = format!("{layer_dir}/");
860        let children: Vec<(&PathBuf, &usize)> = idx
861            .child_counts
862            .iter()
863            .filter(|(tf, _)| path_to_unix(tf).starts_with(&prefix))
864            .collect();
865        if children.is_empty() {
866            continue;
867        }
868        let total: usize = children.iter().map(|(_, n)| **n).sum();
869        s.push('\n');
870        s.push_str(&format!("## {} ({total})\n", capitalize(layer_dir)));
871        for (tf, n) in children {
872            let tf_unix = path_to_unix(tf);
873            let display = capitalize(folder_basename(tf));
874            s.push_str(&format!("- [[{tf_unix}/index|{display}]] ({n})\n"));
875        }
876    }
877    s
878}
879
880/// One `index.md` browse line: `- [[path]] — summary  ·  #tag #tag` (the
881/// `  ·  #…` suffix omitted when the file has no tags). The wiki-link target is
882/// the canonical **bare** store-relative path (no `.md` extension — the
883/// doctrine the writers emit and `validate` enforces via
884/// `WIKI_LINK_HAS_EXTENSION`); the jsonl `path` keeps the real on-disk name.
885fn format_md_entry(rec: &IndexRecord) -> String {
886    let path = wiki_target(&rec.path);
887    // Collapse the summary to a single line before interpolating it into the
888    // one-line browse entry. A hand-written file may legally carry a YAML block
889    // scalar (`summary: |-`) whose value spans multiple lines; rendered verbatim
890    // those embedded newlines break the line-oriented `index.md` format and can
891    // forge a standalone catalog entry (`\n- [[…|Click me]] — injected`). The
892    // CLI writers already collapse whitespace; do the same here so the spec's
893    // primary write path (agents writing files directly) can't corrupt the
894    // catalog. Single-line normalization matches `truncate`'s rule (the
895    // layer/root rollups already single-line the same summary via `truncate`).
896    let summary = collapse_whitespace(&rec.summary);
897    let mut line = format!("- [[{path}]] — {summary}");
898    if !rec.tags.is_empty() {
899        let tags = rec
900            .tags
901            .iter()
902            .map(|t| format!("#{t}"))
903            .collect::<Vec<_>>()
904            .join(" ");
905        line.push_str(&format!("  ·  {tags}"));
906    }
907    line
908}
909
910/// The deterministic `## More` footer for an over-cap type-folder.
911fn more_footer(total: usize, type_: &str, layer: &str) -> String {
912    format!(
913        "## More\n\nThis folder has {total} files. The {MD_CAP} most recent are listed above.\nUse `dbmd index query --type {type_} --in {layer}` for the complete catalog.\n"
914    )
915}
916
917/// Canonical total order: `updated` descending (None sorts last), ties broken
918/// by store-relative path ascending. A *total* order, so write-through and
919/// rebuild never disagree on #500 vs #501.
920fn sort_records(records: &mut [IndexRecord]) {
921    records.sort_by(record_recency_cmp);
922}
923
924impl IndexRecord {
925    /// Build the [`IndexRecord`] a freshly-rebuilt `index.jsonl` *should* hold
926    /// for the file at `abs` (catalogued under store-relative `rel`).
927    ///
928    /// This is the single canonical projection from frontmatter → sidecar
929    /// record: [`Index::build_type_folder`] uses the same path to write the
930    /// jsonl, so the validator can rebuild the expected record here and compare
931    /// it field-for-field against the committed line — covering **every**
932    /// queryable/dedup field the query path reads (`summary`, `type`, `tags`,
933    /// `links`, `created`, `updated`, and every type-specific `fields` entry
934    /// like `email` / `domain` / `company` / `amount` / `vendor`) without the
935    /// validator hand-rolling (and drifting from) the projection per field.
936    pub(crate) fn expected_from_file(abs: &Path, rel: PathBuf) -> crate::Result<IndexRecord> {
937        record_from_file(abs, rel)
938    }
939}
940
941/// Build an [`IndexRecord`] from a file on disk. Missing `summary` →
942/// [`MISSING_SUMMARY`] placeholder (the index never invents a summary).
943fn record_from_file(abs: &Path, rel: PathBuf) -> crate::Result<IndexRecord> {
944    let mut meta = read_frontmatter(abs)?;
945    // Records carry an effective `meta-type` in the catalog: the declared value
946    // (already spilled into `fields` by `read_frontmatter`), or the default
947    // `fact` when absent — so `--where meta-type=fact` sees un-annotated records.
948    // Sources are evidence and carry no meta-type.
949    if rel.starts_with("records") {
950        meta.fields
951            .entry("meta-type".to_string())
952            .or_insert_with(|| Value::String("fact".to_string()));
953    }
954    Ok(IndexRecord {
955        path: rel,
956        type_: meta.type_.unwrap_or_default(),
957        summary: meta.summary.unwrap_or_else(|| MISSING_SUMMARY.to_string()),
958        tags: meta.tags,
959        links: meta.links,
960        created: meta.created,
961        updated: meta.updated,
962        fields: meta.fields,
963    })
964}
965
966/// The slice of a frontmatter this module needs.
967struct FileMeta {
968    type_: Option<String>,
969    summary: Option<String>,
970    tags: Vec<String>,
971    links: Vec<String>,
972    created: Option<DateTime<FixedOffset>>,
973    updated: Option<DateTime<FixedOffset>>,
974    fields: BTreeMap<String, Value>,
975}
976
977/// Minimal frontmatter read: split the leading `---`…`---` block and parse it
978/// as YAML, extracting the typed fields and spilling the rest into `fields`.
979/// Self-contained (does not route through the `parser` module).
980///
981/// **Body bytes are never required to be UTF-8.** `sources/` is "preserved
982/// verbatim" per the SPEC and routinely carries non-UTF-8 imports (Latin-1
983/// emails dropped in by `rsync`/`mbsync`/`cp`); the body can hold any byte. We
984/// read the file as raw bytes and lossily decode *only* the leading frontmatter
985/// region, so a stray non-UTF-8 byte in the body can never abort the projection
986/// (the old `fs::read_to_string` failed on the first such byte anywhere in the
987/// file, taking a whole `rebuild_all` / write-through down with it). The
988/// frontmatter itself is expected to be UTF-8; if it isn't, `U+FFFD` markers
989/// surface in the parsed values rather than a hard abort.
990fn read_frontmatter(abs: &Path) -> crate::Result<FileMeta> {
991    let bytes = fs::read(abs)?;
992    let yaml = extract_frontmatter_block_lossy(&bytes).unwrap_or_default();
993    let map: serde_norway::Mapping = if yaml.trim().is_empty() {
994        serde_norway::Mapping::new()
995    } else {
996        serde_norway::from_str(&yaml).map_err(|e| {
997            crate::Error::Store(crate::store::StoreError::BadTypeIndex {
998                path: abs.to_path_buf(),
999                message: format!("frontmatter YAML: {e}"),
1000            })
1001        })?
1002    };
1003
1004    let mut type_ = None;
1005    let mut summary = None;
1006    let mut tags = Vec::new();
1007    let mut links = Vec::new();
1008    let mut created = None;
1009    let mut updated = None;
1010    let mut fields = BTreeMap::new();
1011
1012    for (k, v) in map {
1013        let key = match k.as_str() {
1014            Some(s) => s.to_string(),
1015            None => continue,
1016        };
1017        match key.as_str() {
1018            // `type` and `summary` are coerced with the SAME scalar rule the
1019            // validator applies (`validate::scalar_string`: String/Number/Bool →
1020            // string). A bare `v.as_str()` returns `None` for an unquoted numeric
1021            // or boolean scalar (`summary: 2026`, `type: true`), so the index
1022            // would write the `(no summary)` / empty-type placeholder while
1023            // `dbmd validate` reads the file as HAVING that summary/type —
1024            // yielding a permanently-unfixable `INDEX_SUMMARY_MISMATCH` (every
1025            // rebuild reproduces the same mismatched placeholder). Coercing here
1026            // keeps the writer and the validator byte-for-byte in agreement.
1027            "type" => type_ = scalar_string(&v),
1028            "summary" => summary = scalar_string(&v),
1029            "tags" => tags = yaml_string_list(&v),
1030            "links" => links = yaml_string_list(&v),
1031            "created" => created = v.as_str().and_then(parse_ts),
1032            "updated" => updated = v.as_str().and_then(parse_ts),
1033            // `path`, `type`, `summary`, `tags`, `links`, `created`, `updated`
1034            // are the reserved IndexRecord keys; everything else (including
1035            // `id`, `status`, type-specific fields) goes to `fields`.
1036            "path" => {}
1037            _ => {
1038                fields.insert(key, yaml_to_json_value(&v));
1039            }
1040        }
1041    }
1042
1043    Ok(FileMeta {
1044        type_,
1045        summary,
1046        tags,
1047        links,
1048        created,
1049        updated,
1050        fields,
1051    })
1052}
1053
1054/// A YAML scalar (`String`/`Number`/`Bool`) rendered as a string; `None` for
1055/// sequences/mappings/null. **Must stay identical to `validate::scalar_string`**
1056/// so the index writer and the validator coerce `type`/`summary` the same way
1057/// (see [`read_frontmatter`]); an unquoted `summary: 2026` becomes `"2026"` in
1058/// both, not a placeholder here and a real value there.
1059fn scalar_string(v: &serde_norway::Value) -> Option<String> {
1060    match v {
1061        serde_norway::Value::String(s) => Some(s.clone()),
1062        serde_norway::Value::Number(n) => Some(n.to_string()),
1063        serde_norway::Value::Bool(b) => Some(b.to_string()),
1064        _ => None,
1065    }
1066}
1067
1068/// Lossily decode the leading frontmatter region of a file given its raw bytes,
1069/// then pull the YAML between the opening `---` and the next `---`. Only the
1070/// frontmatter region needs to be valid UTF-8 in practice; the body may carry
1071/// arbitrary bytes (a verbatim `sources/` import). Returns `None` when the file
1072/// has no frontmatter fence at its very start.
1073fn extract_frontmatter_block_lossy(bytes: &[u8]) -> Option<String> {
1074    // Decode lossily so a non-UTF-8 body byte never aborts the read. The
1075    // frontmatter is at the very start of the file, so a lossy whole-file decode
1076    // is correct for extracting it (and cheap relative to the YAML parse). A
1077    // leading UTF-8 BOM is stripped by `extract_frontmatter_block`.
1078    let text = String::from_utf8_lossy(bytes);
1079    extract_frontmatter_block(&text)
1080}
1081
/// Extract the YAML text enclosed by the opening and closing `---` fences.
///
/// A leading UTF-8 BOM is ignored. The very first line must be `---` (trailing
/// whitespace allowed, leading whitespace not); the block ends at the next such
/// line. Each enclosed line is returned terminated by a single `\n`, whatever
/// line ending the input used.
///
/// Returns `None` when the text is empty, does not start with a fence, or the
/// fence is never closed.
fn extract_frontmatter_block(text: &str) -> Option<String> {
    let is_fence = |line: &str| line.trim_end() == "---";

    let content = text.strip_prefix('\u{feff}').unwrap_or(text);
    let mut remaining = content.lines();
    match remaining.next() {
        Some(opening) if is_fence(opening) => {}
        _ => return None,
    }

    let mut yaml = String::new();
    loop {
        // Running out of lines before a closing fence means "no frontmatter".
        let line = remaining.next()?;
        if is_fence(line) {
            return Some(yaml);
        }
        yaml.push_str(line);
        yaml.push('\n');
    }
}
1101
1102/// Read a string scalar or a sequence-of-string-scalars into a `Vec<String>`.
1103/// Wiki-link items keep their `[[…]]` form verbatim.
1104fn yaml_string_list(v: &serde_norway::Value) -> Vec<String> {
1105    match v {
1106        serde_norway::Value::String(s) => vec![s.clone()],
1107        serde_norway::Value::Sequence(seq) => seq
1108            .iter()
1109            .filter_map(yaml_string_or_wiki_link_literal)
1110            .collect(),
1111        _ => Vec::new(),
1112    }
1113}
1114
1115fn yaml_string_or_wiki_link_literal(v: &serde_norway::Value) -> Option<String> {
1116    v.as_str()
1117        .map(str::to_string)
1118        .or_else(|| unquoted_wiki_link_literal(v))
1119}
1120
1121fn yaml_to_json_value(v: &serde_norway::Value) -> Value {
1122    if let Some(link) = unquoted_wiki_link_literal(v) {
1123        return Value::String(link);
1124    }
1125    match v {
1126        serde_norway::Value::String(s) => Value::String(s.clone()),
1127        serde_norway::Value::Bool(b) => Value::Bool(*b),
1128        serde_norway::Value::Number(n) => {
1129            serde_json::to_value(n).unwrap_or_else(|_| Value::String(n.to_string()))
1130        }
1131        serde_norway::Value::Sequence(seq) => {
1132            Value::Array(seq.iter().map(yaml_to_json_value).collect())
1133        }
1134        serde_norway::Value::Mapping(_) | serde_norway::Value::Tagged(_) => {
1135            serde_json::to_value(v).unwrap_or(Value::Null)
1136        }
1137        serde_norway::Value::Null => Value::Null,
1138    }
1139}
1140
1141fn unquoted_wiki_link_literal(v: &serde_norway::Value) -> Option<String> {
1142    let serde_norway::Value::Sequence(outer) = v else {
1143        return None;
1144    };
1145    if outer.len() != 1 {
1146        return None;
1147    }
1148    let serde_norway::Value::Sequence(inner) = &outer[0] else {
1149        return None;
1150    };
1151    let [serde_norway::Value::String(target)] = inner.as_slice() else {
1152        return None;
1153    };
1154    Some(format!("[[{target}]]"))
1155}
1156
1157/// Parse an RFC3339 timestamp scalar.
1158fn parse_ts(s: &str) -> Option<DateTime<FixedOffset>> {
1159    DateTime::parse_from_rfc3339(s.trim()).ok()
1160}
1161
1162/// Render a timestamp the same way `serde_json` renders an `IndexRecord`
1163/// timestamp (RFC3339, `Z` for UTC, sub-seconds preserved) so the md
1164/// frontmatter and the jsonl agree byte-for-byte.
1165fn fmt_ts(ts: &DateTime<FixedOffset>) -> String {
1166    ts.to_rfc3339_opts(SecondsFormat::AutoSi, true)
1167}
1168
1169/// Max `updated` over an iterator of optional timestamps.
1170fn max_updated<'a>(
1171    it: impl Iterator<Item = Option<&'a DateTime<FixedOffset>>>,
1172) -> Option<DateTime<FixedOffset>> {
1173    let mut best: Option<DateTime<FixedOffset>> = None;
1174    for ts in it.flatten() {
1175        best = Some(match best {
1176            Some(cur) if cur >= *ts => cur,
1177            _ => *ts,
1178        });
1179    }
1180    best
1181}
1182
1183/// Read a type-folder's `index.jsonl` into records, applying last-write-wins by
1184/// `path` over any un-compacted lines (so a half-compacted jsonl still reads
1185/// cleanly). Missing file → empty set. Returns records in canonical order.
1186fn read_jsonl_records(jsonl: &Path) -> crate::Result<Vec<IndexRecord>> {
1187    let text = match fs::read_to_string(jsonl) {
1188        Ok(t) => t,
1189        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
1190        Err(e) => return Err(e.into()),
1191    };
1192    // Last-write-wins by path; preserve only the final occurrence.
1193    let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
1194    for (i, line) in text.lines().enumerate() {
1195        if line.trim().is_empty() {
1196            continue;
1197        }
1198        let rec: IndexRecord = serde_json::from_str(line).map_err(|e| {
1199            crate::Error::Store(crate::store::StoreError::BadTypeIndex {
1200                path: jsonl.to_path_buf(),
1201                message: format!("line {}: {e}", i + 1),
1202            })
1203        })?;
1204        by_path.insert(rec.path.clone(), rec);
1205    }
1206    let mut records: Vec<IndexRecord> = by_path.into_values().collect();
1207    sort_records(&mut records);
1208    Ok(records)
1209}
1210
1211/// The minimal rollup stat a parent index needs from one type-folder's
1212/// `index.jsonl`: how many distinct files it catalogs (`count`) and the single
1213/// newest record (`newest`, the recency-sorted `.first()` — its `updated` feeds
1214/// the parent's derived `updated`, its `summary` the layer preview). Holding the
1215/// newest record alone, rather than the whole sidecar, is what keeps a rollup
1216/// recompute cheap regardless of how large the sidecar grows.
1217#[derive(Debug, Clone, Default, PartialEq)]
1218struct FolderStat {
1219    count: usize,
1220    newest: Option<IndexRecord>,
1221}
1222
1223/// Read a type-folder's `index.jsonl` ONCE and reduce it to a [`FolderStat`]:
1224/// distinct-`path` count (last-write-wins) plus the recency-newest record. A
1225/// missing sidecar is the default (`count: 0`, `newest: None`). This is the
1226/// **loop-path** rollup primitive — one streaming pass per sidecar, never the
1227/// content tree and never the 2–3× full reparse the old
1228/// `jsonl_record_count` + `read_jsonl_records` pair did. `count` is
1229/// byte-identical to [`read_jsonl_records`]`.len()` and `newest` to its
1230/// `.first()`, so a rollup built from these stats matches the from-scratch
1231/// builders byte-for-byte.
1232fn read_folder_stat(jsonl: &Path) -> crate::Result<FolderStat> {
1233    let text = match fs::read_to_string(jsonl) {
1234        Ok(t) => t,
1235        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(FolderStat::default()),
1236        Err(e) => return Err(e.into()),
1237    };
1238    // Last-write-wins by path, exactly like `read_jsonl_records`, so count and
1239    // newest are computed over the same compacted record set.
1240    let mut by_path: BTreeMap<PathBuf, IndexRecord> = BTreeMap::new();
1241    for (i, line) in text.lines().enumerate() {
1242        if line.trim().is_empty() {
1243            continue;
1244        }
1245        let rec: IndexRecord = serde_json::from_str(line).map_err(|e| {
1246            crate::Error::Store(crate::store::StoreError::BadTypeIndex {
1247                path: jsonl.to_path_buf(),
1248                message: format!("line {}: {e}", i + 1),
1249            })
1250        })?;
1251        by_path.insert(rec.path.clone(), rec);
1252    }
1253    let count = by_path.len();
1254    // The newest record is the minimum under `sort_records`' order (updated
1255    // desc, None last, ties by path asc) — i.e. what `.first()` returns. Find it
1256    // with a single min-scan instead of sorting the whole set.
1257    let newest = by_path.into_values().min_by(record_recency_cmp);
1258    Ok(FolderStat { count, newest })
1259}
1260
1261/// The total order [`sort_records`] imposes, as a comparator over two records:
1262/// `updated` descending (None last), ties broken by store-relative path
1263/// ascending. Kept in one place so `read_folder_stat`'s min-scan agrees with the
1264/// sort byte-for-byte on which record is "newest".
1265fn record_recency_cmp(a: &IndexRecord, b: &IndexRecord) -> std::cmp::Ordering {
1266    match (b.updated, a.updated) {
1267        (Some(bu), Some(au)) => bu.cmp(&au),
1268        (Some(_), None) => std::cmp::Ordering::Greater, // a is None → after b
1269        (None, Some(_)) => std::cmp::Ordering::Less,    // b is None → after a
1270        (None, None) => std::cmp::Ordering::Equal,
1271    }
1272    .then_with(|| a.path.cmp(&b.path))
1273}
1274
1275/// Per-child rollup stats for `layers`, read from each type-folder's on-disk
1276/// `index.jsonl` (one [`read_folder_stat`] pass each) rather than walked from the
1277/// content tree. The **loop-path** counterpart to the from-scratch counting in
1278/// [`Index::build_layer`] / [`Index::build_root`]: it keeps [`update_parents`]
1279/// `O(type-folders)` sidecar reads so a single write never re-enumerates the
1280/// whole store, and reuses one read per sidecar across BOTH the layer and root
1281/// rollups. Empty folders (`count == 0`) are kept out of the map.
1282fn collect_child_stats(
1283    store: &Store,
1284    layers: &[Layer],
1285) -> crate::Result<BTreeMap<PathBuf, FolderStat>> {
1286    let mut stats = BTreeMap::new();
1287    for &layer in layers {
1288        for tf in type_folders_in_layer(store, layer) {
1289            let stat = read_folder_stat(&store.root.join(&tf).join("index.jsonl"))?;
1290            if stat.count > 0 {
1291                stats.insert(tf, stat);
1292            }
1293        }
1294    }
1295    Ok(stats)
1296}
1297
1298/// Walk a type-folder's `.md` content files, recursing through date-shards,
1299/// excluding the `index.md` artifact itself and any hidden entries.
1300fn walk_type_folder_files(folder_abs: &Path) -> Vec<PathBuf> {
1301    let mut out = Vec::new();
1302    if !folder_abs.is_dir() {
1303        return out;
1304    }
1305    for entry in walkdir::WalkDir::new(folder_abs)
1306        .into_iter()
1307        .filter_entry(|e| !is_hidden(e.file_name()))
1308        .filter_map(|e| e.ok())
1309    {
1310        if !entry.file_type().is_file() {
1311            continue;
1312        }
1313        let p = entry.path();
1314        if p.extension().and_then(|e| e.to_str()) != Some("md") {
1315            continue;
1316        }
1317        if p.file_name().and_then(|n| n.to_str()) == Some("index.md") {
1318            continue;
1319        }
1320        out.push(p.to_path_buf());
1321    }
1322    out
1323}
1324
1325/// The immediate type-folders under a layer (one directory level below the
1326/// layer dir), as store-relative paths. Hidden dirs and `log/` are skipped.
1327fn type_folders_in_layer(store: &Store, layer: Layer) -> Vec<PathBuf> {
1328    let layer_dir = store.root.join(layer_dir_name(layer));
1329    let mut out = Vec::new();
1330    let rd = match fs::read_dir(&layer_dir) {
1331        Ok(rd) => rd,
1332        Err(_) => return out,
1333    };
1334    for entry in rd.flatten() {
1335        if !entry.path().is_dir() {
1336            continue;
1337        }
1338        let name = entry.file_name();
1339        let name = match name.to_str() {
1340            Some(n) => n,
1341            None => continue,
1342        };
1343        if is_hidden(entry.file_name().as_os_str()) || name == "log" {
1344            continue;
1345        }
1346        out.push(PathBuf::from(layer_dir_name(layer)).join(name));
1347    }
1348    out.sort();
1349    out
1350}
1351
1352/// The type-folder a content file belongs to: `<layer>/<type>` (the first two
1353/// path components), or `None` if the path is not under a known layer with at
1354/// least a type segment.
1355fn type_folder_of(file_rel: &Path) -> Option<PathBuf> {
1356    let mut comps = file_rel.components();
1357    let layer = comps.next()?.as_os_str().to_str()?;
1358    layer_from_dir_name(layer)?;
1359    let type_seg = comps.next()?.as_os_str().to_str()?;
1360    Some(PathBuf::from(layer).join(type_seg))
1361}
1362
/// Strip `root` from the front of `abs`, giving the store-relative remainder.
/// Returns `None` when `abs` does not start with `root`.
fn rel_to_store(root: &Path, abs: &Path) -> Option<PathBuf> {
    match abs.strip_prefix(root) {
        Ok(rel) => Some(rel.to_owned()),
        Err(_) => None,
    }
}
1367
1368/// Normalize a possibly-absolute or `./`-prefixed path to a clean
1369/// store-relative form (drops a leading `./`; leaves already-relative paths).
1370fn normalize_rel(p: &Path) -> PathBuf {
1371    let s = path_to_unix(p);
1372    let s = s.strip_prefix("./").unwrap_or(&s);
1373    PathBuf::from(s)
1374}
1375
/// True when the final path component is exactly `index.md` or `index.jsonl`.
/// Only the file name is examined; the file is not opened.
fn is_index_artifact(p: &Path) -> bool {
    p.file_name()
        .is_some_and(|name| name == "index.md" || name == "index.jsonl")
}
1382
1383/// True when a file named `index.md` / `index.jsonl` is safe for [`Index::cleanup`]
1384/// to delete — i.e. it is a generated catalog artifact (or a stale/garbage
1385/// leftover from a previous build), NOT a user content file that merely happens
1386/// to be named `index.md`.
1387///
1388/// - `index.jsonl` is always a machine artifact (content files are `.md`), so it
1389///   is always deletable.
1390/// - `index.md` is deletable UNLESS it parses as a content file — frontmatter
1391///   whose `type` is some real record type (anything other than `index`). A
1392///   generated catalog carries `type: index`; a user record carries its own type
1393///   (`email`, `note`, …) and must be preserved (deleting it is silent,
1394///   unrecoverable data loss). A leftover with no/garbage frontmatter (e.g. a
1395///   bare `stale\n`) is treated as a deletable stale artifact.
1396fn is_deletable_catalog_artifact(p: &Path) -> bool {
1397    match p.file_name().and_then(|n| n.to_str()) {
1398        Some("index.jsonl") => true,
1399        Some("index.md") => match read_frontmatter(p) {
1400            // Real content file (non-`index` type) → preserve, never delete.
1401            Ok(meta) => meta.type_.as_deref().is_none_or(|t| t == "index"),
1402            // Unreadable / no frontmatter → a stale or garbage artifact, deletable.
1403            Err(_) => true,
1404        },
1405        _ => false,
1406    }
1407}
1408
/// True when a file or directory name starts with `.` (a dotfile).
///
/// The check is made on the first encoded byte rather than via `to_str()`, so
/// a dotfile whose name is not valid UTF-8 is still recognised as hidden.
/// Going through `to_str()` returned `false` for such names, which let them
/// into any walk that filters on this function. An empty name is not hidden.
fn is_hidden(name: &std::ffi::OsStr) -> bool {
    // `.` is ASCII, so it is a single, unambiguous leading byte in the
    // platform encoding exposed by `as_encoded_bytes`.
    name.as_encoded_bytes().first() == Some(&b'.')
}
1412
1413fn layer_dir_name(layer: Layer) -> &'static str {
1414    match layer {
1415        Layer::Sources => "sources",
1416        Layer::Records => "records",
1417    }
1418}
1419
1420/// Local layer-name parse. Mirrors the contract of [`Layer::from_dir_name`];
1421/// kept local to keep this module's walk self-contained (see the module header).
1422fn layer_from_dir_name(name: &str) -> Option<Layer> {
1423    match name {
1424        "sources" => Some(Layer::Sources),
1425        "records" => Some(Layer::Records),
1426        _ => None,
1427    }
1428}
1429
/// The last path component as a `&str` (the folder's basename).
///
/// Falls back to `""` when the path has no final component (empty path, `/`,
/// or a path ending in `..`) or when that component is not valid UTF-8.
fn folder_basename(p: &Path) -> &str {
    match p.file_name() {
        Some(name) => name.to_str().unwrap_or(""),
        None => "",
    }
}
1434
1435/// The canonical wiki-link target for a content path: the store-relative path
1436/// with `/` separators and the trailing `.md` stripped (the bare form the
1437/// `index.md` browse view links to).
1438fn wiki_target(p: &Path) -> String {
1439    let unix = path_to_unix(p);
1440    unix.strip_suffix(".md").unwrap_or(&unix).to_string()
1441}
1442
/// Render a path with `/` separators regardless of host OS, so artifacts are
/// identical on every platform.
///
/// A non-UTF-8 path component (reachable on Linux/ext4, db.md's primary
/// deployment target, where `sources/` files arrive verbatim from Latin-1
/// exports) is decoded **lossily** with `U+FFFD` markers rather than silently
/// dropped. Dropping a bad component would serialize
/// `sources/emails/caf\xe9.md` as `sources/emails` — a path pointing at the
/// *directory*, not the file, that also collapses distinct files onto one
/// `index.jsonl` key. Lossy decoding keeps the leaf present and visibly marked.
///
/// The root component is emitted as a single `/` and is never followed by a
/// second separator, so `/a/b` renders as `/a/b` (joining every component
/// with `/` produced `//a/b`) and a Windows `C:\a` renders as `C:/a`.
/// Relative paths, the form the catalog stores, render exactly as before.
fn path_to_unix(p: &Path) -> String {
    use std::path::Component;

    let mut out = String::new();
    for comp in p.components() {
        match comp {
            // The root *is* the separator; do not join it like a segment.
            Component::RootDir => out.push('/'),
            other => {
                if !out.is_empty() && !out.ends_with('/') {
                    out.push('/');
                }
                out.push_str(&other.as_os_str().to_string_lossy());
            }
        }
    }
    out
}
1460
1461/// Serde for [`IndexRecord::path`]: always forward-slash on the wire, so the
1462/// `index.jsonl` catalog is identical whether the store was written on POSIX or
1463/// Windows (a git clone across OSes yields the same paths, and the last-write-
1464/// wins upsert key never splits on separator style). On POSIX this matches the
1465/// default `PathBuf` serialization; on Windows it rewrites `\` to `/`.
1466mod path_serde {
1467    use super::path_to_unix;
1468    use serde::{Deserialize, Deserializer, Serializer};
1469    use std::path::{Path, PathBuf};
1470
1471    pub fn serialize<S: Serializer>(p: &Path, s: S) -> Result<S::Ok, S::Error> {
1472        s.serialize_str(&path_to_unix(p))
1473    }
1474
1475    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<PathBuf, D::Error> {
1476        Ok(PathBuf::from(String::deserialize(d)?))
1477    }
1478}
1479
/// Upper-case the first character of `s` and leave the rest untouched.
///
/// The mapping is `char::to_uppercase`, i.e. Unicode-aware rather than
/// ASCII-only: a first character may expand to several (`ß` → `SS`). An empty
/// input yields an empty string.
fn capitalize(s: &str) -> String {
    let Some(head) = s.chars().next() else {
        return String::new();
    };
    let tail = &s[head.len_utf8()..];
    let mut out = String::with_capacity(s.len());
    out.extend(head.to_uppercase());
    out.push_str(tail);
    out
}
1488
/// Reduce `s` to a single line: every run of whitespace (newlines included)
/// becomes one space, and leading/trailing whitespace is removed.
///
/// This is the normalization shared by the `index.md` browse entry
/// ([`format_md_entry`]) and the rollup preview ([`truncate`]), so a
/// multi-line block-scalar summary can never inject a newline into either.
fn collapse_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words are never empty, so an empty `out` means "first word".
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
1496
1497/// Truncate to at most `max` chars (char-boundary safe), single-line.
1498fn truncate(s: &str, max: usize) -> String {
1499    let one_line = collapse_whitespace(s);
1500    if one_line.chars().count() <= max {
1501        one_line
1502    } else {
1503        one_line.chars().take(max).collect()
1504    }
1505}
1506
1507/// Atomic (rename-based) write for the **derived** catalog (`index.md` /
1508/// `index.jsonl`). Deliberately NOT `fsync`-durable like [`crate::fsx`]: the
1509/// index is rebuildable (`dbmd index rebuild`) and this is the O(changed)
1510/// write-through path, so a per-write `fsync` would be cost without benefit — a
1511/// crash-lost catalog write is recovered by a rebuild, not data loss. (Primary
1512/// data — content records, `log.md` — uses the durable `crate::fsx` path.)
1513fn write_atomic(path: &Path, contents: String) -> crate::Result<()> {
1514    if let Some(parent) = path.parent() {
1515        fs::create_dir_all(parent)?;
1516    }
1517    let dir = path.parent().unwrap_or_else(|| Path::new("."));
1518    let mut tmp = tempfile_in(dir)?;
1519    tmp.write_all(contents.as_bytes())?;
1520    tmp.flush()?;
1521    tmp.persist(path)?;
1522    Ok(())
1523}
1524
1525fn remove_if_exists(path: &Path) -> crate::Result<()> {
1526    match fs::remove_file(path) {
1527        Ok(()) => Ok(()),
1528        Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(()),
1529        Err(e) => Err(e.into()),
1530    }
1531}
1532
1533fn bad_index(path: &Path, msg: &str) -> crate::Error {
1534    crate::Error::Store(crate::store::StoreError::BadTypeIndex {
1535        path: path.to_path_buf(),
1536        message: msg.to_string(),
1537    })
1538}
1539
/// Per-type-folder advisory lock for the write-through sidecar
/// read-modify-write.
///
/// The write-through update of a folder's `index.jsonl`/`index.md` is a
/// read-snapshot → modify → atomic-rename-over-whole-file sequence. The SPEC
/// sanctions many-writer concurrency for `records/` (`dbmd write` is
/// `create_new`-race-safe for the *content* file), but two concurrent writers
/// to the SAME type-folder would each read the same sidecar snapshot, add only
/// their own row, and rename their whole file over the other's — a lost
/// update, dropping rows until a manual `dbmd index rebuild`. This lock
/// serializes the per-folder read-modify-write so concurrent sanctioned writes
/// each see the other's row.
///
/// Mechanism: a hidden `<type-folder>/.index.lock` created with `create_new`
/// (the same exclusive-create primitive `cmd/write.rs` uses), retried in a
/// bounded loop with a short sleep. A lockfile whose mtime is older than the
/// staleness window is deleted so a crashed writer cannot wedge the folder
/// forever. The dotfile name keeps the lock out of the content walk (hidden
/// names are skipped) and out of `cleanup` (`is_index_artifact` only matches
/// `index.md`/`index.jsonl`). Dropping the guard removes the lockfile,
/// including on error paths.
///
/// Limits visible in this implementation:
/// - the lockfile's mtime is never refreshed after creation, so a holder that
///   works longer than the staleness window can have its lock broken by a
///   waiter;
/// - release deletes by path without checking ownership.
///
/// The lock is advisory and best-effort, not a hard mutual-exclusion
/// guarantee.
struct FolderLock {
    /// Location of the `.index.lock` file inside the type-folder.
    path: PathBuf,
    /// `true` only when this guard created the lockfile and must remove it on
    /// drop; `false` when acquisition gave up and the caller runs unlocked.
    held: bool,
}

impl FolderLock {
    /// Acquire the lock for `folder_abs`.
    ///
    /// Makes up to `MAX_ATTEMPTS` exclusive-create attempts. When the lockfile
    /// already exists it is either deleted (if older than `STALE_AFTER`, with
    /// an immediate retry) or waited on for `SPIN`. Never fails: if the budget
    /// runs out, or the create fails for any reason other than "already
    /// exists" (e.g. permissions), an un-held guard is returned so the write
    /// proceeds unlocked instead of erroring a sanctioned operation.
    fn acquire(folder_abs: &Path) -> Self {
        use std::io::ErrorKind;
        use std::time::Duration;
        const MAX_ATTEMPTS: u32 = 600; // ~6s at 10ms/attempt
        const SPIN: Duration = Duration::from_millis(10);
        const STALE_AFTER: Duration = Duration::from_secs(30);

        let path = folder_abs.join(".index.lock");
        // Ensure the folder exists so the lockfile create can succeed.
        let _ = fs::create_dir_all(folder_abs);

        let mut attempts_left = MAX_ATTEMPTS;
        while attempts_left > 0 {
            attempts_left -= 1;
            let created = fs::OpenOptions::new()
                .write(true)
                .create_new(true)
                .open(&path);
            match created {
                // The handle is not kept: the file's existence is the lock.
                Ok(_handle) => return FolderLock { path, held: true },
                // Anything but "someone else holds it": give up on locking.
                Err(e) if e.kind() != ErrorKind::AlreadyExists => {
                    return FolderLock { path, held: false };
                }
                Err(_) => {}
            }
            if Self::is_stale(&path, STALE_AFTER) {
                // Break a lock left by a crashed writer and retry at once.
                let _ = fs::remove_file(&path);
            } else {
                std::thread::sleep(SPIN);
            }
        }
        // Contention budget exhausted: proceed unlocked (best-effort).
        FolderLock { path, held: false }
    }

    /// True when `lock`'s mtime is readable and more than `limit` in the
    /// past. Unreadable metadata or an mtime in the future counts as not
    /// stale.
    fn is_stale(lock: &Path, limit: std::time::Duration) -> bool {
        fs::metadata(lock)
            .and_then(|meta| meta.modified())
            .ok()
            .and_then(|mtime| std::time::SystemTime::now().duration_since(mtime).ok())
            .is_some_and(|age| age > limit)
    }
}

impl Drop for FolderLock {
    /// Release the lock by deleting the lockfile, but only if this guard
    /// actually created it. Removal errors are ignored.
    fn drop(&mut self) {
        if !self.held {
            return;
        }
        let _ = fs::remove_file(&self.path);
    }
}
1622
1623/// Acquire the write-through lock for one or two type-folders. When `a == b`
1624/// (same-folder rename) only one lock is taken. For two distinct folders the
1625/// locks are always acquired in sorted order so a pair of concurrent renames
1626/// touching the same two folders can't deadlock by grabbing them in opposite
1627/// orders. Returns the guard(s); drop releases them.
1628fn lock_folders(store: &Store, a: &Path, b: &Path) -> Vec<FolderLock> {
1629    if a == b {
1630        return vec![FolderLock::acquire(&store.root.join(a))];
1631    }
1632    let (first, second) = if a < b { (a, b) } else { (b, a) };
1633    vec![
1634        FolderLock::acquire(&store.root.join(first)),
1635        FolderLock::acquire(&store.root.join(second)),
1636    ]
1637}
1638
// A tiny atomic-write helper. `tempfile` is a dev-dependency for tests; for
// the library path we hand-roll a temp-file-then-rename so writes are atomic
// without pulling `tempfile` into the non-dev dependency set. The file handle
// lives in an `Option` so `persist` can take it out and close it before the
// rename without fighting the `Drop` impl, which only cleans up a temp file
// that was never persisted.
struct AtomicTemp {
    // Open handle to the temp file; `None` once `persist` has closed it.
    file: Option<fs::File>,
    // Where the temp file lives on disk.
    path: PathBuf,
    // Set once the rename succeeded, so `Drop` leaves the destination alone.
    persisted: bool,
}

impl AtomicTemp {
    // The open handle. Panics if called after the handle was taken; `persist`
    // consumes `self`, so that would be a bug in this type.
    fn handle(&mut self) -> &mut fs::File {
        self.file.as_mut().expect("temp file open")
    }

    // Append `bytes` to the temp file.
    fn write_all(&mut self, bytes: &[u8]) -> std::io::Result<()> {
        self.handle().write_all(bytes)
    }

    // Flush the handle's write buffer.
    fn flush(&mut self) -> std::io::Result<()> {
        self.handle().flush()
    }

    // Close the temp file and rename it onto `dest`. The `sync_all` before
    // closing is best-effort (its error is ignored). If the rename fails the
    // error is returned and `Drop` deletes the temp file.
    fn persist(mut self, dest: &Path) -> std::io::Result<()> {
        if let Some(open) = self.file.take() {
            let _ = open.sync_all();
            // Close the handle before the rename.
            drop(open);
        }
        let renamed = fs::rename(&self.path, dest);
        if renamed.is_ok() {
            self.persisted = true;
        }
        renamed
    }
}

impl Drop for AtomicTemp {
    // Best-effort cleanup when an error path bailed out before `persist`
    // succeeded; a persisted temp no longer exists under `path`.
    fn drop(&mut self) {
        if self.persisted {
            return;
        }
        let _ = fs::remove_file(&self.path);
    }
}
1676
1677fn tempfile_in(dir: &Path) -> std::io::Result<AtomicTemp> {
1678    use std::time::{SystemTime, UNIX_EPOCH};
1679    let nanos = SystemTime::now()
1680        .duration_since(UNIX_EPOCH)
1681        .map(|d| d.as_nanos())
1682        .unwrap_or(0);
1683    let pid = std::process::id();
1684    // Monotonic-ish unique suffix; the dir is the destination dir so rename is
1685    // same-filesystem and therefore atomic.
1686    let counter = next_temp_counter();
1687    let name = format!(".dbmd-index-{pid}-{nanos}-{counter}.tmp");
1688    let path = dir.join(name);
1689    let file = fs::OpenOptions::new()
1690        .write(true)
1691        .create_new(true)
1692        .open(&path)?;
1693    Ok(AtomicTemp {
1694        file: Some(file),
1695        path,
1696        persisted: false,
1697    })
1698}
1699
// Process-wide counter for temp-file names: returns 0, 1, 2, … on successive
// calls. Relaxed ordering is enough because only the uniqueness of each
// returned value matters.
fn next_temp_counter() -> u64 {
    use std::sync::atomic::{AtomicU64, Ordering::Relaxed};
    static NEXT: AtomicU64 = AtomicU64::new(0);
    NEXT.fetch_add(1, Relaxed)
}
1705
1706#[cfg(test)]
1707mod tests {
1708    use super::*;
1709    use std::collections::BTreeSet;
1710    use std::fs;
1711    use tempfile::TempDir;
1712
1713    // ── fixtures ─────────────────────────────────────────────────────────
1714
1715    /// A temp store with a `DB.md` marker. `store.config` is the parser default
1716    /// (these tests never exercise the config parser).
1717    fn mk_store() -> (TempDir, Store) {
1718        let dir = TempDir::new().unwrap();
1719        fs::write(dir.path().join("DB.md"), "# test store\n").unwrap();
1720        let store = Store {
1721            root: dir.path().to_path_buf(),
1722            config: crate::parser::Config::default(),
1723        };
1724        (dir, store)
1725    }
1726
1727    /// Write a content file at `rel` with the given frontmatter lines + body.
1728    /// `fm` is the raw YAML body between the fences (no `---`).
1729    fn write_raw(store: &Store, rel: &str, fm: &str, body: &str) {
1730        let abs = store.root.join(rel);
1731        fs::create_dir_all(abs.parent().unwrap()).unwrap();
1732        fs::write(&abs, format!("---\n{fm}\n---\n{body}")).unwrap();
1733    }
1734
1735    /// Convenience: write a typed content file with summary/updated/extras.
1736    fn write_doc(
1737        store: &Store,
1738        rel: &str,
1739        type_: &str,
1740        summary: Option<&str>,
1741        updated: Option<&str>,
1742        extra_yaml: &str,
1743    ) {
1744        let mut fm = format!("type: {type_}\n");
1745        if let Some(s) = summary {
1746            fm.push_str(&format!("summary: {s}\n"));
1747        }
1748        if let Some(u) = updated {
1749            fm.push_str(&format!("updated: {u}\n"));
1750        }
1751        fm.push_str(extra_yaml);
1752        write_raw(store, rel, fm.trim_end(), "\nbody text\n");
1753    }
1754
1755    fn read(store: &Store, rel: &str) -> String {
1756        fs::read_to_string(store.root.join(rel)).unwrap()
1757    }
1758
1759    fn exists(store: &Store, rel: &str) -> bool {
1760        store.root.join(rel).exists()
1761    }
1762
1763    /// Collect every `index.md` + `index.jsonl` under the store, mapped to its
1764    /// bytes — the surface the byte-identity invariant compares.
1765    fn snapshot_artifacts(store: &Store) -> BTreeMap<String, String> {
1766        let mut out = BTreeMap::new();
1767        for entry in walkdir::WalkDir::new(&store.root)
1768            .into_iter()
1769            .filter_map(|e| e.ok())
1770        {
1771            let p = entry.path();
1772            if is_index_artifact(p) {
1773                let rel = path_to_unix(&rel_to_store(&store.root, p).unwrap());
1774                out.insert(rel, fs::read_to_string(p).unwrap());
1775            }
1776        }
1777        out
1778    }
1779
1780    // ── build_type_folder + to_markdown ──────────────────────────────────
1781
1782    #[test]
1783    fn type_folder_aggregates_across_shards_in_recency_order() {
1784        let (_d, store) = mk_store();
1785        // Three emails across two month-shards, deliberately written
1786        // out-of-recency-order on disk.
1787        write_doc(
1788            &store,
1789            "sources/emails/2026/05/b-old.md",
1790            "email",
1791            Some("Older mail"),
1792            Some("2026-05-01T09:00:00Z"),
1793            "",
1794        );
1795        write_doc(
1796            &store,
1797            "sources/emails/2026/06/c-new.md",
1798            "email",
1799            Some("Newest mail"),
1800            Some("2026-06-15T12:00:00Z"),
1801            "",
1802        );
1803        write_doc(
1804            &store,
1805            "sources/emails/2026/05/a-mid.md",
1806            "email",
1807            Some("Middle mail"),
1808            Some("2026-05-20T08:00:00Z"),
1809            "",
1810        );
1811
1812        let idx = Index::build_type_folder(&store, Path::new("sources/emails")).unwrap();
1813        let paths: Vec<String> = idx.records.iter().map(|r| path_to_unix(&r.path)).collect();
1814        assert_eq!(
1815            paths,
1816            vec![
1817                "sources/emails/2026/06/c-new.md",
1818                "sources/emails/2026/05/a-mid.md",
1819                "sources/emails/2026/05/b-old.md",
1820            ],
1821            "records must aggregate across shards, newest `updated` first"
1822        );
1823    }
1824
1825    #[test]
1826    fn type_folder_md_format_entries_tags_and_derived_updated() {
1827        let (_d, store) = mk_store();
1828        write_doc(
1829            &store,
1830            "records/contacts/sarah-chen.md",
1831            "contact",
1832            Some("Renewal champion at Acme"),
1833            Some("2026-05-27T10:00:00Z"),
1834            "tags:\n  - renewal\n  - acme\n",
1835        );
1836        write_doc(
1837            &store,
1838            "records/contacts/no-tags.md",
1839            "contact",
1840            Some("Plain contact"),
1841            Some("2026-05-26T10:00:00Z"),
1842            "",
1843        );
1844
1845        let idx = Index::build_type_folder(&store, Path::new("records/contacts")).unwrap();
1846        let md = idx.to_markdown();
1847
1848        // Frontmatter is exact and the index's own `updated` is the MAX member
1849        // updated (the determinism the byte-identity invariant rests on).
1850        assert!(md.starts_with(
1851            "---\ntype: index\nscope: type-folder\nfolder: records/contacts\nupdated: 2026-05-27T10:00:00Z\n---\n\n# records/contacts\n"
1852        ), "frontmatter/heading wrong:\n{md}");
1853
1854        // Entry with tags: `— summary  ·  #tag #tag`.
1855        assert!(
1856            md.contains(
1857                "- [[records/contacts/sarah-chen]] — Renewal champion at Acme  ·  #renewal #acme\n"
1858            ),
1859            "tagged entry wrong:\n{md}"
1860        );
1861        // Entry without tags omits the `  ·  ` suffix entirely.
1862        assert!(
1863            md.contains("- [[records/contacts/no-tags]] — Plain contact\n"),
1864            "untagged entry wrong:\n{md}"
1865        );
1866        assert!(
1867            !md.contains("Plain contact  ·"),
1868            "untagged entry must not emit a tag separator"
1869        );
1870        // No `## More` below the cap.
1871        assert!(!md.contains("## More"), "no footer expected under the cap");
1872    }
1873
1874    #[test]
1875    fn missing_summary_becomes_placeholder_not_invented() {
1876        let (_d, store) = mk_store();
1877        write_doc(
1878            &store,
1879            "records/notes/x.md",
1880            "note",
1881            None,
1882            Some("2026-05-27T10:00:00Z"),
1883            "",
1884        );
1885        let idx = Index::build_type_folder(&store, Path::new("records/notes")).unwrap();
1886        assert_eq!(idx.records[0].summary, MISSING_SUMMARY);
1887        let md = idx.to_markdown();
1888        assert!(
1889            md.contains("- [[records/notes/x]] — (no summary)\n"),
1890            "missing summary must render the placeholder, not invent text:\n{md}"
1891        );
1892    }
1893
1894    // ── to_jsonl ─────────────────────────────────────────────────────────
1895
1896    #[test]
1897    fn jsonl_is_complete_structured_and_round_trips() {
1898        let (_d, store) = mk_store();
1899        write_doc(
1900            &store,
1901            "records/expenses/2026/05/e1.md",
1902            "expense",
1903            Some("Lunch with vendor"),
1904            Some("2026-05-10T10:00:00Z"),
1905            "created: 2026-05-10T09:00:00Z\nstatus: paid\namount: 42\ncompany: [[records/companies/acme]]\nrelated:\n  - [[wiki/themes/spend]]\ntags:\n  - food\nlinks:\n  - wiki/themes/spend\n  - [[wiki/themes/renewal]]\n",
1906        );
1907        write_doc(
1908            &store,
1909            "records/expenses/2026/06/e2.md",
1910            "expense",
1911            Some("Cloud bill"),
1912            Some("2026-06-01T10:00:00Z"),
1913            "amount: 100\n",
1914        );
1915
1916        let idx = Index::build_type_folder(&store, Path::new("records/expenses")).unwrap();
1917        let jsonl = idx.to_jsonl();
1918        let lines: Vec<&str> = jsonl.lines().collect();
1919        assert_eq!(lines.len(), 2, "one JSON object per file, uncapped");
1920
1921        // Newest first (e2), and each line parses back to an equal record.
1922        let r0: IndexRecord = serde_json::from_str(lines[0]).unwrap();
1923        assert_eq!(path_to_unix(&r0.path), "records/expenses/2026/06/e2.md");
1924        assert_eq!(
1925            r0, idx.records[0],
1926            "jsonl line must round-trip to the record"
1927        );
1928
1929        // The first (data) record carries every reserved field + the extras in
1930        // `fields` (status/amount), and links/tags verbatim.
1931        let r1: IndexRecord = serde_json::from_str(lines[1]).unwrap();
1932        assert_eq!(r1.type_, "expense");
1933        assert_eq!(r1.summary, "Lunch with vendor");
1934        assert_eq!(r1.tags, vec!["food".to_string()]);
1935        assert_eq!(
1936            r1.links,
1937            vec![
1938                "wiki/themes/spend".to_string(),
1939                "[[wiki/themes/renewal]]".to_string()
1940            ]
1941        );
1942        assert_eq!(
1943            r1.created,
1944            Some(DateTime::parse_from_rfc3339("2026-05-10T09:00:00Z").unwrap())
1945        );
1946        assert_eq!(r1.fields.get("status"), Some(&Value::from("paid")));
1947        assert_eq!(r1.fields.get("amount"), Some(&Value::from(42)));
1948        assert_eq!(
1949            r1.fields.get("company"),
1950            Some(&Value::from("[[records/companies/acme]]"))
1951        );
1952        assert_eq!(
1953            r1.fields.get("related"),
1954            Some(&serde_json::json!(["[[wiki/themes/spend]]"]))
1955        );
1956        // Reserved keys never leak into `fields`.
1957        for reserved in [
1958            "path", "type", "summary", "tags", "links", "created", "updated",
1959        ] {
1960            assert!(
1961                !r1.fields.contains_key(reserved),
1962                "reserved key {reserved} must not appear in fields"
1963            );
1964        }
1965
1966        // Stable key order: declared fields first, then sorted extras.
1967        assert!(
1968            lines[1].starts_with(
1969                r#"{"path":"records/expenses/2026/05/e1.md","type":"expense","summary":"Lunch with vendor","tags":["food"],"links":["wiki/themes/spend","[[wiki/themes/renewal]]"],"created":"2026-05-10T09:00:00Z","updated":"2026-05-10T10:00:00Z","#
1970            ),
1971            "jsonl key order not stable:\n{}",
1972            lines[1]
1973        );
1974        // The flattened extras come in BTreeMap (sorted) order. The catalog
1975        // injects `meta-type: fact` into every records-layer file that does not
1976        // declare one, so it appears among the sorted extras (between `company`
1977        // and `related`).
1978        assert!(
1979            lines[1].ends_with(r#""amount":42,"company":"[[records/companies/acme]]","meta-type":"fact","related":["[[wiki/themes/spend]]"],"status":"paid"}"#),
1980            "extras must be sorted:\n{}",
1981            lines[1]
1982        );
1983    }
1984
1985    // ── cap + footer ─────────────────────────────────────────────────────
1986
1987    #[test]
1988    fn over_cap_md_shows_500_plus_footer_jsonl_holds_all() {
1989        let (_d, store) = mk_store();
1990        let total = MD_CAP + 7;
1991        for i in 0..total {
1992            // Distinct, monotonically increasing `updated` so order is total.
1993            let day = 1 + (i % 27);
1994            let rel = format!("sources/emails/2026/05/m-{i:04}.md");
1995            let updated = format!("2026-05-{day:02}T00:00:{:02}Z", i % 60);
1996            write_doc(
1997                &store,
1998                &rel,
1999                "email",
2000                Some(&format!("mail {i}")),
2001                Some(&updated),
2002                "",
2003            );
2004        }
2005        let idx = Index::build_type_folder(&store, Path::new("sources/emails")).unwrap();
2006        assert_eq!(idx.records.len(), total, "jsonl/records keep every file");
2007
2008        let md = idx.to_markdown();
2009        let entry_lines = md.lines().filter(|l| l.starts_with("- [[")).count();
2010        assert_eq!(entry_lines, MD_CAP, "md browse view is capped at 500");
2011
2012        assert!(
2013            md.contains("## More\n\n"),
2014            "over-cap md needs a More footer"
2015        );
2016        assert!(
2017            md.contains(&format!(
2018                "This folder has {total} files. The 500 most recent are listed above.\n"
2019            )),
2020            "footer count wrong:\n{md}"
2021        );
2022        assert!(
2023            md.contains(
2024                "Use `dbmd index query --type email --in sources` for the complete catalog.\n"
2025            ),
2026            "footer must infer type=email layer=sources:\n{md}"
2027        );
2028
2029        let jsonl = idx.to_jsonl();
2030        assert_eq!(jsonl.lines().count(), total, "jsonl is uncapped");
2031    }
2032
2033    // ── sort total order ─────────────────────────────────────────────────
2034
2035    #[test]
2036    fn sort_breaks_ties_by_path_and_puts_undated_last() {
2037        let mut recs = vec![
2038            rec("z/a.md", Some("2026-05-01T00:00:00Z")),
2039            rec("a/b.md", Some("2026-05-01T00:00:00Z")), // same updated, path < z/a
2040            rec("m/c.md", None),                         // undated → last
2041            rec("b/d.md", Some("2026-06-01T00:00:00Z")), // newest
2042        ];
2043        sort_records(&mut recs);
2044        let order: Vec<String> = recs.iter().map(|r| path_to_unix(&r.path)).collect();
2045        assert_eq!(order, vec!["b/d.md", "a/b.md", "z/a.md", "m/c.md"]);
2046    }
2047
2048    fn rec(path: &str, updated: Option<&str>) -> IndexRecord {
2049        IndexRecord {
2050            path: PathBuf::from(path),
2051            type_: "t".into(),
2052            summary: "s".into(),
2053            tags: vec![],
2054            links: vec![],
2055            created: None,
2056            updated: updated.map(|u| DateTime::parse_from_rfc3339(u).unwrap()),
2057            fields: BTreeMap::new(),
2058        }
2059    }
2060
2061    // ── build_layer / build_root ─────────────────────────────────────────
2062
2063    #[test]
2064    fn layer_index_lists_type_folders_with_counts_and_preview() {
2065        let (_d, store) = mk_store();
2066        write_doc(
2067            &store,
2068            "records/contacts/a.md",
2069            "contact",
2070            Some("Contact A older"),
2071            Some("2026-05-01T00:00:00Z"),
2072            "",
2073        );
2074        write_doc(
2075            &store,
2076            "records/contacts/b.md",
2077            "contact",
2078            Some("Contact B newest"),
2079            Some("2026-05-09T00:00:00Z"),
2080            "",
2081        );
2082        write_doc(
2083            &store,
2084            "records/companies/x.md",
2085            "company",
2086            Some("Acme Inc"),
2087            Some("2026-05-05T00:00:00Z"),
2088            "",
2089        );
2090        // build the type-folder artifacts first (layer preview reads their jsonl)
2091        Index::write_level(&store, &IndexLevel::TypeFolder("records/contacts".into())).unwrap();
2092        Index::write_level(&store, &IndexLevel::TypeFolder("records/companies".into())).unwrap();
2093
2094        Index::write_level(&store, &IndexLevel::Layer(Layer::Records)).unwrap();
2095        let md = read(&store, "records/index.md");
2096
2097        assert!(
2098            md.starts_with("---\ntype: index\nscope: layer\nfolder: records\n"),
2099            "layer fm:\n{md}"
2100        );
2101        // Alphabetical type-folder order: companies before contacts.
2102        let companies_at = md.find("companies/index").unwrap();
2103        let contacts_at = md.find("contacts/index").unwrap();
2104        assert!(
2105            companies_at < contacts_at,
2106            "type folders must be alphabetical"
2107        );
2108        // Count + display + newest-summary preview.
2109        assert!(
2110            md.contains("- [[records/contacts/index|Contacts]] (2) — Contact B newest\n"),
2111            "contacts entry:\n{md}"
2112        );
2113        assert!(
2114            md.contains("- [[records/companies/index|Companies]] (1) — Acme Inc\n"),
2115            "companies entry:\n{md}"
2116        );
2117        // Layer `updated` is the max across children (contacts b = 05-09).
2118        assert!(
2119            md.contains("updated: 2026-05-09T00:00:00Z\n"),
2120            "layer updated must be max child:\n{md}"
2121        );
2122    }
2123
2124    #[test]
2125    fn root_index_groups_layers_with_totals_and_per_type_counts() {
2126        let (_d, store) = mk_store();
2127        write_doc(
2128            &store,
2129            "sources/emails/2026/05/a.md",
2130            "email",
2131            Some("Mail"),
2132            Some("2026-05-01T00:00:00Z"),
2133            "",
2134        );
2135        write_doc(
2136            &store,
2137            "sources/docs/d.md",
2138            "doc",
2139            Some("Doc"),
2140            Some("2026-05-02T00:00:00Z"),
2141            "",
2142        );
2143        write_doc(
2144            &store,
2145            "records/contacts/c.md",
2146            "contact",
2147            Some("C"),
2148            Some("2026-05-03T00:00:00Z"),
2149            "",
2150        );
2151        // wiki empty → no Wiki section
2152
2153        Index::rebuild_all(&store).unwrap();
2154        let md = read(&store, "index.md");
2155
2156        assert!(
2157            md.starts_with("---\ntype: index\nscope: root\n"),
2158            "root fm:\n{md}"
2159        );
2160        assert!(md.contains("# Knowledge base index\n"), "root title:\n{md}");
2161        // Layer heading with total count; Sources before Records (canonical).
2162        let sources_h = md
2163            .find("## Sources (2)")
2164            .expect("sources heading w/ total 2");
2165        let records_h = md
2166            .find("## Records (1)")
2167            .expect("records heading w/ total 1");
2168        assert!(sources_h < records_h, "Sources must precede Records");
2169        assert!(!md.contains("## Wiki"), "empty layer gets no section");
2170        // Per-type sub-entries with (N), no preview at root.
2171        assert!(
2172            md.contains("- [[sources/docs/index|Docs]] (1)\n"),
2173            "root docs entry:\n{md}"
2174        );
2175        assert!(
2176            md.contains("- [[sources/emails/index|Emails]] (1)\n"),
2177            "root emails entry:\n{md}"
2178        );
2179        assert!(
2180            md.contains("- [[records/contacts/index|Contacts]] (1)\n"),
2181            "root contacts entry:\n{md}"
2182        );
2183        assert!(!md.contains("— "), "root entries carry no preview text");
2184    }
2185
2186    // ── write-through == rebuild (THE invariant) ─────────────────────────
2187
2188    #[test]
2189    fn on_write_matches_rebuild_byte_for_byte() {
2190        // Build a store incrementally via on_write, and a second identical store
2191        // via a single rebuild_all, then assert every index artifact is equal.
2192        let (_d1, wt) = mk_store();
2193        let (_d2, rb) = mk_store();
2194
2195        let docs: &[(&str, &str, &str, &str, &str)] = &[
2196            (
2197                "sources/emails/2026/05/e1.md",
2198                "email",
2199                "First mail",
2200                "2026-05-01T10:00:00Z",
2201                "tags:\n  - inbox\n",
2202            ),
2203            (
2204                "sources/emails/2026/06/e2.md",
2205                "email",
2206                "Second mail",
2207                "2026-06-01T10:00:00Z",
2208                "",
2209            ),
2210            (
2211                "records/contacts/sarah.md",
2212                "contact",
2213                "Sarah",
2214                "2026-05-15T10:00:00Z",
2215                "links:\n  - wiki/people/sarah\n",
2216            ),
2217            (
2218                "records/contacts/elena.md",
2219                "contact",
2220                "Elena",
2221                "2026-05-20T10:00:00Z",
2222                "status: active\n",
2223            ),
2224            (
2225                "records/profiles/sarah.md",
2226                "profile",
2227                "Sarah bio",
2228                "2026-05-21T10:00:00Z",
2229                "",
2230            ),
2231        ];
2232
2233        for (rel, t, sum, upd, extra) in docs {
2234            write_doc(&wt, rel, t, Some(sum), Some(upd), extra);
2235            write_doc(&rb, rel, t, Some(sum), Some(upd), extra);
2236            Index::on_write(&wt, Path::new(rel)).unwrap();
2237        }
2238        Index::rebuild_all(&rb).unwrap();
2239
2240        let a = snapshot_artifacts(&wt);
2241        let b = snapshot_artifacts(&rb);
2242        assert_eq!(
2243            a.keys().collect::<Vec<_>>(),
2244            b.keys().collect::<Vec<_>>(),
2245            "same set of index artifacts must exist"
2246        );
2247        for (k, v) in &a {
2248            assert_eq!(v, &b[k], "artifact {k} differs between write-through and rebuild:\n--- write-through ---\n{v}\n--- rebuild ---\n{}", b[k]);
2249        }
2250        // Sanity: artifacts actually exist (not a vacuous comparison of empties).
2251        assert!(a.contains_key("index.md"));
2252        assert!(a.contains_key("sources/emails/index.jsonl"));
2253        assert!(a.contains_key("records/contacts/index.md"));
2254    }
2255
2256    /// Regression (O(changed) bound, not just correctness): a loop op must
2257    /// recompute its parent rollups from the type-folder `index.jsonl` sidecars
2258    /// — never by walking the content tree of *sibling* folders it wasn't asked
2259    /// about. The byte-identity property test (which always indexes every folder
2260    /// before comparing) can't catch a violation, because a full-store walk
2261    /// produces the *correct* counts too; it just does so in `O(store files)`.
2262    ///
2263    /// The behavioral fingerprint of the old `update_parents → build_layer /
2264    /// build_root` (which called `walk_type_folder_files` on every type-folder in
2265    /// the store): a single `on_write` to `records/contacts/sarah.md` would
2266    /// surface, in the layer + root rollups, the file count of
2267    /// `records/companies` — a sibling that has content on disk but was NEVER
2268    /// passed to a write/index op, so it has no `index.jsonl`. An O(changed) loop
2269    /// op cannot "see" that un-indexed folder; a whole-store walk can. So this
2270    /// asserts the rollups reflect ONLY the sidecar-indexed folder, proving no
2271    /// content-tree walk happened.
2272    #[test]
2273    fn loop_op_does_not_walk_sibling_content_tree() {
2274        let (_d, store) = mk_store();
2275
2276        // A sibling type-folder with real content on disk, but deliberately
2277        // never indexed (no on_write / write_level / rebuild over it) ⇒ no
2278        // `records/companies/index.jsonl` exists.
2279        write_doc(
2280            &store,
2281            "records/companies/acme.md",
2282            "company",
2283            Some("Acme Inc"),
2284            Some("2026-05-05T00:00:00Z"),
2285            "",
2286        );
2287        write_doc(
2288            &store,
2289            "records/companies/globex.md",
2290            "company",
2291            Some("Globex"),
2292            Some("2026-05-06T00:00:00Z"),
2293            "",
2294        );
2295        assert!(
2296            !exists(&store, "records/companies/index.jsonl"),
2297            "precondition: companies must be un-indexed"
2298        );
2299
2300        // The ONLY loop op: a single write to a different type-folder.
2301        write_doc(
2302            &store,
2303            "records/contacts/sarah.md",
2304            "contact",
2305            Some("Sarah"),
2306            Some("2026-05-15T00:00:00Z"),
2307            "",
2308        );
2309        Index::on_write(&store, Path::new("records/contacts/sarah.md")).unwrap();
2310
2311        // The written folder is reflected in both rollups...
2312        let layer_md = read(&store, "records/index.md");
2313        let root_md = read(&store, "index.md");
2314        // (layer rollup appends a summary preview, root does not)
2315        assert!(
2316            layer_md.contains("- [[records/contacts/index|Contacts]] (1) — Sarah\n"),
2317            "layer must reflect the written folder:\n{layer_md}"
2318        );
2319        assert!(
2320            root_md.contains("- [[records/contacts/index|Contacts]] (1)\n"),
2321            "root must reflect the written folder:\n{root_md}"
2322        );
2323
2324        // ...but the un-indexed sibling must be INVISIBLE to a loop op. If the
2325        // rollups mention `records/companies` at all, `on_write` walked the whole
2326        // content tree — the O(store) regression.
2327        assert!(
2328            !layer_md.contains("companies"),
2329            "loop op walked the sibling content tree: layer rollup counts un-indexed records/companies\n{layer_md}"
2330        );
2331        assert!(
2332            !root_md.contains("companies"),
2333            "loop op walked the sibling content tree: root rollup counts un-indexed records/companies\n{root_md}"
2334        );
2335        // The layer's only child is contacts ⇒ its total is exactly 1, not 3.
2336        assert!(
2337            root_md.contains("## Records (1)"),
2338            "root layer total must count only the sidecar-indexed folder (1), not walked siblings (would be 3):\n{root_md}"
2339        );
2340
2341        // And the sidecar-derived count IS what a full walk WOULD yield once the
2342        // sibling is indexed too — i.e. the fix changes cost, not the eventual
2343        // result. Index companies, then confirm the rollups now (and only now)
2344        // include it, byte-identical to a from-scratch rebuild.
2345        let (_d2, rb) = mk_store();
2346        for (rel, t, s, u) in [
2347            (
2348                "records/companies/acme.md",
2349                "company",
2350                "Acme Inc",
2351                "2026-05-05T00:00:00Z",
2352            ),
2353            (
2354                "records/companies/globex.md",
2355                "company",
2356                "Globex",
2357                "2026-05-06T00:00:00Z",
2358            ),
2359            (
2360                "records/contacts/sarah.md",
2361                "contact",
2362                "Sarah",
2363                "2026-05-15T00:00:00Z",
2364            ),
2365        ] {
2366            write_doc(&rb, rel, t, Some(s), Some(u), "");
2367        }
2368        Index::on_write(&store, Path::new("records/companies/acme.md")).unwrap();
2369        Index::on_write(&store, Path::new("records/companies/globex.md")).unwrap();
2370        Index::rebuild_all(&rb).unwrap();
2371        let a = snapshot_artifacts(&store);
2372        let b = snapshot_artifacts(&rb);
2373        assert_eq!(
2374            a.keys().collect::<BTreeSet<_>>(),
2375            b.keys().collect::<BTreeSet<_>>(),
2376            "same artifact set after indexing both folders"
2377        );
2378        for (k, v) in &a {
2379            assert_eq!(
2380                v, &b[k],
2381                "after indexing the sibling too, loop result must equal rebuild for {k}"
2382            );
2383        }
2384        assert!(
2385            read(&store, "index.md").contains("## Records (3)"),
2386            "now that both folders are indexed, the root total is 3"
2387        );
2388    }
2389
2390    /// Regression: a type filed at the path the toolkit ITSELF computes
2391    /// (`Store::shard_path_for`) must be indexable end-to-end. The class of bug
2392    /// is a 2-component `<layer>/<file>` path, which `type_folder_of` treats as
2393    /// having no type-folder — making the producer (path computation) disagree
2394    /// with the consumer (index): the loop path crashes (`on_write` → `Err`, it
2395    /// tries to write `index.md` *inside* a file) while the sweep path silently
2396    /// drops the page from every catalog. `wiki-page` is now an unrecognized
2397    /// type, so `shard_path_for` files it under the records-layer fallback
2398    /// `records/wiki-page/<file>` — a conforming 3-component path. This test
2399    /// drives both paths through the real `shard_path_for` output and asserts
2400    /// (1) `on_write` succeeds, (2) the page appears in the rebuilt catalog, and
2401    /// (3) write-through == rebuild.
2402    #[test]
2403    fn wiki_page_at_shard_path_for_is_indexable_end_to_end() {
2404        let (_d1, wt) = mk_store();
2405        let (_d2, rb) = mk_store();
2406
2407        // The toolkit's own canonical write path for a wiki-page.
2408        let rel = wt
2409            .shard_path_for(
2410                "wiki-page",
2411                &crate::parser::Frontmatter::default(),
2412                "renewal-theme",
2413            )
2414            .unwrap();
2415        let rel_str = path_to_unix(&rel);
2416        // Guard the precondition the consumer requires: 3+ components so
2417        // `type_folder_of` resolves a real `<layer>/<type-folder>`.
2418        assert!(
2419            type_folder_of(&rel).is_some(),
2420            "shard_path_for produced a path the index cannot file: {rel_str}"
2421        );
2422
2423        write_doc(
2424            &wt,
2425            &rel_str,
2426            "wiki-page",
2427            Some("Renewal theme"),
2428            Some("2026-05-21T10:00:00Z"),
2429            "",
2430        );
2431        write_doc(
2432            &rb,
2433            &rel_str,
2434            "wiki-page",
2435            Some("Renewal theme"),
2436            Some("2026-05-21T10:00:00Z"),
2437            "",
2438        );
2439
2440        // (1) Loop path must NOT error (the old `wiki/<file>` shape returned
2441        // Err(Io(NotADirectory))).
2442        Index::on_write(&wt, &rel)
2443            .expect("on_write must succeed for a toolkit-computed wiki-page path");
2444        Index::rebuild_all(&rb).unwrap();
2445
2446        // (2) The page is present in the rebuilt catalog (the old flat-path bug
2447        // silently omitted it from every artifact). The individual page link
2448        // lives in the *type-folder* index; the *layer* index rolls the
2449        // type-folder up — assert both, since the bug erased both. `wiki-page`
2450        // is now an unrecognized type, so its canonical folder is the
2451        // records-layer fallback `records/wiki-page`.
2452        let page_link = wiki_target(&rel); // records/wiki-page/renewal-theme
2453        let tf_md = read(&rb, "records/wiki-page/index.md");
2454        assert!(
2455            tf_md.contains(&format!("[[{page_link}]]")),
2456            "type-folder index must list the page link, got:\n{tf_md}"
2457        );
2458        assert!(
2459            exists(&rb, "records/wiki-page/index.jsonl"),
2460            "type-folder jsonl must exist"
2461        );
2462        assert!(
2463            read(&rb, "records/wiki-page/index.jsonl").contains(&rel_str),
2464            "type-folder jsonl must contain the page row"
2465        );
2466        // The layer index rolls the type-folder up (proves the page's folder is
2467        // visible to the layer catalog, not dropped).
2468        let layer_md = read(&rb, "records/index.md");
2469        assert!(
2470            layer_md.contains("records/wiki-page/index"),
2471            "layer index must roll up the records/wiki-page type-folder, got:\n{layer_md}"
2472        );
2473
2474        // (3) Write-through equals rebuild byte-for-byte — loop and sweep agree.
2475        let a = snapshot_artifacts(&wt);
2476        let b = snapshot_artifacts(&rb);
2477        assert_eq!(
2478            a.keys().collect::<Vec<_>>(),
2479            b.keys().collect::<Vec<_>>(),
2480            "loop and sweep must produce the same artifact set"
2481        );
2482        for (k, v) in &a {
2483            assert_eq!(
2484                v, &b[k],
2485                "wiki-page artifact {k} differs between on_write and rebuild"
2486            );
2487        }
2488    }
2489
2490    #[test]
2491    fn on_remove_then_rebuild_match_and_pull_in_next_over_cap() {
2492        let (_d1, wt) = mk_store();
2493        let (_d2, rb) = mk_store();
2494        let total = MD_CAP + 3; // 503 files; removing one keeps md full at 500
2495        let mut all_rels = Vec::new();
2496        for i in 0..total {
2497            let rel = format!("sources/emails/2026/05/m-{i:04}.md");
2498            // `updated` strictly increasing across i by varying both minute and second
2499            let updated = format!("2026-05-10T00:{:02}:{:02}Z", i / 60, i % 60);
2500            write_doc(
2501                &wt,
2502                &rel,
2503                "email",
2504                Some(&format!("mail {i}")),
2505                Some(&updated),
2506                "",
2507            );
2508            write_doc(
2509                &rb,
2510                &rel,
2511                "email",
2512                Some(&format!("mail {i}")),
2513                Some(&updated),
2514                "",
2515            );
2516            all_rels.push(rel);
2517        }
2518        // Build write-through index, then remove the single newest file.
2519        Index::rebuild_all(&wt).unwrap();
2520        let newest = &all_rels[total - 1]; // highest i = newest updated
2521        fs::remove_file(wt.root.join(newest)).unwrap();
2522        Index::on_remove(&wt, Path::new(newest)).unwrap();
2523
2524        // Rebuild side: same end state (file physically absent).
2525        fs::remove_file(rb.root.join(newest)).unwrap();
2526        Index::rebuild_all(&rb).unwrap();
2527
2528        let a = snapshot_artifacts(&wt);
2529        let b = snapshot_artifacts(&rb);
2530        for (k, v) in &a {
2531            assert_eq!(v, &b[k], "after remove, artifact {k} drifted from rebuild");
2532        }
2533
2534        // The md must still hold exactly 500 entries (the 501st got pulled in)
2535        // and the removed file must be gone from both artifacts.
2536        let md = read(&wt, "sources/emails/index.md");
2537        assert_eq!(md.lines().filter(|l| l.starts_with("- [[")).count(), MD_CAP);
2538        // Removed (newest) file is gone from the bare-path md and the .md jsonl.
2539        assert!(
2540            !md.contains(&format!("[[{}]]", wiki_target(Path::new(newest)))),
2541            "removed file must not be listed in md"
2542        );
2543        // The file previously at rank 501 (excluded under the cap) is `all_rels[2]`
2544        // — `updated` increases with index, so newest-first rank 500 = index 2.
2545        // After dropping the newest it shifts into the visible 500.
2546        let pulled_in = &all_rels[2];
2547        assert!(
2548            md.contains(&format!("[[{}]]", wiki_target(Path::new(pulled_in)))),
2549            "the 501st-most-recent must be pulled into the browse view after a removal"
2550        );
2551        assert!(
2552            md.contains(&format!("This folder has {} files.", total - 1)),
2553            "footer count must decrement:\n{}",
2554            md.lines().rev().take(4).collect::<Vec<_>>().join("\n")
2555        );
2556        let jsonl = read(&wt, "sources/emails/index.jsonl");
2557        assert_eq!(
2558            jsonl.lines().count(),
2559            total - 1,
2560            "jsonl loses exactly the removed file"
2561        );
2562        assert!(
2563            !jsonl.contains(&path_to_unix(Path::new(newest))),
2564            "removed file must be gone from the jsonl too"
2565        );
2566    }
2567
2568    #[test]
2569    fn on_rename_cross_folder_matches_rebuild() {
2570        let (_d1, wt) = mk_store();
2571        let (_d2, rb) = mk_store();
2572        // Seed both stores identically.
2573        let seed: &[(&str, &str, &str, &str)] = &[
2574            (
2575                "records/contacts/a.md",
2576                "contact",
2577                "A",
2578                "2026-05-01T00:00:00Z",
2579            ),
2580            (
2581                "records/contacts/b.md",
2582                "contact",
2583                "B",
2584                "2026-05-02T00:00:00Z",
2585            ),
2586            (
2587                "records/companies/x.md",
2588                "company",
2589                "X",
2590                "2026-05-03T00:00:00Z",
2591            ),
2592        ];
2593        for (rel, t, s, u) in seed {
2594            write_doc(&wt, rel, t, Some(s), Some(u), "");
2595            write_doc(&rb, rel, t, Some(s), Some(u), "");
2596        }
2597        Index::rebuild_all(&wt).unwrap();
2598
2599        // Rename contacts/b.md -> companies/b.md (cross type-folder). The file's
2600        // `type` changes to match its new folder, as a real `dbmd rename` would.
2601        let old = "records/contacts/b.md";
2602        let new = "records/companies/b.md";
2603        fs::create_dir_all(wt.root.join("records/companies")).unwrap();
2604        fs::rename(wt.root.join(old), wt.root.join(new)).unwrap();
2605        // (type stays "contact" here; index copies frontmatter verbatim — the
2606        // test only asserts placement + parity with rebuild.)
2607        Index::on_rename(&wt, Path::new(old), Path::new(new)).unwrap();
2608
2609        // Rebuild side: same end state.
2610        fs::create_dir_all(rb.root.join("records/companies")).unwrap();
2611        fs::rename(rb.root.join(old), rb.root.join(new)).unwrap();
2612        Index::rebuild_all(&rb).unwrap();
2613
2614        let a = snapshot_artifacts(&wt);
2615        let b = snapshot_artifacts(&rb);
2616        assert_eq!(a.keys().collect::<Vec<_>>(), b.keys().collect::<Vec<_>>());
2617        for (k, v) in &a {
2618            assert_eq!(v, &b[k], "rename: artifact {k} drifted from rebuild");
2619        }
2620        // Concretely: b is gone from contacts, present in companies.
2621        let contacts = read(&wt, "records/contacts/index.md");
2622        assert!(!contacts.contains("records/contacts/b]]"));
2623        let companies = read(&wt, "records/companies/index.md");
2624        assert!(companies.contains("[[records/companies/b]]"));
2625    }
2626
2627    #[test]
2628    fn on_write_updates_existing_entry_in_place() {
2629        let (_d, store) = mk_store();
2630        write_doc(
2631            &store,
2632            "records/contacts/a.md",
2633            "contact",
2634            Some("Original"),
2635            Some("2026-05-01T00:00:00Z"),
2636            "",
2637        );
2638        Index::on_write(&store, Path::new("records/contacts/a.md")).unwrap();
2639        // Edit the same file: new summary + newer updated.
2640        write_doc(
2641            &store,
2642            "records/contacts/a.md",
2643            "contact",
2644            Some("Revised"),
2645            Some("2026-05-09T00:00:00Z"),
2646            "",
2647        );
2648        Index::on_write(&store, Path::new("records/contacts/a.md")).unwrap();
2649
2650        let jsonl = read(&store, "records/contacts/index.jsonl");
2651        assert_eq!(
2652            jsonl.lines().count(),
2653            1,
2654            "upsert must not duplicate the line"
2655        );
2656        assert!(jsonl.contains("Revised"), "jsonl must reflect the update");
2657        assert!(
2658            !jsonl.contains("Original"),
2659            "stale line must be gone (compacted)"
2660        );
2661        let md = read(&store, "records/contacts/index.md");
2662        assert!(md.contains("- [[records/contacts/a]] — Revised\n"));
2663        assert!(
2664            md.contains("updated: 2026-05-09T00:00:00Z\n"),
2665            "index updated must track the newer member"
2666        );
2667    }
2668
2669    // ── dry-run + cleanup ────────────────────────────────────────────────
2670
2671    #[test]
2672    fn dry_run_emits_separators_and_writes_nothing() {
2673        let (_d, store) = mk_store();
2674        write_doc(
2675            &store,
2676            "sources/emails/2026/05/a.md",
2677            "email",
2678            Some("Mail"),
2679            Some("2026-05-01T00:00:00Z"),
2680            "",
2681        );
2682        let out = Index::render_dry_run(&store, &IndexLevel::TypeFolder("sources/emails".into()))
2683            .unwrap();
2684        assert!(
2685            out.contains("--- sources/emails/index.md ---\n"),
2686            "md separator:\n{out}"
2687        );
2688        assert!(
2689            out.contains("--- sources/emails/index.jsonl ---\n"),
2690            "jsonl separator:\n{out}"
2691        );
2692        assert!(
2693            out.contains("- [[sources/emails/2026/05/a]] — Mail"),
2694            "md body present"
2695        );
2696        // Nothing was written to disk.
2697        assert!(
2698            !exists(&store, "sources/emails/index.md"),
2699            "dry-run must not write"
2700        );
2701        assert!(
2702            !exists(&store, "sources/emails/index.jsonl"),
2703            "dry-run must not write"
2704        );
2705    }
2706
2707    #[test]
2708    fn cleanup_removes_noncanonical_and_empty_indexes() {
2709        let (_d, store) = mk_store();
2710        write_doc(
2711            &store,
2712            "sources/emails/2026/05/a.md",
2713            "email",
2714            Some("Mail"),
2715            Some("2026-05-01T00:00:00Z"),
2716            "",
2717        );
2718        // A stray index inside a date-shard (non-canonical) ...
2719        fs::write(
2720            store.root.join("sources/emails/2026/05/index.md"),
2721            "stale\n",
2722        )
2723        .unwrap();
2724        fs::write(
2725            store.root.join("sources/emails/2026/05/index.jsonl"),
2726            "stale\n",
2727        )
2728        .unwrap();
2729        // ... and an index in an empty type-folder.
2730        fs::create_dir_all(store.root.join("records/empty")).unwrap();
2731        fs::write(store.root.join("records/empty/index.md"), "stale\n").unwrap();
2732
2733        Index::cleanup(&store).unwrap();
2734
2735        assert!(
2736            !exists(&store, "sources/emails/2026/05/index.md"),
2737            "shard index must be deleted"
2738        );
2739        assert!(
2740            !exists(&store, "sources/emails/2026/05/index.jsonl"),
2741            "shard jsonl must be deleted"
2742        );
2743        assert!(
2744            !exists(&store, "records/empty/index.md"),
2745            "empty-folder index must be deleted"
2746        );
2747        // The canonical type-folder file itself is untouched by cleanup.
2748        assert!(exists(&store, "sources/emails/2026/05/a.md"));
2749    }
2750
2751    #[test]
2752    fn rebuild_deletes_stale_indexes_for_emptied_folders() {
2753        let (_d, store) = mk_store();
2754        write_doc(
2755            &store,
2756            "records/contacts/a.md",
2757            "contact",
2758            Some("A"),
2759            Some("2026-05-01T00:00:00Z"),
2760            "",
2761        );
2762        Index::rebuild_all(&store).unwrap();
2763        assert!(exists(&store, "records/contacts/index.md"));
2764        assert!(exists(&store, "records/index.md"));
2765        assert!(exists(&store, "index.md"));
2766
2767        // Empty the folder entirely, then rebuild: all three levels vanish.
2768        fs::remove_file(store.root.join("records/contacts/a.md")).unwrap();
2769        Index::rebuild_all(&store).unwrap();
2770        assert!(
2771            !exists(&store, "records/contacts/index.md"),
2772            "emptied type-folder index gone"
2773        );
2774        assert!(
2775            !exists(&store, "records/index.md"),
2776            "now-empty layer index gone"
2777        );
2778        assert!(!exists(&store, "index.md"), "now-empty root index gone");
2779    }
2780
2781    // ── randomized parity (property-style) ───────────────────────────────
2782
2783    #[test]
2784    fn property_writethrough_equals_rebuild_under_mixed_ops() {
2785        // Deterministic pseudo-random op sequence (no rand crate): a small LCG.
2786        let (_d1, wt) = mk_store();
2787        let (_d2, rb) = mk_store();
2788        let mut seed: u64 = 0x9E3779B97F4A7C15;
2789        let mut next = || {
2790            seed = seed
2791                .wrapping_mul(6364136223846793005)
2792                .wrapping_add(1442695040888963407);
2793            (seed >> 33) as u32
2794        };
2795
2796        let folders = ["sources/emails", "records/contacts", "records/profiles"];
2797        let types = ["email", "contact", "profile"];
2798        let mut live: Vec<String> = Vec::new(); // store-relative paths that exist
2799
2800        for step in 0..120u32 {
2801            let r = next();
2802            let op = r % 10;
2803            if op < 6 || live.is_empty() {
2804                // CREATE/UPDATE
2805                let fi = (next() as usize) % folders.len();
2806                let folder = folders[fi];
2807                let id = next() % 40;
2808                let rel = if folder == "sources/emails" {
2809                    let month = 5 + (id % 2); // shard across two months
2810                    format!("{folder}/2026/{month:02}/f-{id:02}.md")
2811                } else {
2812                    format!("{folder}/f-{id:02}.md")
2813                };
2814                // recency varies with step so order is meaningful + total
2815                let updated = format!(
2816                    "2026-05-{:02}T{:02}:{:02}:00Z",
2817                    1 + (step % 27),
2818                    step % 24,
2819                    id % 60
2820                );
2821                let extra = if id % 3 == 0 {
2822                    "tags:\n  - x\n  - y\n"
2823                } else {
2824                    ""
2825                };
2826                write_doc(
2827                    &wt,
2828                    &rel,
2829                    types[fi],
2830                    Some(&format!("sum {step}")),
2831                    Some(&updated),
2832                    extra,
2833                );
2834                write_doc(
2835                    &rb,
2836                    &rel,
2837                    types[fi],
2838                    Some(&format!("sum {step}")),
2839                    Some(&updated),
2840                    extra,
2841                );
2842                Index::on_write(&wt, Path::new(&rel)).unwrap();
2843                if !live.contains(&rel) {
2844                    live.push(rel);
2845                }
2846            } else if op < 8 {
2847                // REMOVE a live file
2848                let idx = (next() as usize) % live.len();
2849                let rel = live.remove(idx);
2850                fs::remove_file(wt.root.join(&rel)).unwrap();
2851                fs::remove_file(rb.root.join(&rel)).ok();
2852                Index::on_remove(&wt, Path::new(&rel)).unwrap();
2853            } else {
2854                // RENAME a live file within the same layer (new id, maybe new type-folder)
2855                let idx = (next() as usize) % live.len();
2856                let old = live[idx].clone();
2857                // pick a destination folder in the same layer-ish set
2858                let fi = (next() as usize) % folders.len();
2859                let folder = folders[fi];
2860                let id = 50 + (next() % 40);
2861                let new = if folder == "sources/emails" {
2862                    format!("{folder}/2026/05/f-{id:02}.md")
2863                } else {
2864                    format!("{folder}/f-{id:02}.md")
2865                };
2866                if new == old || live.contains(&new) {
2867                    continue;
2868                }
2869                fs::create_dir_all(wt.root.join(&new).parent().unwrap()).unwrap();
2870                fs::create_dir_all(rb.root.join(&new).parent().unwrap()).unwrap();
2871                fs::rename(wt.root.join(&old), wt.root.join(&new)).unwrap();
2872                fs::rename(rb.root.join(&old), rb.root.join(&new)).unwrap();
2873                Index::on_rename(&wt, Path::new(&old), Path::new(&new)).unwrap();
2874                live[idx] = new;
2875            }
2876        }
2877
2878        // Now rebuild the rb side from the shared end state and compare.
2879        Index::rebuild_all(&rb).unwrap();
2880        let a = snapshot_artifacts(&wt);
2881        let b = snapshot_artifacts(&rb);
2882        assert_eq!(
2883            a.keys().collect::<BTreeSet<_>>(),
2884            b.keys().collect::<BTreeSet<_>>(),
2885            "write-through and rebuild must produce the same set of artifacts"
2886        );
2887        for (k, v) in &a {
2888            assert_eq!(
2889                v, &b[k],
2890                "INVARIANT VIOLATED: artifact {k} differs after mixed ops\n--- write-through ---\n{v}\n--- rebuild ---\n{}",
2891                b[k]
2892            );
2893        }
2894        assert!(
2895            !a.is_empty(),
2896            "the run must have produced at least one artifact"
2897        );
2898    }
2899
2900    // ── regressions: cleanup must not delete user content ─────────────────
2901
2902    /// CRITICAL regression: a user content file named `index.md` inside a date
2903    /// shard (e.g. from a website/doc-export mirror) must SURVIVE `cleanup` /
2904    /// `rebuild_all`. The old filename-only match silently deleted it.
2905    #[test]
2906    fn cleanup_preserves_user_content_named_index_md_in_shard() {
2907        let (_d, store) = mk_store();
2908        // A real content record that merely happens to be named index.md.
2909        write_doc(
2910            &store,
2911            "sources/emails/2026/06/index.md",
2912            "email",
2913            Some("Important imported mail"),
2914            Some("2026-06-11T04:23:25Z"),
2915            "",
2916        );
2917        Index::cleanup(&store).unwrap();
2918        assert!(
2919            exists(&store, "sources/emails/2026/06/index.md"),
2920            "cleanup must not delete a user content file named index.md"
2921        );
2922        // A full rebuild (which runs cleanup first) must also preserve it.
2923        Index::rebuild_all(&store).unwrap();
2924        assert!(
2925            exists(&store, "sources/emails/2026/06/index.md"),
2926            "rebuild_all must not delete a user content file named index.md"
2927        );
2928        let kept = read(&store, "sources/emails/2026/06/index.md");
2929        assert!(
2930            kept.contains("Important imported mail"),
2931            "the user's record content must be intact"
2932        );
2933    }
2934
2935    /// HIGH regression: `cleanup` uses `min_depth(2)`, so the canonical
2936    /// type-folder-root `index.md`/`index.jsonl` are NOT deleted up front. A
2937    /// genuine generated catalog at the type-folder root survives a cleanup pass
2938    /// (it is only ever rewritten, or removed when the folder is truly empty).
2939    #[test]
2940    fn cleanup_keeps_canonical_type_folder_root_sidecars() {
2941        let (_d, store) = mk_store();
2942        write_doc(
2943            &store,
2944            "records/contacts/alice.md",
2945            "contact",
2946            Some("Alice"),
2947            Some("2026-05-01T00:00:00Z"),
2948            "",
2949        );
2950        Index::write_level(&store, &IndexLevel::TypeFolder("records/contacts".into())).unwrap();
2951        assert!(exists(&store, "records/contacts/index.md"));
2952        assert!(exists(&store, "records/contacts/index.jsonl"));
2953        Index::cleanup(&store).unwrap();
2954        assert!(
2955            exists(&store, "records/contacts/index.md"),
2956            "cleanup must keep the canonical type-folder index.md (non-empty folder)"
2957        );
2958        assert!(
2959            exists(&store, "records/contacts/index.jsonl"),
2960            "cleanup must keep the canonical type-folder index.jsonl (non-empty folder)"
2961        );
2962    }
2963
2964    // ── regression: write-through must not catalog index artifacts ────────
2965
2966    /// HIGH regression: routing a generated `index.md` through `on_write` (as
2967    /// `dbmd fm set records/contacts/index.md …` would) must NOT insert a phantom
2968    /// self-row — counts and bytes stay equal to a rebuild.
2969    #[test]
2970    fn on_write_ignores_index_artifact_no_phantom_row() {
2971        let (_d, store) = mk_store();
2972        write_doc(
2973            &store,
2974            "records/contacts/alice.md",
2975            "contact",
2976            Some("Alice"),
2977            Some("2026-05-01T00:00:00Z"),
2978            "",
2979        );
2980        Index::on_write(&store, Path::new("records/contacts/alice.md")).unwrap();
2981        let jsonl_before = read(&store, "records/contacts/index.jsonl");
2982        assert_eq!(jsonl_before.lines().count(), 1);
2983
2984        // Tamper: route the catalog file itself through on_write.
2985        Index::on_write(&store, Path::new("records/contacts/index.md")).unwrap();
2986
2987        let jsonl_after = read(&store, "records/contacts/index.jsonl");
2988        assert_eq!(
2989            jsonl_after.lines().count(),
2990            1,
2991            "on_write on index.md must not add a phantom self-row"
2992        );
2993        assert!(
2994            !jsonl_after.contains("\"type\":\"index\""),
2995            "the catalog artifact must never appear as a catalogued row"
2996        );
2997        // Root rollup count stays 1 (not inflated to 2).
2998        let root = read(&store, "index.md");
2999        assert!(
3000            root.contains("[[records/contacts/index|Contacts]] (1)"),
3001            "count must not inflate:\n{root}"
3002        );
3003    }
3004
3005    // ── regression: multi-line summary cannot inject a catalog line ───────
3006
3007    /// HIGH regression: a block-scalar summary spanning multiple lines must be
3008    /// collapsed to one line in the browse entry, so it cannot forge a standalone
3009    /// `- [[…]]` catalog line.
3010    #[test]
3011    fn multiline_summary_is_single_lined_in_index_md() {
3012        let (_d, store) = mk_store();
3013        // A YAML block scalar whose value embeds a forged-looking entry line.
3014        write_raw(
3015            &store,
3016            "records/notes/evil.md",
3017            "type: note\nupdated: 2026-06-10T00:00:00Z\nsummary: |-\n  legit first line\n  - [[records/secrets/fake|Click me]] — injected entry",
3018            "\nbody\n",
3019        );
3020        let idx = Index::build_type_folder(&store, Path::new("records/notes")).unwrap();
3021        let md = idx.to_markdown();
3022        // Exactly one browse entry line, and no embedded newline forging a second.
3023        let entry_lines = md.lines().filter(|l| l.starts_with("- [[")).count();
3024        assert_eq!(
3025            entry_lines, 1,
3026            "a multi-line summary must not produce extra entry lines:\n{md}"
3027        );
3028        assert!(
3029            md.contains(
3030                "- [[records/notes/evil]] — legit first line - [[records/secrets/fake|Click me]] — injected entry\n"
3031            ),
3032            "summary newlines must collapse to spaces inline:\n{md}"
3033        );
3034    }
3035
3036    // ── regression: writer/validator scalar coercion agreement ────────────
3037
3038    /// HIGH regression: an unquoted non-string scalar `summary`/`type`
3039    /// (`summary: 2026`, `type: true`) must be coerced to a string by the index
3040    /// writer exactly as `validate::scalar_string` does — so the index entry holds
3041    /// the real value (`2026`), not the `(no summary)` placeholder that produced a
3042    /// permanently-unfixable INDEX_SUMMARY_MISMATCH.
3043    #[test]
3044    fn non_string_scalar_summary_and_type_are_coerced_like_validator() {
3045        let (_d, store) = mk_store();
3046        write_raw(
3047            &store,
3048            "records/contacts/a.md",
3049            "type: contact\nupdated: 2026-05-01T00:00:00Z\nsummary: 2026",
3050            "\nbody\n",
3051        );
3052        let rec = record_from_file(
3053            &store.root.join("records/contacts/a.md"),
3054            PathBuf::from("records/contacts/a.md"),
3055        )
3056        .unwrap();
3057        // `summary: 2026` (YAML number) coerces to the string "2026", matching
3058        // the validator's `scalar_string` (Number -> n.to_string()).
3059        assert_eq!(rec.summary, "2026");
3060        assert_eq!(rec.type_, "contact");
3061
3062        // And the rendered index entry quotes the real value, not the placeholder.
3063        let idx = Index::build_type_folder(&store, Path::new("records/contacts")).unwrap();
3064        let md = idx.to_markdown();
3065        assert!(
3066            md.contains("- [[records/contacts/a]] — 2026\n"),
3067            "index entry must hold the coerced scalar, not the placeholder:\n{md}"
3068        );
3069
3070        // A boolean scalar type coerces to "true" (mirrors scalar_string(Bool)).
3071        write_raw(
3072            &store,
3073            "records/contacts/b.md",
3074            "type: true\nupdated: 2026-05-02T00:00:00Z\nsummary: hi",
3075            "\nbody\n",
3076        );
3077        let rec_b = record_from_file(
3078            &store.root.join("records/contacts/b.md"),
3079            PathBuf::from("records/contacts/b.md"),
3080        )
3081        .unwrap();
3082        assert_eq!(rec_b.type_, "true");
3083    }
3084
3085    // ── regression: non-UTF-8 body must not abort the projection ──────────
3086
3087    /// HIGH regression: a content file with valid-UTF-8 frontmatter but a
3088    /// non-UTF-8 byte in the BODY (a verbatim Latin-1 `sources/` import) must
3089    /// still project to an IndexRecord — `record_from_file` reads frontmatter
3090    /// without requiring the whole file to be UTF-8, so a stray byte can't abort
3091    /// `rebuild_all` / write-through for the entire store.
3092    #[test]
3093    fn non_utf8_body_does_not_abort_record_projection() {
3094        let (_d, store) = mk_store();
3095        let rel = "sources/emails/2026/06/x.md";
3096        let abs = store.root.join(rel);
3097        fs::create_dir_all(abs.parent().unwrap()).unwrap();
3098        // Valid-UTF-8 frontmatter; a raw 0xE9 (Latin-1 'é') in the body.
3099        let mut bytes: Vec<u8> =
3100            b"---\ntype: email\nupdated: 2026-06-11T00:00:00Z\nsummary: An imported email\n---\n\nCaf"
3101                .to_vec();
3102        bytes.push(0xE9);
3103        bytes.extend_from_slice(b" meeting notes\n");
3104        fs::write(&abs, bytes).unwrap();
3105
3106        let rec = record_from_file(&abs, PathBuf::from(rel))
3107            .expect("non-UTF-8 body must not abort the frontmatter read");
3108        assert_eq!(rec.summary, "An imported email");
3109        assert_eq!(rec.type_, "email");
3110
3111        // The full sweep indexes the folder rather than aborting the whole store.
3112        Index::rebuild_all(&store).unwrap();
3113        assert!(
3114            exists(&store, "sources/emails/index.jsonl"),
3115            "rebuild must produce the catalog despite a non-UTF-8 body byte"
3116        );
3117        assert!(
3118            read(&store, "sources/emails/index.jsonl").contains("An imported email"),
3119            "the record must be catalogued"
3120        );
3121    }
3122
3123    /// HIGH regression: a single malformed-YAML file must abort the rebuild
3124    /// loudly (not be silently skipped) — skipping it would leave the store in a
3125    /// permanently invalid state (`INDEX_MISSING_ENTRY` / `INDEX_JSONL_DESYNC`
3126    /// that no rebuild clears, since the validator enumerates members by
3127    /// filename, not by parseability) and would desync the rollups. The abort is
3128    /// safe because `cleanup` preserves the prior canonical catalogs
3129    /// (`min_depth(2)`), so an aborted rebuild leaves the existing sidecars
3130    /// intact and surfaces a clear error naming the file to fix.
3131    #[test]
3132    fn rebuild_aborts_on_malformed_file_and_keeps_prior_catalogs() {
3133        let (_d, store) = mk_store();
3134        write_doc(
3135            &store,
3136            "records/contacts/alice.md",
3137            "contact",
3138            Some("Alice"),
3139            Some("2026-05-01T00:00:00Z"),
3140            "",
3141        );
3142        write_doc(
3143            &store,
3144            "records/companies/acme.md",
3145            "company",
3146            Some("Acme"),
3147            Some("2026-05-02T00:00:00Z"),
3148            "",
3149        );
3150
3151        // A clean first rebuild establishes the canonical catalogs.
3152        Index::rebuild_all(&store).expect("clean rebuild succeeds");
3153        assert!(exists(&store, "records/contacts/index.jsonl"));
3154        assert!(exists(&store, "records/companies/index.jsonl"));
3155
3156        // Routine malformed file: unterminated quoted scalar.
3157        let bad = store.root.join("records/contacts/broken.md");
3158        fs::write(
3159            &bad,
3160            "---\ntype: contact\nsummary: \"unterminated\n---\nbody\n",
3161        )
3162        .unwrap();
3163
3164        // Must abort loudly — a silent skip leaves a file the validator requires
3165        // to be catalogued out of the index forever.
3166        Index::rebuild_all(&store)
3167            .expect_err("rebuild must abort, not silently skip, on a malformed file");
3168
3169        // The prior canonical catalogs survive the aborted rebuild: `cleanup`'s
3170        // `min_depth(2)` never deletes a type-folder's root-level sidecars, so a
3171        // mid-sweep abort leaves the existing indexes intact rather than wiped.
3172        assert!(
3173            exists(&store, "records/companies/index.jsonl"),
3174            "an aborted rebuild must not destroy a clean sibling folder's catalog"
3175        );
3176        assert!(
3177            exists(&store, "records/contacts/index.jsonl"),
3178            "an aborted rebuild must not destroy the affected folder's prior catalog"
3179        );
3180        let contacts_jsonl = read(&store, "records/contacts/index.jsonl");
3181        assert!(contacts_jsonl.contains("records/contacts/alice.md"));
3182    }
3183
3184    /// HIGH regression (problem B): `rebuild_all`'s rollup `(N)` counts must
3185    /// equal the catalogued `index.jsonl` record counts — never a raw `.md` walk
3186    /// that disagrees with the sidecar. The over-corrected skip-with-diagnostic
3187    /// build excluded a malformed file from `index.jsonl` while `build_layer` /
3188    /// `build_root` kept counting it via `walk_type_folder_files`, so a folder
3189    /// would show `Contacts (2)` in the root/layer rollups while its `index.jsonl`
3190    /// held only 1 record — and a single subsequent write-through (which derives
3191    /// `(N)` from the jsonl) rewrote it to `Contacts (1)`, making `rebuild_all`
3192    /// and write-through emit different bytes for the same state. With the loud
3193    /// abort, the only successful-rebuild states are fully consistent: every
3194    /// rollup `(N)` equals the catalogued record count AND equals what a
3195    /// write-through over the same files produces.
3196    #[test]
3197    fn rebuild_rollup_counts_equal_jsonl_records_and_write_through() {
3198        let (_d, store) = mk_store();
3199        // Two well-formed contacts: the rollups must read (2), matching the two
3200        // jsonl records — this is the count the skip-version inflated to a phantom
3201        // extra when a malformed sibling was present-but-uncatalogued.
3202        write_doc(
3203            &store,
3204            "records/contacts/alice.md",
3205            "contact",
3206            Some("Alice"),
3207            Some("2026-05-01T00:00:00Z"),
3208            "",
3209        );
3210        write_doc(
3211            &store,
3212            "records/contacts/bob.md",
3213            "contact",
3214            Some("Bob"),
3215            Some("2026-05-02T00:00:00Z"),
3216            "",
3217        );
3218        Index::rebuild_all(&store).expect("clean rebuild succeeds");
3219
3220        // The catalogued record set (index.jsonl) and the rollup (N) must agree.
3221        let jsonl_lines = read(&store, "records/contacts/index.jsonl")
3222            .lines()
3223            .filter(|l| !l.trim().is_empty())
3224            .count();
3225        assert_eq!(jsonl_lines, 2, "two well-formed files ⇒ two jsonl records");
3226        let layer_md = read(&store, "records/index.md");
3227        let root_md = read(&store, "index.md");
3228        assert!(
3229            layer_md.contains("- [[records/contacts/index|Contacts]] (2)"),
3230            "layer rollup (N) must equal the jsonl record count (2), not a raw .md walk:\n{layer_md}"
3231        );
3232        assert!(
3233            root_md.contains("- [[records/contacts/index|Contacts]] (2)\n")
3234                && root_md.contains("## Records (2)"),
3235            "root rollup (N)/layer total must equal the jsonl record count (2):\n{root_md}"
3236        );
3237
3238        // The decisive write-through == rebuild_all byte-identity check on the
3239        // SAME end state: a single on_write must not rewrite the rollups to a
3240        // different (N). Under the skip-version, rebuild_all's rollup walked the
3241        // raw .md tree while on_write derived (N) from the jsonl, so the two
3242        // diverged; the loud abort keeps both deriving (N) from the catalogued
3243        // records, so the bytes match exactly.
3244        let (_d2, wt) = mk_store();
3245        write_doc(
3246            &wt,
3247            "records/contacts/alice.md",
3248            "contact",
3249            Some("Alice"),
3250            Some("2026-05-01T00:00:00Z"),
3251            "",
3252        );
3253        write_doc(
3254            &wt,
3255            "records/contacts/bob.md",
3256            "contact",
3257            Some("Bob"),
3258            Some("2026-05-02T00:00:00Z"),
3259            "",
3260        );
3261        Index::on_write(&wt, Path::new("records/contacts/alice.md")).unwrap();
3262        Index::on_write(&wt, Path::new("records/contacts/bob.md")).unwrap();
3263
3264        let a = snapshot_artifacts(&wt);
3265        let b = snapshot_artifacts(&store);
3266        assert_eq!(
3267            a.keys().collect::<BTreeSet<_>>(),
3268            b.keys().collect::<BTreeSet<_>>(),
3269            "write-through and rebuild_all must produce the same artifact set"
3270        );
3271        for (k, v) in &a {
3272            assert_eq!(
3273                v, &b[k],
3274                "rollup bytes diverged between write-through and rebuild_all for {k} \
3275                 (a skip-version inflates rebuild_all's (N) above the jsonl record \
3276                 count, which write-through then rewrites):\n--- write-through ---\n{v}\n--- rebuild ---\n{}",
3277                b[k]
3278            );
3279        }
3280    }
3281
3282    /// MEDIUM regression: a non-UTF-8 path component must be lossily decoded
3283    /// (kept, with U+FFFD), not silently dropped — so the index key points at the
3284    /// file, not its parent directory. Unix-only (ext4 allows the filename; APFS
3285    /// rejects it at the VFS layer).
3286    #[cfg(unix)]
3287    #[test]
3288    fn non_utf8_path_component_is_kept_not_dropped() {
3289        use std::ffi::OsStr;
3290        use std::os::unix::ffi::OsStrExt;
3291        // sources/emails/caf\xE9.md — the leaf has a non-UTF-8 byte.
3292        let mut leaf = b"caf".to_vec();
3293        leaf.push(0xE9);
3294        leaf.extend_from_slice(b".md");
3295        let p = Path::new("sources/emails").join(OsStr::from_bytes(&leaf));
3296        let unix = path_to_unix(&p);
3297        // The leaf is preserved (lossy), so the path is NOT collapsed to the
3298        // parent directory "sources/emails".
3299        assert_ne!(
3300            unix, "sources/emails",
3301            "non-UTF-8 leaf must not be dropped, collapsing the path to its parent dir"
3302        );
3303        assert!(
3304            unix.starts_with("sources/emails/caf"),
3305            "the lossy leaf must remain under its folder: {unix}"
3306        );
3307    }
3308}