dbmd_core/
parser.rs

1//! `parser` — read and write db.md markdown files.
2//!
3//! Parses the YAML frontmatter block, the markdown body, wiki-links, standard
4//! markdown links, `##` sections, and the structured sections of the `DB.md`
5//! config file. Also the atomic writer that round-trips a file while
6//! preserving the operator-edited body verbatim and emitting frontmatter in
7//! canonical key order.
8//!
9//! Strict on required fields, lenient on unknowns: any frontmatter key the
10//! spec doesn't recognize is preserved in [`Frontmatter::extra`] as ambient
11//! context and round-tripped untouched.
12
13use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use chrono::{DateTime, FixedOffset};
17use serde_norway::{Mapping, Value};
18
/// The two canonical layer folder names. A path is "content" / a wiki-link is
/// "full-path" only when it resolves under one of these.
///
/// [`Frontmatter::is_content_file`] compares these names against individual
/// path components, so the match is exact (whole component, case-sensitive).
const LAYER_DIRS: [&str; 2] = ["sources", "records"];
22
/// Errors produced while parsing a markdown file or the `DB.md` config.
///
/// Every variant except [`ParseError::Io`] records the path of the offending
/// file so the rendered message can name it.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
    /// The frontmatter block was not valid YAML. Maps to validate code
    /// `FM_MALFORMED_YAML`.
    ///
    /// [`Frontmatter::parse`] also returns this when the block is valid YAML
    /// but its top level is not a mapping (a bare scalar or sequence).
    #[error("malformed YAML frontmatter in {file}: {source}")]
    MalformedYaml {
        /// The file whose frontmatter failed to parse.
        file: PathBuf,
        /// The underlying YAML error.
        source: serde_norway::Error,
    },

    /// The file has no `---`-delimited frontmatter block at its very start.
    #[error("missing frontmatter block in {file}")]
    MissingFrontmatter {
        /// The offending file.
        file: PathBuf,
    },

    /// A required field was absent. Maps to validate code `FM_MISSING_TYPE`
    /// (for `type`) and the per-type required-field codes.
    #[error("missing required field '{key}' in {file}")]
    MissingField {
        /// The file missing the field.
        file: PathBuf,
        /// The required key.
        key: String,
    },

    /// A timestamp field was not ISO-8601 / RFC3339. Maps to `FM_BAD_TIMESTAMP`.
    #[error("bad timestamp in field '{key}' of {file}: {value}")]
    BadTimestamp {
        /// The file.
        file: PathBuf,
        /// The frontmatter key.
        key: String,
        /// The unparseable value.
        value: String,
    },

    /// An I/O error reading the file. `#[from]` lets `?` convert a
    /// [`std::io::Error`] directly; `transparent` forwards its message as-is.
    #[error(transparent)]
    Io(#[from] std::io::Error),
}
68
/// The parsed YAML frontmatter of a db.md file.
///
/// The universal-contract fields are typed accessors; everything else lands in
/// [`extra`](Frontmatter::extra) as ambient context (unknown-field passthrough)
/// and is round-tripped verbatim. The atomic writer re-emits keys in canonical
/// order: `type`, `meta-type`, `id`, `created`, `updated`, `summary` first,
/// then the type-specific fields held in `extra`, then `status` / `tags`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Frontmatter {
    /// `type` — required on content files; the primary query key.
    pub type_: Option<String>,
    /// `meta-type` — records-only; the epistemic class `fact`/`operational`/
    /// `conclusion`. Absent ⇒ `fact` (the effective default is applied by the
    /// index/query layer for record-layer files; sources carry none).
    pub meta_type: Option<String>,
    /// `id` — optional; derived from the file path when absent.
    pub id: Option<String>,
    /// `created` — RFC3339; required and auto-set on content-file create.
    pub created: Option<DateTime<FixedOffset>>,
    /// `updated` — RFC3339; required and auto-maintained on content files.
    pub updated: Option<DateTime<FixedOffset>>,
    /// `summary` — the one-line catalog line; required on every content file.
    pub summary: Option<String>,
    /// `status` — optional lifecycle state.
    pub status: Option<String>,
    /// `tags` — optional flat list of short scalar labels.
    pub tags: Vec<String>,
    /// All other frontmatter keys (type-specific + custom), with their values
    /// preserved verbatim. Being a `BTreeMap`, iteration (and therefore
    /// re-emission) is in sorted key order, not source order. A universal key
    /// whose value could not be coerced to its typed field is also kept here
    /// under its own name. Wiki-link-valued fields keep their raw YAML form
    /// here; [`Frontmatter::link_fields`] surfaces them as [`WikiLink`]s.
    pub extra: BTreeMap<String, Value>,
}
102
/// Reports whether `s` contains an unbroken run of `min` or more ASCII digits.
///
/// Used as a cheap pre-filter by [`quote_oversized_integers`]: an out-of-range
/// literal needs many consecutive digits (`i64::MAX` has 19, `u64::MAX` has
/// 20), so text without such a run can be skipped entirely. At least one digit
/// must be present for a match, even when `min` is 0.
fn has_long_digit_run(s: &str, min: usize) -> bool {
    s.as_bytes()
        // Every non-digit byte is a separator, so each piece is one digit run
        // (possibly empty when separators are adjacent).
        .split(|byte| !byte.is_ascii_digit())
        .any(|run| !run.is_empty() && run.len() >= min)
}
120
/// Decides whether `s` is a bare decimal integer literal too large (or too
/// negative) for the `i64`/`u64` range that `serde_norway` represents
/// losslessly — the literals it either rejects (`(u64::MAX, u128::MAX]`) or
/// silently degrades to `f64` (`> u128::MAX`).
///
/// Surrounding whitespace, one leading sign and `_` digit separators are
/// tolerated. Only canonical decimals qualify: a multi-digit literal with a
/// leading zero (`007`) is version-ambiguous (octal vs int vs string) and is
/// never reported, and neither is anything containing non-digit characters.
fn is_oversized_int_literal(s: &str) -> bool {
    let text = s.trim();
    let (negative, unsigned) = if let Some(rest) = text.strip_prefix('-') {
        (true, rest)
    } else {
        (false, text.strip_prefix('+').unwrap_or(text))
    };

    // Anything other than digits and `_` separators disqualifies the literal.
    if !unsigned.bytes().all(|b| b == b'_' || b.is_ascii_digit()) {
        return false;
    }
    let digits: String = unsigned.chars().filter(|c| *c != '_').collect();

    // No digits at all: empty input, a lone sign, or nothing but underscores.
    if digits.is_empty() {
        return false;
    }
    // Leading-zero decimals are left alone.
    if digits.len() > 1 && digits.starts_with('0') {
        return false;
    }

    // `digits` is pure ASCII digits here, so a parse failure can only mean
    // the value is out of range for the target type.
    if negative {
        format!("-{digits}").parse::<i64>().is_err()
    } else {
        digits.parse::<u64>().is_err()
    }
}
158
/// Byte index where the scalar VALUE begins on a simple block line
/// (`key: <value>`, `- <value>`, or `- key: <value>`), or `None` when the line
/// bears no inline value (a bare `key:` / lone `-` / indent-only line).
///
/// The key/value boundary is the first `:` that is followed by a space or tab
/// (or that ends the line). A `:` followed by any other character is part of
/// the surrounding text — as in `a:b`, `12:30` or `http://…` — and is skipped,
/// so `a:b: 5` reports the offset of `5` rather than treating the whole line
/// as one scalar. When no such boundary exists the remainder of the line
/// (after indentation and any `- ` markers) is itself the value.
///
/// The returned index is an offset into `content`.
fn scalar_value_start(content: &str) -> Option<usize> {
    let mut base = content.len() - content.trim_start().len();
    let mut rest = &content[base..];

    // Consume leading `- ` block-sequence markers (possibly nested: `- - x`),
    // together with any extra spaces that follow each marker.
    while let Some(after_marker) = rest.strip_prefix("- ") {
        let item = after_marker.trim_start_matches(' ');
        base += rest.len() - item.len();
        rest = item;
    }
    if rest.is_empty() || rest == "-" {
        return None;
    }

    // Walk the colons left to right until one actually separates key and value.
    let mut from = 0usize;
    while let Some(offset) = rest[from..].find(':') {
        let colon = from + offset;
        let after = &rest[colon + 1..];
        if after.is_empty() {
            return None; // `key:` with the value on following (block) lines
        }
        if after.starts_with([' ', '\t']) {
            let value = after.trim_start_matches([' ', '\t']);
            return Some(base + colon + 1 + (after.len() - value.len()));
        }
        // `:` glued to the next character: not a separator, keep looking.
        from = colon + 1;
    }

    // A bare sequence-item scalar: the value is the whole remainder.
    Some(base)
}
189
190/// True if `content` introduces a YAML block scalar (`key: |`, `- >2`, …): the
191/// value region begins with a `|` or `>` indicator. Its body must be skipped by
192/// [`quote_oversized_integers`] so a digit line inside literal text is untouched.
193fn introduces_block_scalar(content: &str) -> bool {
194    match scalar_value_start(content) {
195        Some(start) => {
196            let v = content[start..].trim_start();
197            v.starts_with('|') || v.starts_with('>')
198        }
199        None => false,
200    }
201}
202
203/// Quote an oversized bare-integer value on a single block line, returning the
204/// rewritten line, or `None` if the line carries no such value.
205///
206/// Handles two value shapes:
207/// - a bare scalar value (`key: <int>`, `- <int>`, `- key: <int>`), and
208/// - a single-line flow collection value (`key: [ … ]` / `key: { … }`) holding
209///   one or more oversized integer literals (possibly mixed with in-range ints,
210///   strings, and nested flow collections) — see [`quote_oversized_ints_in_flow`].
211///
212/// In both cases only the offending integer scalar(s) are single-quoted; every
213/// other byte is preserved exactly.
214fn quote_int_value_in_line(content: &str) -> Option<String> {
215    let value_start = scalar_value_start(content)?;
216    let region = &content[value_start..];
217    let value = region.trim_end();
218
219    // Single-line flow collection: scan inside it for oversized int literals.
220    // (A bare scalar never starts with `[`/`{`, so these arms are disjoint.)
221    if value.starts_with('[') || value.starts_with('{') {
222        let trailing = &region[value.len()..];
223        let rewritten = quote_oversized_ints_in_flow(value)?;
224        return Some(format!(
225            "{}{}{}",
226            &content[..value_start],
227            rewritten,
228            trailing
229        ));
230    }
231
232    if !is_oversized_int_literal(value) {
233        return None;
234    }
235    // A pure digit literal contains no `'`, so single-quoting needs no escaping.
236    let trailing = &region[value.len()..];
237    Some(format!(
238        "{}'{}'{}",
239        &content[..value_start],
240        value,
241        trailing
242    ))
243}
244
245/// Scan a single-line YAML flow collection (`[ … ]` / `{ … }`) and single-quote
246/// each oversized bare-integer literal it contains, returning the rewritten flow
247/// text, or `None` when it holds no such literal (so the caller can leave the
248/// line untouched and `changed` stays false for an unaffected file).
249///
250/// The flow grammar is tokenized by its structural characters — `[ ] { } , :` —
251/// at the top level: text between two structural characters (and outside any
252/// single/double quoted scalar) is one plain scalar. A plain scalar whose trimmed
253/// form is an oversized canonical decimal integer (per [`is_oversized_int_literal`])
254/// is wrapped in single quotes; everything else — in-range ints, quoted strings,
255/// floats, booleans, nested collections, the structural punctuation and all
256/// surrounding whitespace — is emitted verbatim. Nested collections and multiple
257/// literals on one line are handled by the same single left-to-right pass.
258fn quote_oversized_ints_in_flow(flow: &str) -> Option<String> {
259    let mut out = String::with_capacity(flow.len() + 2);
260    let mut changed = false;
261    // Byte offset where the current plain-scalar token began (None ⇒ not inside
262    // a plain scalar, e.g. just after a structural char or inside a quote).
263    let mut scalar_start: Option<usize> = None;
264    let bytes = flow.as_bytes();
265    let mut i = 0usize;
266
267    // Flush the plain scalar spanning `[start, end)`: quote it iff it is an
268    // oversized integer literal, otherwise copy it through verbatim.
269    fn flush(out: &mut String, flow: &str, start: usize, end: usize, changed: &mut bool) {
270        let raw = &flow[start..end];
271        let trimmed = raw.trim();
272        if !trimmed.is_empty() && is_oversized_int_literal(trimmed) {
273            // Preserve the token's incidental leading/trailing whitespace; only
274            // the literal itself is quoted. A pure-digit literal contains no `'`,
275            // so single-quoting needs no escaping.
276            let lead = &raw[..raw.len() - raw.trim_start().len()];
277            let tail = &raw[raw.trim_end().len()..];
278            out.push_str(lead);
279            out.push('\'');
280            out.push_str(trimmed);
281            out.push('\'');
282            out.push_str(tail);
283            *changed = true;
284        } else {
285            out.push_str(raw);
286        }
287    }
288
289    while i < bytes.len() {
290        let b = bytes[i];
291        match b {
292            // Quoted scalars: copy through verbatim, skipping their contents so a
293            // structural char or digit run inside a string is never reinterpreted.
294            b'\'' | b'"' => {
295                if let Some(start) = scalar_start.take() {
296                    flush(&mut out, flow, start, i, &mut changed);
297                }
298                let quote = b;
299                let str_start = i;
300                i += 1;
301                while i < bytes.len() {
302                    if bytes[i] == quote {
303                        // A doubled single-quote (`''`) is an escaped quote inside
304                        // a single-quoted YAML scalar, not the closing delimiter.
305                        if quote == b'\'' && i + 1 < bytes.len() && bytes[i + 1] == b'\'' {
306                            i += 2;
307                            continue;
308                        }
309                        // A backslash-escaped quote inside a double-quoted scalar
310                        // does not close it.
311                        if quote == b'"' && bytes[i - 1] == b'\\' {
312                            i += 1;
313                            continue;
314                        }
315                        i += 1;
316                        break;
317                    }
318                    i += 1;
319                }
320                out.push_str(&flow[str_start..i]);
321            }
322            // Structural characters end the current plain scalar and are copied
323            // through. `:` separates a flow-mapping key from its value; `,`
324            // separates entries; brackets/braces open or close a (possibly
325            // nested) collection.
326            b'[' | b']' | b'{' | b'}' | b',' | b':' => {
327                if let Some(start) = scalar_start.take() {
328                    flush(&mut out, flow, start, i, &mut changed);
329                }
330                out.push(b as char);
331                i += 1;
332            }
333            _ => {
334                if scalar_start.is_none() {
335                    scalar_start = Some(i);
336                }
337                i += 1;
338            }
339        }
340    }
341    if let Some(start) = scalar_start.take() {
342        flush(&mut out, flow, start, bytes.len(), &mut changed);
343    }
344
345    if changed {
346        Some(out)
347    } else {
348        None
349    }
350}
351
352/// Pre-quote bare integer literals beyond the `i64`/`u64` range so they parse as
353/// STRING scalars and round-trip verbatim.
354///
355/// `serde_norway` (no arbitrary-precision) cannot represent such an integer: it
356/// rejects `(u64::MAX, u128::MAX]` as a hard parse error and silently truncates
357/// `> u128::MAX` to `f64` (`999…9` → `1e39` on the next re-emit) — corrupting an
358/// imported numeric ID and breaking the SPEC guarantee that unknown fields
359/// round-trip byte-for-byte. Quoting them up front makes them string-valued (the
360/// type narrows from number to string, but no data is destroyed).
361///
362/// Conservative: only a canonical decimal integer beyond `i64`/`u64` is quoted —
363/// whether it appears as a bare value (`key: <int>` / `- <int>` / `- key: <int>`)
364/// or as a scalar inside a single-line flow collection (`key: [ … ]` /
365/// `key: { … }`, including nested collections and mixed/multiple literals); block
366/// scalars are tracked and never touched; anything already in range, quoted, or
367/// not a bare integer is left exactly as written.
368fn quote_oversized_integers(yaml: &str) -> std::borrow::Cow<'_, str> {
369    if !has_long_digit_run(yaml, 19) {
370        return std::borrow::Cow::Borrowed(yaml);
371    }
372    let mut out = String::with_capacity(yaml.len());
373    let mut changed = false;
374    let mut block_indent: Option<usize> = None;
375    for line in yaml.split_inclusive('\n') {
376        let content = line.trim_end_matches(['\r', '\n']);
377        let term = &line[content.len()..];
378        let indent = content.len() - content.trim_start().len();
379
380        // Inside a block scalar: emit verbatim until a non-blank line dedents to
381        // at or before the introducer's key indent.
382        if let Some(key_indent) = block_indent {
383            if content.trim().is_empty() || indent > key_indent {
384                out.push_str(line);
385                continue;
386            }
387            block_indent = None; // block ended; process this line normally
388        }
389        if introduces_block_scalar(content) {
390            block_indent = Some(indent);
391            out.push_str(line);
392            continue;
393        }
394        match quote_int_value_in_line(content) {
395            Some(rewritten) => {
396                out.push_str(&rewritten);
397                out.push_str(term);
398                changed = true;
399            }
400            None => out.push_str(line),
401        }
402    }
403    if changed {
404        std::borrow::Cow::Owned(out)
405    } else {
406        std::borrow::Cow::Borrowed(yaml)
407    }
408}
409
410impl Frontmatter {
    /// Parse a YAML frontmatter block (the text between the opening and closing
    /// `---` fences, exclusive) into a [`Frontmatter`].
    ///
    /// Lenient on unknown keys (they go to [`extra`](Frontmatter::extra)). An
    /// empty or whitespace-only block, and a YAML `null` document, both yield
    /// an empty `Frontmatter`. `file` is only used to label errors.
    ///
    /// # Errors
    ///
    /// - [`ParseError::MalformedYaml`] when the block is not valid YAML, or
    ///   when its top level cannot be read as a mapping (a bare scalar or a
    ///   sequence).
    /// - Whatever `parse_timestamp` returns for a `created` / `updated` value
    ///   it rejects; that helper is defined elsewhere — presumably
    ///   [`ParseError::BadTimestamp`], confirm there.
    pub fn parse(yaml: &str, file: &Path) -> Result<Self, ParseError> {
        // An empty (or whitespace-only) frontmatter block is a valid, empty
        // mapping — not a YAML error.
        let value: Value = if yaml.trim().is_empty() {
            Value::Mapping(Mapping::new())
        } else {
            // Preserve integer literals beyond i64/u64 range: serde_norway would
            // otherwise reject `(u64,u128]` or silently truncate `>u128` to f64,
            // corrupting imported numeric IDs. Quoting them up front makes them
            // round-trip verbatim as strings.
            let prepared = quote_oversized_integers(yaml);
            serde_norway::from_str(&prepared).map_err(|source| ParseError::MalformedYaml {
                file: file.to_path_buf(),
                source,
            })?
        };

        // Top-level frontmatter must be a mapping. A scalar or sequence at the
        // top level is malformed for our purposes; surface it as such.
        let map = match value {
            Value::Mapping(m) => m,
            Value::Null => Mapping::new(),
            other => {
                // serde_norway::Error has no public constructor, so let the
                // deserializer decide: a value that coerces to a Mapping (e.g. a
                // YAML-tagged mapping `!tag\n k: v`, where the tag is ambient) is
                // accepted as that mapping; a genuine scalar or sequence top
                // level fails to coerce and IS the malformed case. (Using a
                // match here, not `expect_err`, avoids a panic on the
                // tagged-mapping case, which deserializes to a Mapping just
                // fine.)
                match serde_norway::from_value::<Mapping>(other) {
                    Ok(m) => m,
                    Err(source) => {
                        return Err(ParseError::MalformedYaml {
                            file: file.to_path_buf(),
                            source,
                        });
                    }
                }
            }
        };

        let mut fm = Frontmatter::default();
        for (k, v) in map {
            let key = match k.as_str() {
                Some(s) => s.to_string(),
                // Non-string keys (`2026:`, `true:`, `3.14:`) are unusual but
                // valid YAML; per SPEC § "Unknown fields pass through" they must
                // not be corrupted on re-emit. Stringify them through the YAML
                // scalar emitter — `2026`, `true`, `3.14` — NOT the Rust `Debug`
                // formatter (which produced `Number(2026)`, `Bool(true)`, …), so
                // the key text survives. `extra` is `String`-keyed, so on the
                // write side the key re-emits as a quoted-string key carrying that
                // text (e.g. `'2026':`) — the type narrows from number to string,
                // but the data is no longer destroyed and ordinary string keys are
                // wholly unaffected.
                None => yaml_scalar_key(&k),
            };
            match key.as_str() {
                // Coerce scalar values rather than `v.as_str()` (which is None
                // for Number/Bool/Null). A bare scalar that YAML reads as a
                // non-string — `summary: 2026`, `id: 100`, `status: 0` — would
                // otherwise be set to None AND dropped (it is a matched arm, so
                // the raw value never reaches `extra`), and `to_yaml` then omits
                // the None field, so `dbmd format` (read_file -> write_file)
                // silently deletes the line from disk. `scalar_string` mirrors
                // the coercion `validate`/`store` already apply to these fields,
                // so a numeric/bool-looking scalar is preserved as its string
                // form and round-trips instead of being destroyed.
                //
                // A sequence/mapping value on a universal key (`status: [a, b]`,
                // a nested-mapping `summary:`) is NOT a valid scalar; rather than
                // let the matched arm consume-and-drop it (silent data loss on
                // the next re-emit), `scalar_string` returns None and we fall
                // through to preserving the raw value in `extra` so `to_yaml`
                // re-emits it verbatim. The universal accessors stay None (the
                // value was never a valid scalar for that field), but the
                // operator's bytes are never destroyed.
                "type" => match scalar_string(&v) {
                    Some(s) => fm.type_ = Some(s),
                    None => {
                        fm.extra.insert(key, v);
                    }
                },
                "meta-type" => match scalar_string(&v) {
                    Some(s) => fm.meta_type = Some(s),
                    None => {
                        fm.extra.insert(key, v);
                    }
                },
                "id" => match scalar_string(&v) {
                    Some(s) => fm.id = Some(s),
                    None => {
                        fm.extra.insert(key, v);
                    }
                },
                // Timestamps have no `extra` fallback: any error from
                // `parse_timestamp` aborts the whole parse via `?`.
                "created" => fm.created = parse_timestamp(&v, "created", file)?,
                "updated" => fm.updated = parse_timestamp(&v, "updated", file)?,
                "summary" => match scalar_string(&v) {
                    Some(s) => fm.summary = Some(s),
                    None => {
                        fm.extra.insert(key, v);
                    }
                },
                "status" => match scalar_string(&v) {
                    Some(s) => fm.status = Some(s),
                    None => {
                        fm.extra.insert(key, v);
                    }
                },
                "tags" => match parse_tags_preserving(&v) {
                    Ok(tags) => fm.tags = tags,
                    // A `tags` value with a non-scalar item (`tags: [[vip]]`,
                    // `tags: [a, [b]]`) is preserved verbatim in `extra` rather
                    // than silently filtered down / erased on re-emit. The typed
                    // `tags` vec stays empty (no valid scalar list was present),
                    // so `to_yaml` won't ALSO emit a `tags:` from the vec.
                    Err(raw) => {
                        fm.extra.insert(key, raw);
                    }
                },
                // Every key outside the universal contract passes through.
                _ => {
                    fm.extra.insert(key, v);
                }
            }
        }

        // Disambiguate the one YAML shape `serde_norway` cannot tell apart on its
        // own: an *inline scalar* wiki-link `field: [[x]]` and a *genuine 2D
        // array* `field:`\n`- - x` BOTH parse to the identical
        // `Seq[ Seq[String("x")] ]`. The parsed `Value` has lost which one the
        // source wrote, but the source text has not — so we resolve it here, the
        // only place the original spelling is still visible. For every `extra`
        // key the source wrote in the inline `[[…]]` form, store the canonical
        // quoted scalar `String("[[x]]")` instead of the ambiguous nested
        // sequence. `to_yaml`/`canonicalize_extra_value` then emit it inline and
        // round-trip it (SPEC § Linking, `company: [[…]]`), while a real nested
        // array — which never appears in inline-link source form — stays a
        // sequence and is preserved verbatim rather than silently retyped.
        for key in inline_scalar_link_keys(yaml) {
            if let Some(value) = fm.extra.get_mut(&key) {
                // The parsed value of an inline `key: [[x]]` is the one-element
                // outer `Seq[ Seq[String(x)] ]`; `unquoted_inline_link` reads the
                // inner `Seq[String(x)]`, so unwrap the lone outer item first.
                if let Value::Sequence(items) = value {
                    if items.len() == 1 {
                        if let Some(link) = unquoted_inline_link(&items[0]) {
                            *value = Value::String(wiki_link_literal(&link));
                        }
                    }
                }
            }
        }

        Ok(fm)
    }
573
574    /// Serialize the frontmatter back to a YAML block (no `---` fences) in
575    /// canonical key order. Round-trips [`extra`](Frontmatter::extra) verbatim.
576    pub fn to_yaml(&self) -> String {
577        // Build an order-preserving mapping in canonical key order:
578        //   type, meta-type, id, created, updated, summary  (universal head)
579        //   <type-specific extra, BTreeMap-sorted>
580        //   status, tags                          (universal tail)
581        // serde_norway::Mapping preserves insertion order, so one serialize call
582        // emits the block in exactly this order with correct YAML quoting.
583        let mut map = Mapping::new();
584
585        if let Some(t) = &self.type_ {
586            map.insert(Value::String("type".into()), Value::String(t.clone()));
587        }
588        if let Some(mt) = &self.meta_type {
589            map.insert(Value::String("meta-type".into()), Value::String(mt.clone()));
590        }
591        if let Some(id) = &self.id {
592            map.insert(Value::String("id".into()), Value::String(id.clone()));
593        }
594        if let Some(created) = &self.created {
595            map.insert(
596                Value::String("created".into()),
597                Value::String(created.to_rfc3339()),
598            );
599        }
600        if let Some(updated) = &self.updated {
601            map.insert(
602                Value::String("updated".into()),
603                Value::String(updated.to_rfc3339()),
604            );
605        }
606        if let Some(summary) = &self.summary {
607            map.insert(
608                Value::String("summary".into()),
609                Value::String(summary.clone()),
610            );
611        }
612
613        // Type-specific + custom fields, in BTreeMap (sorted) order. Each value
614        // is canonicalized so a wiki-link round-trips to the form the writer and
615        // `dbmd validate` agree on — critically, the SPEC-canonical *unquoted*
616        // scalar `field: [[x]]` (which YAML parses to a nested `Seq[Seq[String]]`)
617        // is re-emitted as a quoted scalar `'[[x]]'` instead of the bracket-less
618        // block sequence `- - x` that a verbatim re-emit would produce and that
619        // destroys the link. See [`canonicalize_extra_value`].
620        for (k, v) in &self.extra {
621            map.insert(Value::String(k.clone()), canonicalize_extra_value(v));
622        }
623
624        if let Some(status) = &self.status {
625            map.insert(
626                Value::String("status".into()),
627                Value::String(status.clone()),
628            );
629        }
630        if !self.tags.is_empty() {
631            map.insert(
632                Value::String("tags".into()),
633                Value::Sequence(self.tags.iter().cloned().map(Value::String).collect()),
634            );
635        }
636
637        if map.is_empty() {
638            return String::new();
639        }
640        serde_norway::to_string(&Value::Mapping(map)).unwrap_or_default()
641    }
642
643    /// True if the file is content (under `sources/` or `records/`)
644    /// and not an `index.md`. Used by validate to decide which files require a
645    /// `summary`. Meta files (`DB.md`, `index.md`, `log.md`) return false.
646    pub fn is_content_file(path: &Path) -> bool {
647        // index.md is a meta file at every level, never content.
648        if path.file_name().and_then(|n| n.to_str()) == Some("index.md") {
649            return false;
650        }
651        // Content iff some path component is one of the two layer dirs. This
652        // works for both store-relative (`sources/emails/x.md`) and absolute
653        // (`/home/db/sources/emails/x.md`) paths. DB.md / log.md sit at the
654        // root, under no layer, so they fall through to false.
655        path.components().any(|c| {
656            c.as_os_str()
657                .to_str()
658                .is_some_and(|s| LAYER_DIRS.contains(&s))
659        })
660    }
661
662    /// Resolve the file's effective `id`: the explicit `id` field if present,
663    /// otherwise derived from the store-relative path (filename without `.md`).
664    pub fn effective_id(&self, store_relative_path: &Path) -> String {
665        if let Some(id) = &self.id {
666            if !id.is_empty() {
667                return id.clone();
668            }
669        }
670        // Derived id = filename without the `.md` extension.
671        store_relative_path
672            .file_stem()
673            .and_then(|s| s.to_str())
674            .unwrap_or_default()
675            .to_string()
676    }
677
678    /// The effective `meta-type` for a record: the declared value, or `fact`
679    /// when absent. Records only — sources carry no meta-type; callers apply
680    /// this only to record-layer files.
681    pub fn effective_meta_type(&self) -> &str {
682        self.meta_type.as_deref().unwrap_or("fact")
683    }
684
685    /// Read a single frontmatter key as a raw YAML [`Value`], looking in the
686    /// typed fields first and then [`extra`](Frontmatter::extra).
687    pub fn get(&self, key: &str) -> Option<Value> {
688        match key {
689            "type" => self.type_.clone().map(Value::String),
690            "meta-type" => self.meta_type.clone().map(Value::String),
691            "id" => self.id.clone().map(Value::String),
692            "created" => self.created.map(|d| Value::String(d.to_rfc3339())),
693            "updated" => self.updated.map(|d| Value::String(d.to_rfc3339())),
694            "summary" => self.summary.clone().map(Value::String),
695            "status" => self.status.clone().map(Value::String),
696            "tags" => {
697                if self.tags.is_empty() {
698                    None
699                } else {
700                    Some(Value::Sequence(
701                        self.tags.iter().cloned().map(Value::String).collect(),
702                    ))
703                }
704            }
705            _ => self.extra.get(key).cloned(),
706        }
707    }
708
709    /// Set a single frontmatter key from a string value, routing universal-
710    /// contract keys to their typed fields and everything else to
711    /// [`extra`](Frontmatter::extra). Used by `dbmd fm set`.
712    pub fn set(&mut self, key: &str, value: &str) -> Result<(), ParseError> {
713        match key {
714            "type" => self.type_ = Some(value.to_string()),
715            "meta-type" => self.meta_type = Some(value.to_string()),
716            "id" => self.id = Some(value.to_string()),
717            "created" => {
718                self.created = Some(parse_rfc3339(value, "created", Path::new("<fm set>"))?)
719            }
720            "updated" => {
721                self.updated = Some(parse_rfc3339(value, "updated", Path::new("<fm set>"))?)
722            }
723            "summary" => self.summary = Some(value.to_string()),
724            "status" => self.status = Some(value.to_string()),
725            "tags" => {
726                // Accept either a YAML flow list (`[a, b]`) or a single scalar
727                // tag. Anything that parses to a sequence becomes the tag list;
728                // otherwise the whole string is one tag.
729                self.tags = match serde_norway::from_str::<Value>(value) {
730                    Ok(Value::Sequence(seq)) => parse_tags(&Value::Sequence(seq)),
731                    _ => vec![value.to_string()],
732                };
733            }
734            _ => {
735                // A custom / type-specific field. The value is a scalar string by
736                // default, but the spec's list-valued link fields (e.g.
737                // `meeting.attendees`, SPEC § Linking) must serialize as a YAML
738                // block sequence of quoted wiki-links — never the flow-form string
739                // `"[[[a]], [[b]]]"`, which `dbmd validate` rejects as
740                // `WIKI_LINK_FLOW_FORM_LIST`. When the value parses as a YAML
741                // sequence whose every item is a clean single wiki-link, store the
742                // canonical sequence so `to_yaml` emits block form. Everything else
743                // — plain text, and a single inline `[[x]]` (which YAML reads as a
744                // nested `Seq[Seq[String]]`, not a list of link strings) — stays a
745                // verbatim scalar string, preserving the prior behavior.
746                let stored = parse_link_list_value(value)
747                    .unwrap_or_else(|| Value::String(value.to_string()));
748                self.extra.insert(key.to_string(), stored);
749            }
750        }
751        Ok(())
752    }
753
754    /// Extract every frontmatter field whose value is a wiki-link (scalar
755    /// inline form or a block-sequence list), pairing each with its key. The
756    /// validate engine checks these against `(link)` schema annotations.
757    pub fn link_fields(&self) -> Vec<(String, WikiLink)> {
758        let mut out = Vec::new();
759        // `summary` may carry navigational wiki-links (spec encourages it).
760        if let Some(summary) = &self.summary {
761            for link in extract_wiki_links(summary, Path::new("")) {
762                out.push(("summary".to_string(), link));
763            }
764        }
765        // Every type-specific / custom field: a scalar wiki-link or a list of
766        // wiki-links, in either the quoted (`"[[x]]"`) or the canonical unquoted
767        // (`[[x]]`) form. See [`links_in_field_value`] for the YAML shapes.
768        for (key, value) in &self.extra {
769            for link in links_in_field_value(value) {
770                out.push((key.clone(), link));
771            }
772        }
773        out
774    }
775}
776
/// A wiki-link reference inside the store: `[[target]]` or `[[target|display]]`.
///
/// `target` is always recorded as written; [`is_full_path`](WikiLink::is_full_path)
/// flags whether it's a full store-relative path (the doctrine) versus a
/// short-form (a validation error).
///
/// Equality is derived field-by-field, so it includes
/// [`location`](WikiLink::location): the same target found at two different
/// places yields two unequal values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WikiLink {
    /// The link target as written, without the `[[ ]]` and without `|display`.
    pub target: String,
    /// The optional `|display` text override. `None` when the link has no `|`
    /// part at all.
    pub display: Option<String>,
    /// True when `target` is a full store-relative path (contains a `/` and
    /// resolves under a known layer); false for short-form targets like
    /// `sarah-chen` — which validate reports as `WIKI_LINK_SHORT_FORM`.
    pub is_full_path: bool,
    /// True when `target` carries a trailing `.md` extension — validate warns
    /// `WIKI_LINK_HAS_EXTENSION`; the canonical writers emit the bare form.
    pub has_md_extension: bool,
    /// Where the link appears: `(file, line, col)`, 1-based line and column.
    /// The line is counted within the text handed to the extractor.
    pub location: (PathBuf, u32, u32),
}
798
/// A standard markdown link `[text](url)` — an external reference, kept in a
/// stream separate from [`WikiLink`] so external targets are visible to the
/// toolkit without being conflated with in-store edges. Not graph-validated.
///
/// Produced by [`extract_markdown_links`]. Equality is derived field-by-field,
/// so it includes [`location`](MarkdownLink::location).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownLink {
    /// The link text inside `[ ]` (may be empty).
    pub text: String,
    /// The URL or path inside `( )`, stored as written (may be empty).
    pub url: String,
    /// Where the link appears: `(file, line, col)`, 1-based. The line is
    /// counted within the text handed to the extractor.
    pub location: (PathBuf, u32, u32),
}
811
/// A `##`/`###` section of a markdown body: the heading text plus the byte
/// slice of the body it spans (heading line through the line before the next
/// heading of equal-or-shallower depth).
///
/// The section text is held as an owned `String`, so a `Section` does not
/// borrow from the body it was cut from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
    /// The heading text (without the leading `#`s).
    pub heading: String,
    /// Heading depth (number of leading `#`s).
    pub level: u8,
    /// The 1-based line where the heading appears.
    pub line: u32,
    /// The section body, from the heading line to the next sibling-or-shallower
    /// heading (exclusive), copied out of the original body.
    pub body: String,
}
827
/// The parsed structured content of a store's `DB.md` config file.
///
/// All four `##` parts (`Agent instructions`, `Policies`, `Schemas`,
/// `Folders`) are optional in the source; absent parts fall back to spec
/// defaults. Produced by [`parse_db_md`]. The derived `Default` is the fully
/// empty config: no instructions, no frozen pages, no ignored types, no
/// schemas, no folder metadata.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Config {
    /// Body of the `## Agent instructions` section — free-form prose passed to
    /// the agent's system prompt.
    pub agent_instructions: Option<String>,
    /// `## Policies` → `### Frozen pages`: store-relative paths the toolkit
    /// refuses to write (`POLICY_FROZEN_PAGE`). Entries are kept as written;
    /// matching goes through [`Config::frozen_match`], which normalizes both
    /// sides and understands `*` / `**` globs.
    pub frozen_pages: Vec<PathBuf>,
    /// `## Policies` → `### Ignored types`: type names the curator never
    /// synthesizes (still readable as ambient context).
    pub ignored_types: Vec<String>,
    /// `## Schemas` → one entry per `### <type>` sub-section, keyed by the
    /// type name.
    pub schemas: BTreeMap<String, Schema>,
    /// `## Folders` → optional per-folder display + description, surfaced in the
    /// root + layer `index.md` rollups. Agent-authored; the tool never invents a
    /// folder's description (absent ⇒ the rollup shows counts only). Keyed by the
    /// store-relative, unix-slash folder path (e.g. `records/contacts`).
    pub folders: BTreeMap<String, FolderMeta>,
}
851
/// Agent-authored display + description for one type-folder, declared in
/// `DB.md ## Folders` and surfaced in the root/layer `index.md` rollups. Both
/// fields are optional: `display` overrides the rollup's derived folder name
/// (for casing the tool can't guess, e.g. acronyms like HubSpot); `description`
/// is the one-line "what's in here" the rollup shows. The tool only ever
/// *surfaces* these — it never composes a folder description from the folder's
/// contents (that would be the tool inventing the curator's judgment).
///
/// The derived `Default` has both fields `None`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FolderMeta {
    /// Display-name override (absent ⇒ derived from the folder basename).
    pub display: Option<String>,
    /// One-line folder description shown in the rollup (absent ⇒ counts only).
    pub description: Option<String>,
}
866
867impl Config {
868    /// The `### Frozen pages` entry that matches a store-relative `target`, if
869    /// any. The **single** frozen-page matcher every write surface must funnel
870    /// through so the policy is enforced identically on `write` / `fm set` /
871    /// `fm init` / `link` / `rename` / `format`.
872    ///
873    /// Comparison is normalized so a policy line and a write target match
874    /// regardless of incidental spelling differences:
875    /// - `/` path separators on every OS,
876    /// - a single leading `./` dropped,
877    /// - a trailing `.md` dropped on **both** sides — `parse_db_md` stores
878    ///   frozen entries verbatim, so an operator who writes the natural
879    ///   extensionless spelling (`records/decisions/q1`) must protect the file
880    ///   (`records/decisions/q1.md`) exactly as the `.md` spelling does.
881    ///
882    /// Returns the matched config entry verbatim (its original spelling) so the
883    /// caller can name it in the `POLICY_FROZEN_PAGE` refusal.
884    pub fn frozen_match(&self, target: &Path) -> Option<PathBuf> {
885        let want = normalize_frozen_path(target);
886        self.frozen_pages
887            .iter()
888            .find(|frozen| {
889                let pat = normalize_frozen_path(frozen);
890                // A literal entry matches by exact normalized equality; an entry
891                // carrying a `*`/`**` glob matches by segment-wise glob so a
892                // pattern like `records/decisions/*` actually protects the
893                // concrete files under it instead of silently failing open.
894                if pat.contains('*') {
895                    frozen_glob_matches(&pat, &want)
896                } else {
897                    pat == want
898                }
899            })
900            .cloned()
901    }
902
903    /// True if `target` (store-relative) is a frozen page. Convenience wrapper
904    /// over [`Config::frozen_match`] for callers that only need presence.
905    pub fn is_frozen(&self, target: &Path) -> bool {
906        self.frozen_match(target).is_some()
907    }
908}
909
/// Normalize a path for frozen-page comparison.
///
/// Both the policy entry and the write target pass through this before the
/// equality/glob test, so the match is insensitive to:
/// - the platform separator (segments are re-joined with `/`),
/// - a leading `/` or drive prefix, and any `.` component,
/// - `..` components, which are resolved lexically against the segments seen
///   so far (a `..` with nothing left to pop is ignored),
/// - a trailing `.md` on the whole path.
///
/// Dropping the leading `/` matters: otherwise an operator-written
/// `/records/decisions/q1.md` never equals the target's
/// `records/decisions/q1`, silently failing the freeze OPEN. Resolving `..`
/// closes the same kind of hole: discarding it would turn
/// `records/tmp/../decisions/q1.md` into `records/tmp/decisions/q1`, which
/// never matches a freeze on `records/decisions/q1` although both spellings
/// name the same file.
///
/// Segments that are not valid UTF-8 are skipped.
fn normalize_frozen_path(p: &Path) -> String {
    use std::path::Component;

    let mut segments: Vec<&str> = Vec::new();
    for component in p.components() {
        match component {
            Component::Normal(segment) => {
                if let Some(text) = segment.to_str() {
                    segments.push(text);
                }
            }
            // Lexical parent: step back one segment. At the top there is
            // nothing to pop, so a leading `..` is simply ignored.
            Component::ParentDir => {
                segments.pop();
            }
            // A root, a drive prefix and `.` carry no store-relative meaning.
            Component::RootDir | Component::Prefix(_) | Component::CurDir => {}
        }
    }

    let unix = segments.join("/");
    unix.strip_suffix(".md").unwrap_or(&unix).to_string()
}
935
936/// Match a normalized frozen-page glob `pat` against a normalized target `path`,
937/// segment by segment. `*` matches any run of characters *within a single path
938/// segment* (never crossing `/`); `**` as a whole segment matches zero or more
939/// whole segments. Both sides are already `normalize_frozen_path`-normalized, so
940/// this only deals with `/`-joined segment text. Keeps the substrate dependency-
941/// free (no glob crate) while making `records/decisions/*` actually freeze the
942/// files beneath it instead of failing open.
943fn frozen_glob_matches(pat: &str, path: &str) -> bool {
944    // Collapse runs of consecutive `**` segments into a single `**` before
945    // matching: `**/**` matches exactly the same set of paths as `**`, so the
946    // duplicates carry no semantics — they only multiply the number of
947    // (star-index, path-index) splits the matcher must consider. Dropping them
948    // up front is the first half of keeping the match polynomial (the
949    // two-pointer matcher below is the second); without it, a DB.md bullet like
950    // `**/**/…/zzz` against a deep non-matching target made the old recursive
951    // matcher explore exponentially many splits and hang the entire write path.
952    let pat_segs: Vec<&str> = collapse_double_stars(pat.split('/'));
953    let path_segs: Vec<&str> = path.split('/').collect();
954    glob_segments(&pat_segs, &path_segs)
955}
956
/// Squeeze each run of consecutive `**` segments down to a single `**`.
///
/// Every `**` means "zero or more whole segments", so a run of them matches
/// exactly what one does; removing the repeats never changes which paths
/// match. It only removes redundant pattern positions that would otherwise
/// feed catastrophic backtracking. All other segments — including repeated
/// non-`**` ones — are kept in order.
fn collapse_double_stars<'a>(segs: impl Iterator<Item = &'a str>) -> Vec<&'a str> {
    let mut kept: Vec<&'a str> = segs.collect();
    // `dedup_by` drops the later of two neighbours when the predicate holds;
    // the predicate only fires when both neighbours are `**`.
    kept.dedup_by(|later, earlier| *later == "**" && *earlier == "**");
    kept
}
972
973/// Segment matcher for [`frozen_glob_matches`]. `**` consumes any number of path
974/// segments; every other pattern segment must match exactly one path segment
975/// (with `*` wildcards inside it).
976///
977/// Implemented as the classic linear wildcard match: a single forward scan with a
978/// remembered "last `**`" backtrack point, never the two-way recursion the old
979/// version used. The old `glob_segments(rest, path)` OR `glob_segments(pat,
980/// &path[1..])` recursion had no memoization, so N consecutive `**` against a
981/// deep target that ultimately fails to match explored an exponential number of
982/// (star-index, path-index) splits — one DB.md frozen-page bullet could hang the
983/// store's whole write path. This greedy scan with backtrack is O(pat × path) in
984/// the worst case while matching exactly the same set of paths.
985fn glob_segments(pat: &[&str], path: &[&str]) -> bool {
986    let mut pi = 0usize; // cursor into pattern segments
987    let mut si = 0usize; // cursor into path segments
988                         // Backtrack point: where in the pattern the last `**` sat, and the path
989                         // position to resume from if a later literal mismatch forces the `**` to
990                         // swallow one more segment. `None` until we have seen a `**`.
991    let mut star_pi: Option<usize> = None;
992    let mut star_si = 0usize;
993
994    while si < path.len() {
995        if pi < pat.len() && pat[pi] == "**" {
996            // Record this `**` as the resume point and tentatively let it match
997            // zero segments (advance past it). If a later segment fails, we come
998            // back here and let the `**` swallow one more path segment.
999            star_pi = Some(pi);
1000            star_si = si;
1001            pi += 1;
1002        } else if pi < pat.len() && glob_segment_text(pat[pi], path[si]) {
1003            // Ordinary segment match: advance both cursors.
1004            pi += 1;
1005            si += 1;
1006        } else if let Some(sp) = star_pi {
1007            // Mismatch (or pattern exhausted) but an earlier `**` can absorb more:
1008            // resume just after that `**`, having it consume one extra segment.
1009            pi = sp + 1;
1010            star_si += 1;
1011            si = star_si;
1012        } else {
1013            // Mismatch with no `**` to fall back on.
1014            return false;
1015        }
1016    }
1017
1018    // Path consumed; any trailing pattern must be all `**` (each matching zero
1019    // segments) for a full match.
1020    while pi < pat.len() && pat[pi] == "**" {
1021        pi += 1;
1022    }
1023    pi == pat.len()
1024}
1025
/// Match one glob segment against one path segment. `*` stands for any run of
/// characters (possibly empty) within the segment; every other character is
/// literal. A pattern without `*` must equal the segment exactly.
fn glob_segment_text(pat: &str, seg: &str) -> bool {
    // No wildcard at all: plain equality.
    let Some((head, tail)) = pat.split_once('*') else {
        return pat == seg;
    };

    // The text before the first `*` is anchored at the start of the segment.
    let Some(mut rest) = seg.strip_prefix(head) else {
        return false;
    };

    // Split what follows the first `*` into the interior literals and the
    // final literal (the text after the last `*`, possibly empty).
    let (interior, last) = match tail.rsplit_once('*') {
        Some((middle, end)) => (middle, end),
        None => ("", tail),
    };

    // Interior literals must occur in order; each is taken at its leftmost
    // position in what is left, and matching continues after it.
    for literal in interior.split('*').filter(|piece| !piece.is_empty()) {
        match rest.find(literal) {
            Some(at) => rest = &rest[at + literal.len()..],
            None => return false,
        }
    }

    // The final literal is anchored at the end, within the unconsumed text so
    // it cannot overlap an earlier literal. An empty one always matches.
    rest.ends_with(last)
}
1059
/// A user-declared type schema parsed from a `DB.md` `### <type>` sub-section.
/// The store's `## Schemas` is the **only** source of schema enforcement — the
/// toolkit ships no built-in or implicit per-type schema (see SPEC § Schemas).
///
/// The derived `Default` is the empty schema: no fields, no uniqueness
/// constraints, no summary template, no shard directive.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Schema {
    /// One [`FieldSpec`] per bulleted field line, in source order.
    pub fields: Vec<FieldSpec>,
    /// `- unique: <field>[, <field> …]` directives — each inner vec is one
    /// uniqueness constraint over the listed field(s) (compound when >1). Two
    /// records of this type whose listed values collide warn as
    /// `DUP_UNIQUE_KEY`.
    pub unique_keys: Vec<Vec<String>>,
    /// `- summary_template: <template>` directive — the `{field}` interpolation
    /// pattern `dbmd fm init` / `dbmd write` use to compose a default `summary`
    /// for this type. `None` falls back to the body's first paragraph.
    pub summary_template: Option<String>,
    /// `- shard: by-date | flat` directive — whether records of this type are
    /// date-sharded on disk (`records/<type>/<YYYY>/<MM>/…`) or kept flat.
    /// `None` = no directive declared, so the store's built-in default for the
    /// type applies ([`crate::store::Store::type_shards`]); `Some(true)` forces
    /// date-sharding (e.g. a custom event type the toolkit has no built-in for);
    /// `Some(false)` forces flat. This is the v0.2 generic-model way to declare
    /// sharding — the toolkit ships no implicit per-type behavior beyond the
    /// example-type defaults.
    pub shard: Option<bool>,
}
1086
/// One field declaration inside a [`Schema`]: `- <name> (<modifiers>)`.
///
/// Modifiers are comma-separated inside the parens; this captures the
/// recognized ones as typed fields and stashes anything unrecognized in
/// [`unknown_modifiers`](FieldSpec::unknown_modifiers) (surfaced as `Info`).
///
/// The derived `Default` is an unnamed, optional field with no modifiers.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct FieldSpec {
    /// The field name.
    pub name: String,
    /// `required` modifier present.
    pub required: bool,
    /// The shape modifier (`string`/`int`/`bool`/`date`/`email`/`currency`/
    /// `url`), if any. See [`Shape`].
    pub shape: Option<Shape>,
    /// `link to <prefix>/` — the store-relative prefix a wiki-link target must
    /// start with. The trailing slash is required in the source syntax.
    pub link_prefix: Option<PathBuf>,
    /// `default <value>` — the value written when the field is absent.
    pub default: Option<Value>,
    /// `enum: <v1>, <v2>, ...` — the allowed values (must be the last modifier
    /// on the line because of its own commas).
    pub enum_values: Option<Vec<String>>,
    /// Any modifiers not in the recognized vocabulary, preserved verbatim;
    /// validate surfaces these as `Info`, never errors.
    pub unknown_modifiers: Vec<String>,
}
1113
/// A recognized shape modifier for a schema field. Validate enforces the
/// corresponding value shape (`SCHEMA_SHAPE_MISMATCH` on violation).
///
/// A plain fieldless enum (`Copy`), carried in [`FieldSpec::shape`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Shape {
    /// Any scalar string.
    String,
    /// Integer.
    Int,
    /// Boolean.
    Bool,
    /// RFC3339 / ISO-8601 date.
    Date,
    /// `<local>@<domain>` email address.
    Email,
    /// A currency amount.
    Currency,
    /// A URL.
    Url,
}
1133
/// The result of splitting a raw file into its frontmatter block and body.
///
/// `body` is the verbatim remainder after the closing `---` fence — the writer
/// preserves it byte-for-byte so operator edits are never reflowed.
///
/// Produced by [`split_frontmatter`]; both parts are owned copies of the
/// input text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFile {
    /// The raw frontmatter YAML (between the fences, exclusive of them). Line
    /// terminators of the YAML lines are kept, so a non-empty block ends with
    /// the newline that precedes the closing fence.
    pub frontmatter_yaml: String,
    /// The verbatim body (everything after the closing `---` line, including
    /// that line's terminator being excluded). Empty when nothing follows the
    /// closing fence.
    pub body: String,
}
1145
1146/// Split a file's full text into its frontmatter block and body. The
1147/// frontmatter block must be the very first thing in the file, delimited by
1148/// `---` on its own line at start and end. Returns
1149/// [`ParseError::MissingFrontmatter`] if absent.
1150pub fn split_frontmatter(text: &str, file: &Path) -> Result<ParsedFile, ParseError> {
1151    // Tolerate a single leading UTF-8 BOM (U+FEFF) before the opening fence,
1152    // matching `store::frontmatter_block` and `index::extract_frontmatter_block`
1153    // which already strip it. Without this, a BOM-prefixed file (common from
1154    // Windows / exported markdown dropped into `sources/`) gets walked and
1155    // indexed by `dbmd index` yet hard-fails every write/edit surface that
1156    // routes through `read_file` (`fm get/set`, `format`, `link`, `write`). The
1157    // BOM is dropped from the emitted body so the canonical writer never carries
1158    // it forward.
1159    let text = text.strip_prefix('\u{feff}').unwrap_or(text);
1160
1161    // The opening fence must be the very first line: `---`, no leading
1162    // whitespace, nothing before it. Trailing whitespace on the fence line is
1163    // tolerated via `trim_end()` (which strips spaces/tabs as well as CR/LF) so
1164    // this matches `index::extract_frontmatter_block` and
1165    // `validate::split_frontmatter`, both of which use `trim_end()`. Without this
1166    // agreement a fence written `--- ` (a single trailing space — invisible in an
1167    // editor, easily produced by hand edits or exporters) was indexed and
1168    // validated clean by those scanners yet hard-failed every write/edit surface
1169    // routed through `read_file` (`fm get/set`, `format`, `link`, `write`) — the
1170    // same cross-scanner drift class already fixed for the UTF-8 BOM above.
1171    let mut lines = text.split_inclusive('\n');
1172    let first = lines.next().unwrap_or("");
1173    if first.trim_end() != "---" {
1174        return Err(ParseError::MissingFrontmatter {
1175            file: file.to_path_buf(),
1176        });
1177    }
1178
1179    // Scan for the closing fence line. Track byte offsets so we can slice the
1180    // YAML (between fences, exclusive) and the body (verbatim, after the
1181    // closing fence's line terminator).
1182    let opening_len = first.len();
1183    let mut offset = opening_len;
1184    for line in lines {
1185        if line.trim_end() == "---" {
1186            let yaml = &text[opening_len..offset];
1187            let body_start = offset + line.len();
1188            let body = &text[body_start..];
1189            return Ok(ParsedFile {
1190                frontmatter_yaml: yaml.to_string(),
1191                body: body.to_string(),
1192            });
1193        }
1194        offset += line.len();
1195    }
1196
1197    // Opening fence present but no closing fence: malformed frontmatter block.
1198    Err(ParseError::MissingFrontmatter {
1199        file: file.to_path_buf(),
1200    })
1201}
1202
1203/// Read a file from disk and parse it into typed [`Frontmatter`] plus the
1204/// verbatim body string.
1205pub fn read_file(path: &Path) -> Result<(Frontmatter, String), ParseError> {
1206    let text = std::fs::read_to_string(path)?;
1207    let parsed = split_frontmatter(&text, path)?;
1208    let fm = Frontmatter::parse(&parsed.frontmatter_yaml, path)?;
1209    Ok((fm, parsed.body))
1210}
1211
1212/// Atomically write a markdown file from frontmatter + body: emit the
1213/// frontmatter in canonical key order, then the body verbatim, via a
1214/// temp-file-rename so a reader never sees a half-written file. Preserves the
1215/// operator-edited body exactly as given.
1216pub fn write_file(path: &Path, frontmatter: &Frontmatter, body: &str) -> Result<(), ParseError> {
1217    let contents = render_file(frontmatter, body);
1218
1219    // One durable, atomic write for all primary data (see `crate::fsx`):
1220    // temp-file + fsync + rename + parent-fsync. Content records are primary
1221    // data, so they get the durable path (unlike the rebuildable index).
1222    crate::fsx::write_atomic(path, contents.as_bytes())?;
1223    Ok(())
1224}
1225
1226/// Atomically create a markdown file from frontmatter + body, refusing with
1227/// [`std::io::ErrorKind::AlreadyExists`] if the destination already exists.
1228///
1229/// This is the create-new sibling of [`write_file`]: same canonical rendering
1230/// and durable temp-file path, but backed by [`crate::fsx::write_atomic_new`] so
1231/// two concurrent creators for the same path cannot both succeed.
1232pub fn write_file_new(
1233    path: &Path,
1234    frontmatter: &Frontmatter,
1235    body: &str,
1236) -> Result<(), ParseError> {
1237    let contents = render_file(frontmatter, body);
1238    crate::fsx::write_atomic_new(path, contents.as_bytes())?;
1239    Ok(())
1240}
1241
1242fn render_file(frontmatter: &Frontmatter, body: &str) -> String {
1243    let yaml = frontmatter.to_yaml();
1244    // `to_yaml` already terminates each block with a newline. Compose the file
1245    // as: opening fence, frontmatter YAML, closing fence, then body verbatim.
1246    let mut contents = String::with_capacity(yaml.len() + body.len() + 8);
1247    contents.push_str("---\n");
1248    contents.push_str(&yaml);
1249    contents.push_str("---\n");
1250    contents.push_str(body);
1251    contents
1252}
1253
1254/// Extract every wiki-link from a body (and inline frontmatter), returning the
1255/// structured [`WikiLink`] stream with short-form / `.md`-extension flags and
1256/// `(file, line, col)` locations set.
1257pub fn extract_wiki_links(body: &str, file: &Path) -> Vec<WikiLink> {
1258    static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
1259    let re = RE.get_or_init(|| {
1260        // [[target]] or [[target|display]]; target/display exclude brackets and
1261        // (for target) the `|` separator so nested forms don't over-match.
1262        regex::Regex::new(r"\[\[([^\[\]|]+?)(?:\|([^\[\]]*))?\]\]").expect("valid wiki-link regex")
1263    });
1264
1265    let mut out = Vec::new();
1266    for (line_idx, line) in body.lines().enumerate() {
1267        // Running (byte, char) cursor: derive each match's column in ONE linear
1268        // pass over the line instead of recomputing it from the line start per
1269        // match. `captures_iter` yields non-overlapping matches in increasing
1270        // byte order, so advancing the char count by the gap since the previous
1271        // match keeps the whole line O(line_len) rather than O(matches × len).
1272        let mut cursor = ColCursor::new();
1273        for caps in re.captures_iter(line) {
1274            let whole = caps.get(0).expect("group 0 always present");
1275            let col = cursor.column_at(line, whole.start());
1276            let target = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
1277            let display = caps.get(2).map(|m| m.as_str().to_string());
1278            out.push(WikiLink {
1279                is_full_path: target_is_full_path(&target),
1280                has_md_extension: target_has_md_extension(&target),
1281                target,
1282                display,
1283                location: (file.to_path_buf(), (line_idx as u32) + 1, col),
1284            });
1285        }
1286    }
1287    out
1288}
1289
1290/// Extract every standard markdown link `[text](url)` from a body into a
1291/// separate stream, kept distinct from wiki-links.
1292pub fn extract_markdown_links(body: &str, file: &Path) -> Vec<MarkdownLink> {
1293    static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
1294    let re = RE.get_or_init(|| {
1295        // [text](url). `text` excludes brackets so a wiki-link `[[x]]` (which
1296        // has `]]`, not `](`) never matches; `url` excludes `)` and whitespace.
1297        regex::Regex::new(r"\[([^\[\]]*)\]\(([^)\s]*)\)").expect("valid markdown-link regex")
1298    });
1299
1300    let mut out = Vec::new();
1301    for (line_idx, line) in body.lines().enumerate() {
1302        // One linear column cursor per line (see `extract_wiki_links`): avoids the
1303        // O(matches × line_len) recompute on a link-dense line.
1304        let mut cursor = ColCursor::new();
1305        for caps in re.captures_iter(line) {
1306            let whole = caps.get(0).expect("group 0 always present");
1307            let col = cursor.column_at(line, whole.start());
1308            out.push(MarkdownLink {
1309                text: caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string(),
1310                url: caps.get(2).map(|m| m.as_str()).unwrap_or("").to_string(),
1311                location: (file.to_path_buf(), (line_idx as u32) + 1, col),
1312            });
1313        }
1314    }
1315    out
1316}
1317
1318/// Detect the frontmatter wiki-link-list mis-encoding: a wiki-link *list*
1319/// written so YAML parses it as nested sequences instead of a clean list of
1320/// strings. Returns the offending keys so validate can emit
1321/// `WIKI_LINK_FLOW_FORM_LIST`.
1322///
1323/// The subtlety is that `[[x]]` is YAML for "a list containing `[x]`", so the
1324/// shapes nest:
1325///
1326/// - **Scalar inline** `company: [[records/x]]` → `Seq[ Seq[String] ]`
1327///   (double-nested). This is the spec's scalar wiki-link form — NOT flagged.
1328/// - **Flow list** `attendees: [[[a]], [[b]]]` → `Seq[ Seq[Seq[String]], … ]`
1329///   (triple-nested). The list mis-encoding — flagged.
1330/// - **Unquoted block list** (`- [[a]]` per line) → also triple-nested, so it
1331///   is flagged too; the canonical list form must quote each item
1332///   (`- "[[a]]"`), which parses to a clean `Seq[String, …]` and is NOT flagged.
1333///
1334/// So the discriminator is nesting depth: a *list* mis-encoding has at least one
1335/// item that is itself a sequence-of-sequences, whereas a scalar inline link's
1336/// single item is a sequence-of-scalars.
1337pub fn detect_flow_form_link_lists(frontmatter_yaml: &str) -> Vec<String> {
1338    let value: Value = match serde_norway::from_str(frontmatter_yaml) {
1339        Ok(v) => v,
1340        // Malformed YAML is FM_MALFORMED_YAML's job, not ours; report nothing.
1341        Err(_) => return Vec::new(),
1342    };
1343    let Value::Mapping(map) = value else {
1344        return Vec::new();
1345    };
1346
1347    let mut out = Vec::new();
1348    for (k, v) in &map {
1349        if let Value::Sequence(items) = v {
1350            // Triple-nesting: some outer item is a sequence that itself holds a
1351            // sequence. Scalar inline `[[x]]` is only double-nested, so it
1352            // never matches.
1353            let is_link_list = items.iter().any(|item| match item {
1354                Value::Sequence(inner) => inner.iter().any(|x| matches!(x, Value::Sequence(_))),
1355                _ => false,
1356            });
1357            if is_link_list {
1358                if let Some(key) = k.as_str() {
1359                    out.push(key.to_string());
1360                }
1361            }
1362        }
1363    }
1364    out
1365}
1366
1367/// Extract the `##`/`###` sections of a markdown body into a flat list with
1368/// body slices.
1369pub fn extract_sections(body: &str) -> Vec<Section> {
1370    // Keep each line's start so we can slice the body verbatim (exact newlines).
1371    let lines: Vec<&str> = body.split_inclusive('\n').collect();
1372
1373    // First pass: classify heading levels (0 = not a heading), honoring fenced
1374    // code blocks so a `## x` inside a ``` fence is not treated as a heading.
1375    let mut levels: Vec<u8> = Vec::with_capacity(lines.len());
1376    let mut fence: Option<(u8, usize)> = None;
1377    for line in &lines {
1378        let content = line.trim_end_matches(['\n', '\r']);
1379        if let Some(f) = fence {
1380            if is_closing_fence(content, f) {
1381                fence = None;
1382            }
1383            levels.push(0);
1384            continue;
1385        }
1386        if let Some(opened) = opening_fence(content) {
1387            fence = Some(opened);
1388            levels.push(0);
1389            continue;
1390        }
1391        levels.push(heading_level(content));
1392    }
1393
1394    // Second pass: emit `##`+ headings; each section body runs from its heading
1395    // line to the next heading at an equal-or-shallower level (exclusive).
1396    let mut sections = Vec::new();
1397    for (i, &lvl) in levels.iter().enumerate() {
1398        if lvl < 2 {
1399            continue;
1400        }
1401        let heading_line = lines[i].trim_end_matches(['\n', '\r']);
1402        let heading = heading_text(heading_line, lvl);
1403
1404        let mut end = lines.len();
1405        for (j, &other) in levels.iter().enumerate().skip(i + 1) {
1406            if other != 0 && other <= lvl {
1407                end = j;
1408                break;
1409            }
1410        }
1411
1412        sections.push(Section {
1413            heading,
1414            level: lvl,
1415            line: (i + 1) as u32,
1416            body: lines[i..end].concat(),
1417        });
1418    }
1419    sections
1420}
1421
1422/// Extract the `##`/`###` sections of a **whole file** (frontmatter + body),
1423/// returning each [`Section`] with `line` numbered against the *source file*,
1424/// not the body.
1425///
1426/// [`extract_sections`] numbers headings 1-based within the body it is handed —
1427/// the right frame for callers that already track the frontmatter offset
1428/// (`validate` adds `fm_end_line`). But the single-file views (`dbmd sections`,
1429/// `dbmd outline`) present `Section::line` as a source line an agent can jump to;
1430/// because every db.md file opens with a frontmatter block, the body-relative
1431/// number is off by the block's length (`opening fence + frontmatter lines +
1432/// closing fence`) for every file. This helper does the offset once, in the
1433/// parser, so those surfaces report true file lines. A file with no leading
1434/// frontmatter block is treated as all-body (offset 0), so the function never
1435/// fails just because a file lacks frontmatter.
1436pub fn extract_sections_in_file(text: &str) -> Vec<Section> {
1437    // Tolerate a leading BOM the same way `split_frontmatter` does, so the line
1438    // count and the body slice agree with the read path.
1439    let text = text.strip_prefix('\u{feff}').unwrap_or(text);
1440
1441    // Find the body and how many source lines precede it. The body begins right
1442    // after the closing fence; the number of lines consumed by the frontmatter
1443    // block (both fences + the YAML between) is the offset to add to each
1444    // body-relative heading line.
1445    let (body, offset) = match split_frontmatter(text, Path::new("<sections>")) {
1446        Ok(parsed) => {
1447            // Lines before the body = total lines in `text` minus lines in body.
1448            let total_lines = count_lines(text);
1449            let body_lines = count_lines(&parsed.body);
1450            (parsed.body, total_lines.saturating_sub(body_lines))
1451        }
1452        // No frontmatter block: the whole text is body, no offset.
1453        Err(_) => (text.to_string(), 0),
1454    };
1455
1456    let mut sections = extract_sections(&body);
1457    for s in &mut sections {
1458        s.line += offset;
1459    }
1460    sections
1461}
1462
/// Number of lines `s` spans, for line-number offsetting.
///
/// Every `\n` terminates one line; text left over after the last `\n` counts as
/// one further, unterminated line. The empty string spans zero lines.
fn count_lines(s: &str) -> u32 {
    let terminated = s.bytes().filter(|&b| b == b'\n').count() as u32;
    // Non-empty text that does not end in `\n` has a dangling final line.
    let dangling = u32::from(!s.is_empty() && !s.ends_with('\n'));
    terminated + dangling
}
1477
1478/// Parse a store's `DB.md` file into a [`Config`]: the `## Agent instructions`
1479/// prose, `## Policies` (`### Frozen pages` + `### Ignored types`), and
1480/// `## Schemas` (`### <type>` field-bullet blocks). Unrecognized sections are
1481/// ignored; absent sections leave their [`Config`] fields at default.
1482pub fn parse_db_md(text: &str, file: &Path) -> Result<Config, ParseError> {
1483    // The structured sections live in the body (after frontmatter). DB.md must
1484    // still start with a valid `---` block (`type: db-md`); if it's missing we
1485    // surface MissingFrontmatter like any other file.
1486    let parsed = split_frontmatter(text, file)?;
1487    let _frontmatter = Frontmatter::parse(&parsed.frontmatter_yaml, file)?;
1488    let sections = extract_sections(&parsed.body);
1489
1490    let mut config = Config::default();
1491    // Track which H2 region each H3 belongs to as we walk the flat list.
1492    let mut current_h2: Option<String> = None;
1493
1494    for section in &sections {
1495        match section.level {
1496            2 => {
1497                let name = section.heading.trim().to_ascii_lowercase();
1498                current_h2 = Some(name.clone());
1499                if name == "agent instructions" {
1500                    let prose = section_prose(&section.body);
1501                    if !prose.is_empty() {
1502                        config.agent_instructions = Some(prose);
1503                    }
1504                } else if name == "folders" {
1505                    // `## Folders` carries its bullets directly under the H2 (no
1506                    // `### <type>` sub-sections), like `## Agent instructions`.
1507                    for b in bullet_lines(&section.body) {
1508                        if let Some((path, meta)) = parse_folder_bullet(&b) {
1509                            config.folders.insert(path, meta);
1510                        }
1511                    }
1512                }
1513            }
1514            3 => {
1515                let h2 = current_h2.as_deref().unwrap_or("");
1516                let h3 = section.heading.trim().to_ascii_lowercase();
1517                match (h2, h3.as_str()) {
1518                    ("policies", "frozen pages") => {
1519                        config.frozen_pages = bullet_lines(&section.body)
1520                            .into_iter()
1521                            .map(|b| PathBuf::from(extract_path_bullet(&b)))
1522                            .collect();
1523                    }
1524                    ("policies", "ignored types") => {
1525                        config.ignored_types = bullet_lines(&section.body)
1526                            .into_iter()
1527                            .flat_map(|b| extract_type_list_bullet(&b))
1528                            .collect();
1529                    }
1530                    ("schemas", _) => {
1531                        // The H3 heading text (as written) is the type name.
1532                        let type_name = section.heading.trim().to_string();
1533                        let mut schema = Schema::default();
1534                        for b in bullet_lines(&section.body) {
1535                            match parse_schema_bullet(&b) {
1536                                SchemaBullet::Field(f) => schema.fields.push(f),
1537                                SchemaBullet::Unique(k) if !k.is_empty() => {
1538                                    schema.unique_keys.push(k)
1539                                }
1540                                SchemaBullet::SummaryTemplate(t) if !t.is_empty() => {
1541                                    schema.summary_template = Some(t)
1542                                }
1543                                SchemaBullet::Shard(Some(b)) => schema.shard = Some(b),
1544                                // Empty `unique:`/`summary_template:`, or a `shard:`
1545                                // with an unrecognized value — ignored.
1546                                SchemaBullet::Unique(_)
1547                                | SchemaBullet::SummaryTemplate(_)
1548                                | SchemaBullet::Shard(None) => {}
1549                            }
1550                        }
1551                        config.schemas.insert(type_name, schema);
1552                    }
1553                    _ => {}
1554                }
1555            }
1556            _ => {}
1557        }
1558    }
1559
1560    Ok(config)
1561}
1562
/// One parsed bullet inside a `### <type>` schema block: either an ordinary
/// field or one of the reserved directives (`unique:` / `summary_template:` /
/// `shard:`). Because those three names are claimed by the directives, they
/// cannot be used as field names.
///
/// Produced by `parse_schema_bullet` and consumed by `parse_db_md`, which
/// ignores an empty `Unique` / `SummaryTemplate` and a `Shard(None)`.
#[derive(Debug)]
enum SchemaBullet {
    /// An ordinary `- <name> (<modifiers>)` field.
    Field(FieldSpec),
    /// `- unique: <field>[, <field> …]` — a (possibly compound) uniqueness key,
    /// one entry per comma-separated field name. May be empty when the bullet
    /// names no fields.
    Unique(Vec<String>),
    /// `- summary_template: <template>` — the default-`summary` pattern, with
    /// surrounding whitespace trimmed. May be empty.
    SummaryTemplate(String),
    /// `- shard: by-date | flat` — date-shard records of this type
    /// (`Some(true)`), or keep them flat (`Some(false)`). `None` = an
    /// unrecognized value, ignored like an unknown modifier.
    Shard(Option<bool>),
}
1579
1580/// Classify one `## Schemas` bullet as a directive or a field. The directive
1581/// forms are `- unique: a, b, …` and `- summary_template: …`; the keyword check
1582/// guards against false positives — a field like `- status (enum: a, b)` has a
1583/// `(` before any `:`, so its head isn't a bare reserved keyword and it parses
1584/// as a [`FieldSpec`].
1585fn parse_schema_bullet(bullet_line: &str) -> SchemaBullet {
1586    let line = bullet_line.trim();
1587    let line = line
1588        .strip_prefix("- ")
1589        .or_else(|| line.strip_prefix("* "))
1590        .or_else(|| line.strip_prefix("+ "))
1591        .or_else(|| line.strip_prefix('-'))
1592        .unwrap_or(line)
1593        .trim();
1594
1595    if let Some((head, rest)) = line.split_once(':') {
1596        match head.trim().to_ascii_lowercase().as_str() {
1597            "unique" => {
1598                let fields = rest
1599                    .split(',')
1600                    .map(|f| f.trim().to_string())
1601                    .filter(|f| !f.is_empty())
1602                    .collect();
1603                return SchemaBullet::Unique(fields);
1604            }
1605            "summary_template" => {
1606                return SchemaBullet::SummaryTemplate(rest.trim().to_string());
1607            }
1608            "shard" => {
1609                // `by-date` (synonyms: date/sharded/true) enables date-sharding;
1610                // `flat` (none/false) forces flat; anything else is ignored.
1611                let v = match rest.trim().to_ascii_lowercase().as_str() {
1612                    "by-date" | "date" | "sharded" | "true" => Some(true),
1613                    "flat" | "none" | "false" => Some(false),
1614                    _ => None,
1615                };
1616                return SchemaBullet::Shard(v);
1617            }
1618            _ => {}
1619        }
1620    }
1621
1622    SchemaBullet::Field(parse_field_spec(bullet_line))
1623}
1624
1625/// Parse one `## Folders` bullet — `- <path>[|<display>] — <description>` — into
1626/// the folder path (store-relative, unix-slash, no trailing slash) and its
1627/// [`FolderMeta`]. The optional `|<display>` overrides the rollup's derived
1628/// folder name (mirroring the wiki-link `|display` convention); the text after
1629/// the first em-dash (`—`), or ` - `, is the description. Backticks around the
1630/// path are tolerated (matching the `### Frozen pages` spelling). Returns `None`
1631/// for a bullet with no usable path.
1632fn parse_folder_bullet(bullet_line: &str) -> Option<(String, FolderMeta)> {
1633    let line = bullet_line.trim();
1634    let line = line
1635        .strip_prefix("- ")
1636        .or_else(|| line.strip_prefix("* "))
1637        .or_else(|| line.strip_prefix("+ "))
1638        .or_else(|| line.strip_prefix('-'))
1639        .unwrap_or(line)
1640        .trim();
1641
1642    // Split off the description at the first em-dash (preferred, matching the
1643    // rollup's own ` — ` separator) or a ` - ` fallback.
1644    let (pathspec, description) = match line.find('—') {
1645        Some(i) => (line[..i].trim(), Some(line[i + '—'.len_utf8()..].trim())),
1646        None => match line.find(" - ") {
1647            Some(i) => (line[..i].trim(), Some(line[i + 3..].trim())),
1648            None => (line, None),
1649        },
1650    };
1651
1652    // Optional `|display` override lives on the path side.
1653    let (path_raw, display) = match pathspec.split_once('|') {
1654        Some((p, d)) => (p.trim(), Some(d.trim())),
1655        None => (pathspec, None),
1656    };
1657
1658    // Normalize the path: drop surrounding backticks, a leading `./`, a trailing `/`.
1659    let path = path_raw.trim().trim_matches('`').trim();
1660    let path = path.strip_prefix("./").unwrap_or(path);
1661    let path = path.strip_suffix('/').unwrap_or(path).trim();
1662    if path.is_empty() {
1663        return None;
1664    }
1665
1666    let non_empty = |s: &str| {
1667        let t = s.trim();
1668        (!t.is_empty()).then(|| t.to_string())
1669    };
1670    Some((
1671        path.to_string(),
1672        FolderMeta {
1673            display: display.and_then(non_empty),
1674            description: description.and_then(non_empty),
1675        },
1676    ))
1677}
1678
1679/// Parse a single `## Schemas` field-bullet line — `- <name> (<modifiers>)` —
1680/// into a [`FieldSpec`], capturing recognized modifiers and stashing the rest
1681/// in [`FieldSpec::unknown_modifiers`].
1682pub fn parse_field_spec(bullet_line: &str) -> FieldSpec {
1683    // Strip the leading bullet marker (`- ` / `* ` / `+ `) and surrounding ws.
1684    let line = bullet_line.trim();
1685    let line = line
1686        .strip_prefix("- ")
1687        .or_else(|| line.strip_prefix("* "))
1688        .or_else(|| line.strip_prefix("+ "))
1689        .or_else(|| line.strip_prefix('-'))
1690        .unwrap_or(line)
1691        .trim();
1692
1693    // Split `<name> (<modifiers>)` — the canonical paren form — OR the natural
1694    // mis-spelling `<name>: <modifiers>` (colon instead of parens). The two
1695    // delimiters are interchangeable for the field head; whichever appears FIRST
1696    // wins, so a paren form whose modifiers contain a colon (`status (enum: a,
1697    // b)`) still parses by parens (the `(` precedes the `:`), while a bare
1698    // `title: string, required` parses by colon instead of being swallowed whole
1699    // into the field name with every modifier silently dropped.
1700    let paren = line.find('(');
1701    let colon = line.find(':');
1702    // Choose the head delimiter. The paren form wins when its `(` precedes any
1703    // `:` (so `status (enum: a, b)` parses by parens, the colon being inside the
1704    // modifiers); otherwise a `:` before the paren — or with no paren at all —
1705    // selects the colon form `<name>: <modifiers>`, the natural mis-spelling that
1706    // must NOT be swallowed whole into the field name with every modifier lost.
1707    let use_paren = matches!((paren, colon), (Some(p), c) if c.is_none_or(|c| p < c));
1708    let (name, modifiers) = if use_paren {
1709        let open = paren.expect("use_paren implies a paren");
1710        let name = line[..open].trim().to_string();
1711        let after = &line[open + 1..];
1712        let mods = match after.rfind(')') {
1713            Some(close) => &after[..close],
1714            None => after, // tolerate a missing close paren
1715        };
1716        (name, mods.trim())
1717    } else if let Some(c) = colon {
1718        // Colon form: everything after the first colon is the modifier list,
1719        // parsed identically to the parenthesized modifiers below.
1720        let name = line[..c].trim().to_string();
1721        (name, line[c + 1..].trim())
1722    } else {
1723        // Neither delimiter: a free-form optional field of any shape — name only.
1724        (line.to_string(), "")
1725    };
1726
1727    let mut spec = FieldSpec {
1728        name,
1729        ..FieldSpec::default()
1730    };
1731
1732    if modifiers.is_empty() {
1733        return spec;
1734    }
1735
1736    // Modifiers are comma-separated. `enum` and `default` are special: their own
1737    // values may contain commas, so each is a *greedy* clause that runs from its
1738    // keyword to the start of the next recognized greedy clause (or end of line).
1739    // This lets `default North America, EMEA fallback` keep its comma and lets a
1740    // `default …` written after an `enum …` still be recognized, instead of the
1741    // value being truncated at the first comma or absorbed into the enum list.
1742    let raw: Vec<&str> = modifiers.split(',').collect();
1743    let mut i = 0;
1744    while i < raw.len() {
1745        let token = raw[i].trim();
1746        if token.is_empty() {
1747            i += 1;
1748            continue;
1749        }
1750        let lower = token.to_ascii_lowercase();
1751
1752        if lower == "required" {
1753            spec.required = true;
1754            i += 1;
1755        } else if let Some(shape) = shape_from_str(&lower) {
1756            spec.shape = Some(shape);
1757            i += 1;
1758        } else if let Some(rest) = lower.strip_prefix("link to ") {
1759            // The trailing slash is required in the source; store the prefix
1760            // without it so `Path::starts_with` comparisons are clean.
1761            let prefix = token["link to ".len()..].trim().trim_end_matches('/');
1762            let _ = rest; // lowercase form only used for the keyword match
1763            spec.link_prefix = Some(PathBuf::from(prefix));
1764            i += 1;
1765        } else if token.len() >= "default ".len() && lower.starts_with("default ") {
1766            // Greedy `default <value>`: the value is this token (after the
1767            // keyword) plus every following comma-token up to the next greedy
1768            // clause, rejoined with the commas the split removed — so a comma
1769            // inside the default value is preserved. Original case is kept.
1770            let end = next_greedy_clause(&raw, i + 1);
1771            let mut value = token["default ".len()..].to_string();
1772            for tok in &raw[i + 1..end] {
1773                value.push(',');
1774                value.push_str(tok);
1775            }
1776            spec.default = Some(Value::String(value.trim().to_string()));
1777            i = end;
1778        } else if lower == "enum" || lower.starts_with("enum:") {
1779            // Greedy `enum` (bare `enum, a, b` or `enum: a, b`): the values run
1780            // from here to the next greedy clause (e.g. a trailing `default …`),
1781            // NOT unconditionally to end-of-line — so a `default` after `enum` is
1782            // parsed instead of swallowed as a bogus enum member.
1783            let end = next_greedy_clause(&raw, i + 1);
1784            // Rejoin this clause's tokens (trimmed so the `enum` head sits at the
1785            // start), drop the leading `enum`/`enum:` head, then re-split the
1786            // remainder into values.
1787            let joined = raw[i..end].join(",");
1788            let joined = joined.trim();
1789            let after_kw = match joined.find(':') {
1790                // `enum: a, b` — values follow the colon.
1791                Some(colon) => &joined[colon + 1..],
1792                // bare `enum, a, b` — values follow the keyword itself.
1793                None => joined.get("enum".len()..).unwrap_or(""),
1794            };
1795            let values: Vec<String> = after_kw
1796                .split(',')
1797                .map(|v| v.trim().to_string())
1798                .filter(|v| !v.is_empty())
1799                .collect();
1800            spec.enum_values = Some(values);
1801            i = end;
1802        } else {
1803            // Unrecognized modifier — captured verbatim, surfaced as Info.
1804            spec.unknown_modifiers.push(token.to_string());
1805            i += 1;
1806        }
1807    }
1808
1809    spec
1810}
1811
1812// ── Private helpers ─────────────────────────────────────────────────────────
1813
1814/// Parse a frontmatter timestamp value into a `DateTime<FixedOffset>`. A `null`
1815/// is treated as absent; anything else must be an RFC3339 string.
1816fn parse_timestamp(
1817    value: &Value,
1818    key: &str,
1819    file: &Path,
1820) -> Result<Option<DateTime<FixedOffset>>, ParseError> {
1821    match value {
1822        Value::Null => Ok(None),
1823        Value::String(s) => parse_rfc3339(s, key, file).map(Some),
1824        other => Err(ParseError::BadTimestamp {
1825            file: file.to_path_buf(),
1826            key: key.to_string(),
1827            value: format!("{other:?}"),
1828        }),
1829    }
1830}
1831
1832/// Parse an RFC3339 timestamp string, mapping failure to [`ParseError::BadTimestamp`].
1833fn parse_rfc3339(s: &str, key: &str, file: &Path) -> Result<DateTime<FixedOffset>, ParseError> {
1834    DateTime::parse_from_rfc3339(s.trim()).map_err(|_| ParseError::BadTimestamp {
1835        file: file.to_path_buf(),
1836        key: key.to_string(),
1837        value: s.to_string(),
1838    })
1839}
1840
1841/// Coerce a YAML scalar value to its string form for the universal-contract
1842/// fields (`type`/`id`/`summary`/`status`). Mirrors `validate::scalar_string`
1843/// and `store::yaml_scalar_string` so the four modules agree on one coercion
1844/// rule: a bare numeric/bool scalar (`id: 100`, `summary: 2026`, `status: 0`)
1845/// is preserved as its string form rather than being read as None and silently
1846/// dropped on the next `to_yaml` re-emit. Returns `None` only for genuinely
1847/// non-scalar values (sequences, mappings, null), which were never a valid
1848/// shape for these fields.
1849fn scalar_string(value: &Value) -> Option<String> {
1850    match value {
1851        Value::String(s) => Some(s.clone()),
1852        Value::Number(n) => Some(n.to_string()),
1853        Value::Bool(b) => Some(b.to_string()),
1854        _ => None,
1855    }
1856}
1857
1858/// Read a `tags` value into a flat `Vec<String>`. Accepts a sequence of scalars
1859/// (the canonical form) or a single scalar (coerced to a one-element list).
1860fn parse_tags(value: &Value) -> Vec<String> {
1861    match value {
1862        Value::Sequence(items) => items
1863            .iter()
1864            .filter_map(|v| match v {
1865                Value::String(s) => Some(s.clone()),
1866                Value::Number(n) => Some(n.to_string()),
1867                Value::Bool(b) => Some(b.to_string()),
1868                _ => None,
1869            })
1870            .collect(),
1871        Value::String(s) => vec![s.clone()],
1872        _ => Vec::new(),
1873    }
1874}
1875
1876/// Read a `tags` value into a flat `Vec<String>` **without losing data**: a
1877/// sequence of clean scalars (the canonical form) or a single scalar coerce to a
1878/// string list. Any other shape — a sequence with a non-scalar item
1879/// (`tags: [[vip]]` → `Seq[Seq[String]]`, `tags: [a, [b]]`), or a mapping — is
1880/// rejected as `Err(value.clone())` so the caller preserves the raw value in
1881/// `extra` rather than silently filtering items out / erasing the field on the
1882/// next re-emit. This is the `tags` analog of routing a non-scalar universal
1883/// value to pass-through instead of the destroy path.
1884fn parse_tags_preserving(value: &Value) -> Result<Vec<String>, Value> {
1885    match value {
1886        Value::Sequence(items) => {
1887            let mut out = Vec::with_capacity(items.len());
1888            for item in items {
1889                match item {
1890                    Value::String(s) => out.push(s.clone()),
1891                    Value::Number(n) => out.push(n.to_string()),
1892                    Value::Bool(b) => out.push(b.to_string()),
1893                    // A non-scalar item (nested sequence/mapping/null) means this
1894                    // is not a clean tag list; preserve the whole value verbatim.
1895                    _ => return Err(value.clone()),
1896                }
1897            }
1898            Ok(out)
1899        }
1900        Value::String(s) => Ok(vec![s.clone()]),
1901        Value::Number(n) => Ok(vec![n.to_string()]),
1902        Value::Bool(b) => Ok(vec![b.to_string()]),
1903        // A mapping / null `tags` value is not a list; preserve it verbatim.
1904        _ => Err(value.clone()),
1905    }
1906}
1907
1908/// Render a non-string YAML mapping key as the scalar text YAML would emit for
1909/// it (`2026`, `true`, `3.14`, …), so a numeric/bool/float frontmatter key
1910/// preserves its key *text* on round-trip instead of being rewritten to its Rust
1911/// `Debug` form (`Number(2026)`, `Bool(true)`, `'Null'`). The key re-emits as a
1912/// string-typed key carrying the original text (`'2026':`) — the type narrows to
1913/// string, but the operator's data is no longer corrupted, and ordinary string
1914/// keys are wholly unaffected. Falls back to `Debug` only for a key shape that
1915/// cannot be a scalar (a sequence/mapping key — not expressible in our
1916/// `String`-keyed `extra`), which never occurs in practice.
1917fn yaml_scalar_key(key: &Value) -> String {
1918    match key {
1919        Value::String(s) => s.clone(),
1920        Value::Number(n) => n.to_string(),
1921        Value::Bool(b) => b.to_string(),
1922        Value::Null => "null".to_string(),
1923        // Non-scalar key: not representable as a plain `extra` string key; keep
1924        // the defensive Debug form so nothing panics (unreachable in practice).
1925        other => format!("{other:?}"),
1926    }
1927}
1928
1929/// Parse a single `[[target|display]]` string into a [`WikiLink`] with no
1930/// location, or `None` if the string is not a bare wiki-link. Used for
1931/// frontmatter-valued links where there is no body position to report.
1932fn parse_wiki_link_str(s: &str) -> Option<WikiLink> {
1933    let s = s.trim();
1934    let inner = s.strip_prefix("[[")?.strip_suffix("]]")?;
1935    // Reject anything with further brackets (e.g. the nested flow-form item),
1936    // which is not a clean single wiki-link.
1937    if inner.contains('[') || inner.contains(']') {
1938        return None;
1939    }
1940    let (target, display) = match inner.split_once('|') {
1941        Some((t, d)) => (t.to_string(), Some(d.to_string())),
1942        None => (inner.to_string(), None),
1943    };
1944    Some(WikiLink {
1945        is_full_path: target_is_full_path(&target),
1946        has_md_extension: target_has_md_extension(&target),
1947        target,
1948        display,
1949        location: (PathBuf::new(), 0, 0),
1950    })
1951}
1952
1953/// Extract every wiki-link from a single frontmatter field value, accepting the
1954/// two canonical forms the spec defines (SPEC § Linking):
1955///
1956/// - a **scalar** wiki-link field, in either the quoted (`f: "[[x]]"`) or the
1957///   canonical unquoted inline (`f: [[x]]`) form, and
1958/// - a **list** field whose items are quoted wiki-link strings
1959///   (`- "[[x]]"`).
1960///
1961/// YAML eats the brackets of an unquoted `[[x]]`, leaving a flow-list-in-a-list,
1962/// so the parsed [`Value`] shapes are not what one would naively expect:
1963///
1964/// | source                         | parsed `Value`                     | here |
1965/// |--------------------------------|------------------------------------|------|
1966/// | `f: "[[x]]"`       (quoted)    | `String("[[x]]")`                  | link |
1967/// | `f: [[x]]`         (unquoted)  | `Seq[ Seq[String("x")] ]`          | link |
1968/// | `f:`\n`  - "[[x]]"`(quoted)    | `Seq[ String("[[x]]"), … ]`        | link |
1969/// | `f:`\n`  - [[x]]`  (unquoted)  | `Seq[ Seq[Seq[String("x")]], … ]`  | —    |
1970///
1971/// The last row — an *unquoted list* — parses identically to the flow-form list
1972/// `f: [[a], [b]]` and is a mis-encoding the canonical writer never emits;
1973/// `dbmd validate` reports it as `WIKI_LINK_FLOW_FORM_LIST` (see
1974/// [`detect_flow_form_link_lists`]). It is deliberately NOT surfaced here, so an
1975/// edge enumerator only ever sees the valid canonical forms.
1976///
1977/// The unquoted scalar (`Seq[Seq[String]]`, one element) is told apart from a
1978/// plain one-item flow list (`f: [x]` → `Seq[String]`, one fewer nesting level)
1979/// by [`unquoted_inline_link`] requiring its argument to be a `Sequence`.
1980fn links_in_field_value(value: &Value) -> Vec<WikiLink> {
1981    // Quoted scalar: `field: "[[x]]"`.
1982    if let Value::String(s) = value {
1983        return parse_wiki_link_str(s).into_iter().collect();
1984    }
1985    let Value::Sequence(items) = value else {
1986        return Vec::new();
1987    };
1988    // Unquoted scalar inline form `field: [[x]]` → `Seq[ Seq[String(x)] ]`.
1989    // (A quoted single-item list `["[[x]]"]` is `Seq[String]`, so its lone item
1990    // is a `String`, not a `Sequence`, and falls through to the list path below.)
1991    if items.len() == 1 {
1992        if let Some(link) = unquoted_inline_link(&items[0]) {
1993            return vec![link];
1994        }
1995    }
1996    // Otherwise a list of quoted wiki-link strings; non-string items (the
1997    // unquoted-list mis-encoding) are left for validate to flag.
1998    items
1999        .iter()
2000        .filter_map(|item| parse_wiki_link_str(item.as_str()?))
2001        .collect()
2002}
2003
2004/// Canonicalize one `extra` frontmatter value for emission by [`Frontmatter::to_yaml`].
2005///
2006/// The read path ([`Frontmatter::parse`]) stores every unknown key's raw parsed
2007/// [`Value`] verbatim, so a SPEC-canonical *unquoted* inline scalar wiki-link
2008/// (`company: [[records/companies/northstar]]`) lands in `extra` as the nested
2009/// shape YAML produces for it — `Seq[ Seq[String("records/companies/northstar")] ]`.
2010/// Re-emitting that verbatim yields the block sequence
2011///
2012/// ```text
2013/// company:
2014/// - - records/companies/northstar
2015/// ```
2016///
2017/// which has lost the `[[ ]]` brackets entirely: the link is destroyed, and every
2018/// reader (validate, graph, backlinks) stops seeing the edge. This normalizes such
2019/// a value back into the canonical emitted form before it is written:
2020///
2021/// - a **scalar** wiki-link (quoted `String("[[x]]")` or unquoted `Seq[Seq[String]]`,
2022///   one element) → a quoted scalar `Value::String("[[x]]")`, which serde_norway emits
2023///   inline as `'[[x]]'` — the form the finding confirms survives a round-trip and
2024///   that [`links_in_field_value`] reads back as the same scalar link;
2025/// - a **list** of wiki-links (in any spelling [`links_in_field_value`] accepts) →
2026///   a block `Value::Sequence` of quoted-link strings (`- "[[x]]"`), matching the
2027///   `set` write-in path and the canonical list form;
2028/// - everything else → returned verbatim (the common no-op for non-link values).
2029///
2030/// `|display` is preserved in both link branches. This is the single point that
2031/// keeps all three curator-loop writers (`format`, `fm set`, `link`) from
2032/// corrupting a pre-existing canonical link, since they all funnel through
2033/// `to_yaml`.
2034fn canonicalize_extra_value(value: &Value) -> Value {
2035    match value {
2036        // Scalar wiki-link, quoted form: `field: "[[x]]"` → `String("[[x]]")`.
2037        // Re-emit as a quoted scalar so it stays a string (never the brackets-as-
2038        // YAML nested sequence). Non-link strings are returned untouched.
2039        Value::String(s) => match parse_wiki_link_str(s) {
2040            Some(link) => Value::String(wiki_link_literal(&link)),
2041            None => value.clone(),
2042        },
2043        Value::Sequence(items) => {
2044            // NOTE: we deliberately do NOT collapse a one-element
2045            // `Seq[ Seq[String(x)] ]` to the scalar `String("[[x]]")` here. That
2046            // shape is ambiguous — `serde_norway` parses BOTH an inline scalar
2047            // wiki-link `field: [[x]]` AND a genuine 2D array `field:`\n`- - x`
2048            // to exactly that value, so collapsing it silently retyped a real
2049            // nested array (`matrix: [["cell"]]`) into the string `'[[cell]]'`
2050            // and the file stopped round-tripping. The two cases ARE
2051            // distinguishable, but only from the source text, so the genuine
2052            // inline-link case is resolved at parse time
2053            // ([`Frontmatter::parse`] → [`inline_scalar_link_keys`]), where it is
2054            // stored as a `String("[[x]]")` and handled by the arm above. By the
2055            // time a `Seq[Seq[String]]` reaches here it is a real nested array and
2056            // must pass through verbatim (SPEC § "Unknown fields pass through").
2057            // List of wiki-links: re-emit as a block sequence of quoted-link
2058            // strings, the canonical list form `to_yaml` renders block-style and
2059            // `links_in_field_value` accepts. Only canonicalize when *every* item
2060            // is a clean single wiki-link; a list with any non-link item is left
2061            // verbatim so unrelated sequences (and the unquoted-list mis-encoding
2062            // validate flags) are untouched.
2063            let mut links = Vec::with_capacity(items.len());
2064            for item in items {
2065                match link_from_flow_list_item(item) {
2066                    Some(link) => links.push(link),
2067                    None => return value.clone(),
2068                }
2069            }
2070            if links.is_empty() {
2071                return value.clone();
2072            }
2073            Value::Sequence(
2074                links
2075                    .iter()
2076                    .map(|l| Value::String(wiki_link_literal(l)))
2077                    .collect(),
2078            )
2079        }
2080        // Mappings, scalars other than strings, nulls: nothing to canonicalize.
2081        _ => value.clone(),
2082    }
2083}
2084
2085/// Render a [`WikiLink`] back to its `[[target]]` / `[[target|display]]` literal,
2086/// the inner form the canonical writer emits and `links_in_field_value` accepts.
2087fn wiki_link_literal(link: &WikiLink) -> String {
2088    match &link.display {
2089        Some(d) => format!("[[{}|{}]]", link.target, d),
2090        None => format!("[[{}]]", link.target),
2091    }
2092}
2093
2094/// Recognize the inner token of an unquoted scalar `[[x]]`: after YAML strips the
2095/// outer brackets, the inner `[x]` is a single-element sequence `Seq[String(x)]`.
2096/// Reconstructs `[[x]]` (preserving any `|display`) and parses it, or returns
2097/// `None` when `v` is not that shape. Requiring a `Sequence` here is what keeps a
2098/// plain one-item flow list (`field: [x]` → `Seq[String]`, not `Seq[Seq[String]]`)
2099/// from being mistaken for a wiki-link.
2100fn unquoted_inline_link(v: &Value) -> Option<WikiLink> {
2101    let Value::Sequence(items) = v else {
2102        return None;
2103    };
2104    if items.len() != 1 {
2105        return None;
2106    }
2107    let s = items[0].as_str()?;
2108    // A clean unquoted wiki-link has no further brackets inside it.
2109    if s.contains('[') || s.contains(']') {
2110        return None;
2111    }
2112    parse_wiki_link_str(&format!("[[{s}]]"))
2113}
2114
2115/// Scan raw frontmatter YAML for top-level keys whose value is written in the
2116/// **inline scalar wiki-link** form `key: [[target]]` (optionally
2117/// `[[target|display]]`).
2118///
2119/// This is the one disambiguation the parsed [`Value`] cannot supply on its own:
2120/// `serde_norway` parses BOTH
2121///
2122/// ```yaml
2123/// field: [[x]]
2124/// ```
2125///
2126/// and
2127///
2128/// ```yaml
2129/// field:
2130/// - - x
2131/// ```
2132///
2133/// to the identical `Seq[ Seq[String("x")] ]`. Only the source text says which one
2134/// the operator wrote. [`Frontmatter::parse`] calls this and rewrites the inline
2135/// cases to the canonical scalar `String("[[x]]")`, leaving every genuine nested
2136/// array a sequence (preserved verbatim per SPEC § "Unknown fields pass through").
2137///
2138/// Conservative by construction: a key is reported only when, on a single
2139/// top-level (zero-indent) line, the value after the first `:` is *exactly* one
2140/// `[[…]]` token (whitespace and an optional trailing `# comment` aside) with no
2141/// nested brackets inside. A quoted value (`field: "[[x]]"`), a flow list
2142/// (`field: [[a], [b]]`), a block sequence, or any indented/multi-token value is
2143/// left for the normal parse path. Duplicate keys (last-wins in YAML) are handled
2144/// by the caller looking up the final stored value.
2145fn inline_scalar_link_keys(yaml: &str) -> Vec<String> {
2146    let mut keys = Vec::new();
2147    for line in yaml.lines() {
2148        // Only top-level keys: an indented line is a nested mapping/sequence
2149        // entry, never a top-level `key: [[x]]` scalar.
2150        if line.starts_with(' ') || line.starts_with('\t') {
2151            continue;
2152        }
2153        let Some((raw_key, raw_val)) = line.split_once(':') else {
2154            continue;
2155        };
2156        let key = raw_key.trim();
2157        if key.is_empty() {
2158            continue;
2159        }
2160        // Drop a trailing `# comment` (YAML allows one after a plain scalar on the
2161        // same line). A `#` inside the bracketed link target is not a comment, but
2162        // such a target is rejected below anyway (it would not be a clean link).
2163        let val = match raw_val.split_once(" #") {
2164            Some((before, _)) => before.trim(),
2165            None => raw_val.trim(),
2166        };
2167        // The value must be exactly one bracket-delimited `[[…]]` token: starts
2168        // with `[[`, ends with `]]`, and the inner text carries no further
2169        // brackets (which would make it a flow list / nested collection, not a
2170        // single inline wiki-link).
2171        let Some(inner) = val.strip_prefix("[[").and_then(|s| s.strip_suffix("]]")) else {
2172            continue;
2173        };
2174        if inner.contains('[') || inner.contains(']') {
2175            continue;
2176        }
2177        // Confirm it is actually a parseable wiki-link, not e.g. an empty `[[]]`.
2178        if parse_wiki_link_str(val).is_some() {
2179            keys.push(key.to_string());
2180        }
2181    }
2182    keys
2183}
2184
2185/// Decide whether a `dbmd fm set` / `--fm` value string is a **list of
2186/// wiki-links** that should be stored as a YAML block sequence, returning the
2187/// canonical `Value::Sequence` of quoted-link strings when so.
2188///
2189/// The value path of every write surface stringifies its argument; without this
2190/// a required list-of-links field (`meeting.attendees`) was unwritable in valid
2191/// form — passing `[[[a]], [[b]]]` stored a single scalar string that mis-parses
2192/// and trips `WIKI_LINK_FLOW_FORM_LIST` / `WIKI_LINK_BROKEN`. This recognizes the
2193/// two list spellings an agent naturally types and normalizes both to the block
2194/// form the canonical writer emits and `dbmd validate` accepts:
2195///
2196/// - flow list of quoted links — `["[[a]]", "[[b]]"]`
2197/// - flow list of unquoted links — `[[[a]], [[b]]]` (YAML: `Seq[Seq[String], …]`)
2198///
2199/// Returns `None` (⇒ caller stores a verbatim scalar string) for everything that
2200/// is not unambiguously a list of clean wiki-links — plain text, a single inline
2201/// `[[x]]` (YAML reads it as a one-item `Seq[Seq[String]]`, kept scalar so it
2202/// renders inline), an empty list, or a list with any non-link item. A single
2203/// link must stay scalar; only genuine multi-item-or-explicit lists become
2204/// sequences, matching `links_in_field_value`'s acceptance rule so writer and
2205/// validator never disagree.
2206fn parse_link_list_value(value: &str) -> Option<Value> {
2207    let trimmed = value.trim();
2208    // Only a YAML *flow sequence* literal is a list candidate; anything not
2209    // wrapped in `[ … ]` is a scalar (a bare `[[x]]` is wrapped, and handled by
2210    // the single-inline-link guard below).
2211    if !(trimmed.starts_with('[') && trimmed.ends_with(']')) {
2212        return None;
2213    }
2214    let Ok(Value::Sequence(items)) = serde_norway::from_str::<Value>(trimmed) else {
2215        return None;
2216    };
2217    // A single inline `[[x]]` parses to `Seq[ Seq[String(x)] ]` (one item, itself
2218    // a sequence) — that is the unquoted *scalar* form, not a list. Keep it scalar
2219    // so it round-trips to the inline `field: [[x]]` rather than a one-item block
2220    // list. `links_in_field_value` reads it back as a scalar link either way.
2221    if items.len() == 1 && unquoted_inline_link(&items[0]).is_some() {
2222        return None;
2223    }
2224    // Every item must resolve to exactly one clean wiki-link, in any of the flow
2225    // spellings an agent types (see [`link_from_flow_list_item`]).
2226    let mut links = Vec::with_capacity(items.len());
2227    for item in &items {
2228        links.push(link_from_flow_list_item(item)?);
2229    }
2230    if links.is_empty() {
2231        return None;
2232    }
2233    // Normalize to a block sequence of quoted-link strings — the form `to_yaml`
2234    // renders block-style and `links_in_field_value` accepts. `|display` is
2235    // preserved.
2236    let normalized = links
2237        .iter()
2238        .map(|l| Value::String(wiki_link_literal(l)))
2239        .collect();
2240    Some(Value::Sequence(normalized))
2241}
2242
2243/// Recognize one clean wiki-link from a single **item** of a YAML flow sequence,
2244/// across the spellings an agent types for a list. After top-level flow parsing,
2245/// a list item arrives in one of:
2246///
2247/// - quoted — `"[[x]]"` ⇒ `String("[[x]]")`
2248/// - unquoted in a flow list — `[[x]]` inside `[…]` ⇒ `Seq[ Seq[String(x)] ]`
2249///   (one level deeper than a bare unquoted scalar, because the surrounding list
2250///   adds a wrapper); unwrap the single-element wrapper, then read the inline
2251///   `Seq[String(x)]` with [`unquoted_inline_link`].
2252///
2253/// Returns `None` for any item that is not exactly one clean wiki-link, so the
2254/// caller falls back to a scalar string and never fabricates a partial list.
2255fn link_from_flow_list_item(item: &Value) -> Option<WikiLink> {
2256    match item {
2257        Value::String(s) => parse_wiki_link_str(s),
2258        Value::Sequence(inner) => {
2259            // Unquoted list item `[[x]]` → `Seq[ Seq[String(x)] ]`: peel the lone
2260            // wrapper to expose the inline-link shape `Seq[String(x)]`.
2261            //
2262            // Only this triple-nested shape is a wiki-link. We deliberately do
2263            // NOT fall back to `unquoted_inline_link(item)` on the bare double
2264            // nesting `Seq[String(x)]` (a plain one-element string list `[x]`):
2265            // that fallback fabricated a wiki-link out of an ordinary nested
2266            // string list — `groups: [[alpha], [beta]]` (data `[["alpha"],
2267            // ["beta"]]`) was rewritten to `- '[[alpha]]'` / `- '[[beta]]'`,
2268            // silently changing the field's type and manufacturing short-form
2269            // links the tool then flags as `WIKI_LINK_SHORT_FORM`. An unknown
2270            // nested string list must pass through verbatim (SPEC § "Unknown
2271            // fields pass through").
2272            if inner.len() == 1 {
2273                if let Some(link) = unquoted_inline_link(&inner[0]) {
2274                    return Some(link);
2275                }
2276            }
2277            None
2278        }
2279        _ => None,
2280    }
2281}
2282
2283/// A target is a full store-relative path when its first path segment is one of
2284/// the three canonical layer dirs and at least one `/` separator follows. A
2285/// trailing `.md` does not affect this classification.
2286fn target_is_full_path(target: &str) -> bool {
2287    let target = target.trim();
2288    match target.split_once('/') {
2289        Some((head, _rest)) => LAYER_DIRS.contains(&head),
2290        None => false,
2291    }
2292}
2293
/// Whether the trimmed target ends in `.md`. Validate warns about such targets
/// with `WIKI_LINK_HAS_EXTENSION`.
fn target_has_md_extension(target: &str) -> bool {
    let trimmed = target.trim();
    trimmed.strip_suffix(".md").is_some()
}
2299
/// Forward-only cursor that converts successive byte offsets within one line
/// into 1-based character (Unicode scalar) columns.
///
/// It remembers the last queried byte offset and the number of characters
/// before it, so each query only counts the characters in the gap since the
/// previous one. For offsets that arrive in non-decreasing order (as regex
/// match starts do) the whole line is walked at most once, instead of
/// re-counting `line[..offset]` from the start for every match.
///
/// Offsets MUST be queried in non-decreasing order and must lie on UTF-8
/// character boundaries.
struct ColCursor {
    /// Byte offset of the most recent query.
    byte: usize,
    /// Number of characters that precede `byte`.
    chars: u32,
}

impl ColCursor {
    /// A cursor positioned at the start of a line.
    fn new() -> Self {
        Self { byte: 0, chars: 0 }
    }

    /// 1-based character column of `byte_offset` in `line`.
    ///
    /// `byte_offset` must be `>=` every offset queried before it; this is
    /// checked with a `debug_assert!`.
    fn column_at(&mut self, line: &str, byte_offset: usize) -> u32 {
        debug_assert!(byte_offset >= self.byte, "ColCursor queried out of order");
        // Count only the characters between the previous query and this one.
        let gap = &line[self.byte..byte_offset];
        let advanced = gap.chars().count() as u32;
        self.byte = byte_offset;
        self.chars += advanced;
        self.chars + 1
    }
}
2330
/// Position in `raw` of the first comma-token at or after `from` that *opens a
/// greedy modifier clause*, or `raw.len()` if there is none (including when
/// `from` is past the end).
///
/// A token opens such a clause when, trimmed and ASCII-lowercased, it is
/// exactly `enum`, starts with `enum:`, or starts with `default ` (with the
/// trailing space). Callers use the result to bound a greedy `default`/`enum`
/// value so it ends at the next such clause, rather than truncating at the
/// first comma or swallowing a following greedy clause whole.
fn next_greedy_clause(raw: &[&str], from: usize) -> usize {
    raw.iter()
        .skip(from)
        .position(|token| {
            let lowered = token.trim().to_ascii_lowercase();
            lowered == "enum" || lowered.starts_with("enum:") || lowered.starts_with("default ")
        })
        // `position` counts from `from`; convert back to an index into `raw`.
        .map_or(raw.len(), |offset| from + offset)
}
2347
2348/// Map a lowercase shape keyword to its [`Shape`].
2349fn shape_from_str(s: &str) -> Option<Shape> {
2350    match s {
2351        "string" => Some(Shape::String),
2352        "int" => Some(Shape::Int),
2353        "bool" => Some(Shape::Bool),
2354        "date" => Some(Shape::Date),
2355        "email" => Some(Shape::Email),
2356        "currency" => Some(Shape::Currency),
2357        "url" => Some(Shape::Url),
2358        _ => None,
2359    }
2360}
2361
/// ATX heading level of `line`: the length of its leading `#` run (1–6), or 0
/// when the line is not a heading.
///
/// Following CommonMark, a heading may be indented by at most three spaces,
/// its `#` run is at most six long, and the run must be followed by a space,
/// a tab, or the end of the line.
fn heading_level(line: &str) -> u8 {
    let unindented = line.trim_start_matches(' ');
    if line.len() - unindented.len() > 3 {
        return 0;
    }
    let after_run = unindented.trim_start_matches('#');
    let run = unindented.len() - after_run.len();
    // `#hashtag` and `#5` are text, not headings.
    let terminated = after_run.is_empty() || after_run.starts_with([' ', '\t']);
    if (1..=6).contains(&run) && terminated {
        run as u8
    } else {
        0
    }
}
2382
/// Text of an ATX heading: whatever follows the opening `#` run, trimmed, with
/// a trailing ATX *closing* sequence removed (`## Title ##` → `Title`).
///
/// `level` is the length of the opening `#` run, as returned by
/// [`heading_level`] for the same `line`.
///
/// Per CommonMark, a trailing run of `#` counts as a closing sequence only when
/// it is **preceded by a space or tab**, or when nothing precedes it. A `#`
/// that abuts the preceding word is literal text: `## C#` → `C#`,
/// `## F#` → `F#`, `## issue-123#` → `issue-123#`. Stripping those
/// unconditionally would corrupt heading text and, since `parse_db_md` uses
/// the heading verbatim as the schema type key, bind a `### c#` schema to
/// `type: c` instead of `type: c#`.
fn heading_text(line: &str, level: u8) -> String {
    let indent = line.len() - line.trim_start_matches(' ').len();
    let content = line[indent + usize::from(level)..].trim();

    // Candidate text with any trailing `#` run peeled off.
    let body = content.trim_end_matches('#');
    let has_trailing_run = body.len() < content.len();
    // The run closes the heading only if it stands apart from the text.
    let run_is_detached = body.is_empty() || body.ends_with([' ', '\t']);

    let text = if has_trailing_run && run_is_detached {
        // Genuine closing sequence (`## Title ##`, `## ##`): drop it and the
        // whitespace before it.
        body.trim_end()
    } else {
        // No trailing `#`, or one that abuts content (`## C#`): keep as is.
        content
    };
    text.to_string()
}
2415
/// Detect a fenced-code-block opener, returning `(fence byte, run length)`.
///
/// The line may be indented by at most three spaces and must then begin with
/// a run of three or more identical fence characters: backticks or tildes.
/// For a backtick fence the remainder of the line (the info string) must not
/// contain another backtick. Anything else yields `None`.
fn opening_fence(line: &str) -> Option<(u8, usize)> {
    let rest = line.trim_start_matches(' ');
    if line.len() - rest.len() > 3 {
        return None;
    }
    let byte = rest
        .bytes()
        .next()
        .filter(|b| matches!(*b, b'`' | b'~'))?;
    let info = rest.trim_start_matches(char::from(byte));
    let run = rest.len() - info.len();
    if run < 3 {
        return None;
    }
    // A backtick fence's info string may not itself contain a backtick.
    if byte == b'`' && info.contains('`') {
        return None;
    }
    Some((byte, run))
}
2437
/// Whether `line` closes the open fence described by `fence`
/// (`(fence byte, opening run length)`).
///
/// A closing line is indented by at most three spaces, starts with a run of
/// the same fence character at least as long as the opening run, and has
/// nothing but whitespace after that run.
fn is_closing_fence(line: &str, fence: (u8, usize)) -> bool {
    let (byte, open_len) = fence;
    let rest = line.trim_start_matches(' ');
    if line.len() - rest.len() > 3 {
        return false;
    }
    let tail = rest.trim_start_matches(char::from(byte));
    let run = rest.len() - tail.len();
    run >= open_len && tail.trim().is_empty()
}
2453
/// Prose of a section: the text following the first line (the heading) of
/// `section_body`, with surrounding whitespace trimmed. A body consisting of
/// the heading line alone yields an empty string.
fn section_prose(section_body: &str) -> String {
    section_body
        .split_once('\n')
        .map(|(_heading, after)| after.trim().to_string())
        .unwrap_or_default()
}
2461
/// Bullet lines of a section body. The first line (the heading) is skipped.
/// Every remaining line is trimmed on both sides and kept only if it then
/// starts with a bullet marker followed by a space (`- `, `* ` or `+ `).
/// Lines are returned in order, marker included.
fn bullet_lines(section_body: &str) -> Vec<String> {
    const MARKERS: [&str; 3] = ["- ", "* ", "+ "];

    let mut bullets = Vec::new();
    // `skip(1)` drops the heading line.
    for raw_line in section_body.lines().skip(1) {
        let line = raw_line.trim();
        if MARKERS.iter().any(|&marker| line.starts_with(marker)) {
            bullets.push(line.to_string());
        }
    }
    bullets
}
2473
/// Return the meaningful prefix of a bullet's content: everything before the
/// earliest comment separator, trimmed.
///
/// Four separator spellings are recognized, each with a space on both sides:
/// em-dash (` — `), double hyphen (` -- `), en-dash (` – `) and the plain
/// ASCII hyphen (` - `). The single-hyphen form matters: without it a bullet
/// such as `records/decisions/q3.md - finalized` kept its comment as part of
/// the frozen path, the entry never matched, and the freeze failed OPEN.
/// Store-relative paths are `/`-joined and spaceless, so a ` - ` never occurs
/// inside legitimate path text.
///
/// Content without any separator is returned whole (trimmed).
fn strip_bullet_comment(content: &str) -> &str {
    const SEPARATORS: [&str; 4] = [" — ", " -- ", " – ", " - "];

    let cut = SEPARATORS
        .iter()
        .filter_map(|&sep| content.find(sep))
        .min()
        .unwrap_or(content.len());
    content[..cut].trim()
}
2491
/// Content of a bullet line with its marker removed. The line is trimmed, one
/// leading `- `, `* ` or `+ ` marker is stripped if present, and the remainder
/// is trimmed again. A line without a marker is returned trimmed.
fn bullet_content(bullet: &str) -> &str {
    let trimmed = bullet.trim();
    for marker in ["- ", "* ", "+ "] {
        if let Some(rest) = trimmed.strip_prefix(marker) {
            return rest.trim();
        }
    }
    trimmed
}
2501
2502/// Extract a store-relative path from a Frozen-pages bullet. The path may be
2503/// wrapped in backticks and followed by an em-dash comment.
2504fn extract_path_bullet(bullet: &str) -> String {
2505    let content = bullet_content(bullet);
2506    // Prefer a backtick-delimited span if present.
2507    if let Some(start) = content.find('`') {
2508        if let Some(end_rel) = content[start + 1..].find('`') {
2509            return content[start + 1..start + 1 + end_rel].trim().to_string();
2510        }
2511    }
2512    // Otherwise take the text up to a comment separator, stripping quotes.
2513    strip_bullet_comment(content)
2514        .trim_matches('"')
2515        .trim_matches('\'')
2516        .trim()
2517        .to_string()
2518}
2519
2520/// Extract a comma-separated type list from an Ignored-types bullet, stripping
2521/// backticks/quotes and any trailing em-dash comment.
2522fn extract_type_list_bullet(bullet: &str) -> Vec<String> {
2523    let content = strip_bullet_comment(bullet_content(bullet));
2524    content
2525        .split(',')
2526        .map(|t| {
2527            t.trim()
2528                .trim_matches('`')
2529                .trim_matches('"')
2530                .trim_matches('\'')
2531                .trim()
2532                .to_string()
2533        })
2534        .filter(|t| !t.is_empty())
2535        .collect()
2536}
2537
2538#[cfg(test)]
2539mod tests {
2540    use super::*;
2541    use std::path::Path;
2542    use tempfile::tempdir;
2543
2544    // ── Config::frozen_match (the single write-surface policy matcher) ───────
2545
2546    #[test]
2547    fn frozen_match_is_md_insensitive_both_directions() {
2548        // A policy entry stored WITHOUT `.md` (the natural extensionless
2549        // spelling `parse_db_md` keeps verbatim) must still match a `.md`
2550        // write target — the regression every write surface had.
2551        let cfg = Config {
2552            frozen_pages: vec![PathBuf::from("records/decisions/q1")],
2553            ..Config::default()
2554        };
2555        assert_eq!(
2556            cfg.frozen_match(Path::new("records/decisions/q1.md")),
2557            Some(PathBuf::from("records/decisions/q1")),
2558            "extensionless policy entry must freeze the .md file"
2559        );
2560        assert!(cfg.is_frozen(Path::new("records/decisions/q1.md")));
2561
2562        // The symmetric case: a policy entry WITH `.md` matches a bare target.
2563        let cfg = Config {
2564            frozen_pages: vec![PathBuf::from("records/decisions/q1.md")],
2565            ..Config::default()
2566        };
2567        assert_eq!(
2568            cfg.frozen_match(Path::new("records/decisions/q1")),
2569            Some(PathBuf::from("records/decisions/q1.md")),
2570        );
2571        // And the same-spelling cases still match.
2572        assert!(cfg.is_frozen(Path::new("records/decisions/q1.md")));
2573    }
2574
2575    #[test]
2576    fn frozen_match_drops_leading_dot_slash() {
2577        let cfg = Config {
2578            frozen_pages: vec![PathBuf::from("records/decisions/q1.md")],
2579            ..Config::default()
2580        };
2581        assert!(cfg.is_frozen(Path::new("./records/decisions/q1.md")));
2582        assert!(cfg.is_frozen(Path::new("./records/decisions/q1")));
2583    }
2584
2585    #[test]
2586    fn frozen_match_returns_none_for_unlisted_and_prefix_paths() {
2587        let cfg = Config {
2588            frozen_pages: vec![PathBuf::from("records/decisions/q1")],
2589            ..Config::default()
2590        };
2591        assert!(cfg
2592            .frozen_match(Path::new("records/decisions/q2.md"))
2593            .is_none());
2594        // A prefix is not a match: `q1` must not freeze `q1-draft`.
2595        assert!(cfg
2596            .frozen_match(Path::new("records/decisions/q1-draft.md"))
2597            .is_none());
2598        assert!(!cfg.is_frozen(Path::new("records/decisions/q11.md")));
2599    }
2600
2601    // ── split_frontmatter ───────────────────────────────────────────────────
2602
2603    #[test]
2604    fn split_frontmatter_separates_yaml_and_verbatim_body() {
2605        let text = "---\ntype: contact\nsummary: x\n---\n# Heading\n\nBody line.\n";
2606        let p = split_frontmatter(text, Path::new("f.md")).unwrap();
2607        assert_eq!(p.frontmatter_yaml, "type: contact\nsummary: x\n");
2608        // Body is everything after the closing fence's newline, byte-for-byte.
2609        assert_eq!(p.body, "# Heading\n\nBody line.\n");
2610    }
2611
2612    #[test]
2613    fn split_frontmatter_preserves_body_without_trailing_newline() {
2614        let text = "---\ntype: x\n---\nno trailing newline";
2615        let p = split_frontmatter(text, Path::new("f.md")).unwrap();
2616        assert_eq!(p.body, "no trailing newline");
2617    }
2618
2619    #[test]
2620    fn split_frontmatter_empty_body_when_nothing_after_fence() {
2621        let text = "---\ntype: x\n---\n";
2622        let p = split_frontmatter(text, Path::new("f.md")).unwrap();
2623        assert_eq!(p.body, "");
2624    }
2625
2626    #[test]
2627    fn split_frontmatter_missing_opening_fence_errors() {
2628        let text = "# No frontmatter here\ntype: x\n";
2629        let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
2630        assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
2631    }
2632
2633    #[test]
2634    fn split_frontmatter_leading_content_before_fence_rejected() {
2635        // The opening fence must be the very first line; a blank line first is
2636        // not allowed.
2637        let text = "\n---\ntype: x\n---\nbody";
2638        let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
2639        assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
2640    }
2641
2642    #[test]
2643    fn split_frontmatter_unterminated_block_errors() {
2644        let text = "---\ntype: x\nsummary: y\n";
2645        let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
2646        assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
2647    }
2648
2649    // ── Frontmatter::parse ───────────────────────────────────────────────────
2650
2651    #[test]
2652    fn parse_populates_typed_fields_and_routes_unknowns_to_extra() {
2653        let yaml = "type: contact\nid: sarah-chen\nsummary: Director of Ops\nstatus: active\ntags: [vip, renewal]\nemail: sarah@northstar.io\nrole: Director";
2654        let fm = Frontmatter::parse(yaml, Path::new("f.md")).unwrap();
2655        assert_eq!(fm.type_.as_deref(), Some("contact"));
2656        assert_eq!(fm.id.as_deref(), Some("sarah-chen"));
2657        assert_eq!(fm.summary.as_deref(), Some("Director of Ops"));
2658        assert_eq!(fm.status.as_deref(), Some("active"));
2659        assert_eq!(fm.tags, vec!["vip".to_string(), "renewal".to_string()]);
2660        // Type-specific fields are NOT promoted to typed slots.
2661        assert!(fm.type_.is_some() && !fm.extra.contains_key("type"));
2662        assert!(!fm.extra.contains_key("tags"));
2663        assert_eq!(
2664            fm.extra.get("email").and_then(|v| v.as_str()),
2665            Some("sarah@northstar.io")
2666        );
2667        assert_eq!(
2668            fm.extra.get("role").and_then(|v| v.as_str()),
2669            Some("Director")
2670        );
2671    }
2672
2673    #[test]
2674    fn parse_reads_rfc3339_timestamps() {
2675        let yaml =
2676            "type: email\ncreated: 2026-05-27T08:00:00-07:00\nupdated: 2026-05-28T09:30:00-07:00";
2677        let fm = Frontmatter::parse(yaml, Path::new("f.md")).unwrap();
2678        let created = fm.created.expect("created parsed");
2679        // -07:00 offset is 7 * 3600 seconds west.
2680        assert_eq!(created.offset().utc_minus_local(), 7 * 3600);
2681        assert_eq!(created.to_rfc3339(), "2026-05-27T08:00:00-07:00");
2682        assert!(fm.updated.is_some());
2683    }
2684
2685    #[test]
2686    fn parse_rejects_non_rfc3339_timestamp() {
2687        // A date-only value is not a full RFC3339 timestamp; created/updated
2688        // require the full form.
2689        let yaml = "type: email\ncreated: 2026-05-27";
2690        let err = Frontmatter::parse(yaml, Path::new("bad.md")).unwrap_err();
2691        match err {
2692            ParseError::BadTimestamp { key, value, .. } => {
2693                assert_eq!(key, "created");
2694                assert_eq!(value, "2026-05-27");
2695            }
2696            other => panic!("expected BadTimestamp, got {other:?}"),
2697        }
2698    }
2699
2700    #[test]
2701    fn parse_malformed_yaml_errors() {
2702        // Unclosed flow mapping is invalid YAML.
2703        let yaml = "type: contact\n  bad: : :\n- nope";
2704        let err = Frontmatter::parse(yaml, Path::new("bad.md")).unwrap_err();
2705        assert!(matches!(err, ParseError::MalformedYaml { .. }));
2706    }
2707
2708    #[test]
2709    fn frontmatter_with_yaml_tag_on_mapping_does_not_panic() {
2710        // Regression: a YAML tag on the top-level mapping made the old
2711        // `expect_err` path PANIC, because a tagged mapping deserializes to a
2712        // `Mapping` just fine. It must now be handled — accepted as the inner
2713        // mapping, never a panic.
2714        let fm = Frontmatter::parse("!mytag\ntype: contact\nsummary: hi\n", Path::new("x.md"))
2715            .expect("tagged-mapping frontmatter must parse, not panic");
2716        assert_eq!(fm.type_.as_deref(), Some("contact"));
2717        // A genuine scalar/sequence top level is still malformed (and still
2718        // doesn't panic).
2719        assert!(Frontmatter::parse("- a\n- b\n", Path::new("x.md")).is_err());
2720    }
2721
2722    #[test]
2723    fn parse_empty_block_is_empty_frontmatter() {
2724        let fm = Frontmatter::parse("", Path::new("f.md")).unwrap();
2725        assert_eq!(fm, Frontmatter::default());
2726    }
2727
2728    #[test]
2729    fn parse_scalar_top_level_is_malformed() {
2730        // A bare scalar at the top level is not a frontmatter mapping.
2731        let err = Frontmatter::parse("just a string", Path::new("f.md")).unwrap_err();
2732        assert!(matches!(err, ParseError::MalformedYaml { .. }));
2733    }
2734
2735    // ── to_yaml canonical order ──────────────────────────────────────────────
2736
2737    #[test]
2738    fn to_yaml_emits_canonical_key_order() {
2739        let mut fm = Frontmatter {
2740            type_: Some("contact".into()),
2741            id: Some("sarah-chen".into()),
2742            summary: Some("Director of Ops".into()),
2743            status: Some("active".into()),
2744            tags: vec!["vip".into()],
2745            created: Some(DateTime::parse_from_rfc3339("2026-05-27T08:00:00-07:00").unwrap()),
2746            updated: Some(DateTime::parse_from_rfc3339("2026-05-28T09:30:00-07:00").unwrap()),
2747            ..Default::default()
2748        };
2749        // Two type-specific fields, inserted in NON-alphabetical order to prove
2750        // the writer sorts them (BTreeMap) between the universal head and tail.
2751        fm.extra
2752            .insert("role".into(), Value::String("Director".into()));
2753        fm.extra.insert(
2754            "company".into(),
2755            Value::String("[[records/companies/northstar]]".into()),
2756        );
2757
2758        let yaml = fm.to_yaml();
2759        let keys: Vec<&str> = yaml
2760            .lines()
2761            .filter(|l| !l.starts_with(['-', ' ']) && l.contains(':'))
2762            .map(|l| l.split(':').next().unwrap())
2763            .collect();
2764        assert_eq!(
2765            keys,
2766            vec![
2767                "type", "id", "created", "updated", "summary", // universal head
2768                "company", "role",   // type-specific, sorted
2769                "status", // universal tail
2770                "tags",
2771            ],
2772            "canonical order violated; got:\n{yaml}"
2773        );
2774        // Timestamps round-trip as RFC3339 strings (YAML may quote them).
2775        assert!(
2776            yaml.contains("2026-05-27T08:00:00-07:00"),
2777            "created timestamp missing; got:\n{yaml}"
2778        );
2779        // The value re-parses to the same instant regardless of quoting.
2780        let reparsed = Frontmatter::parse(&yaml, Path::new("rt.md")).unwrap();
2781        assert_eq!(reparsed.created, fm.created);
2782        assert_eq!(reparsed.updated, fm.updated);
2783    }
2784
2785    /// Format v0.4: a minted-form (lowercase ULID) `id` round-trips verbatim
2786    /// through parse → to_yaml → parse and holds its canonical head slot —
2787    /// directly after `type` (and after `meta-type` when one is present),
2788    /// before `created`. Pins the emit order for the id-carrying record shape
2789    /// `dbmd write` produces.
2790    #[test]
2791    fn ulid_id_roundtrips_verbatim_in_head_position() {
2792        let ulid = "01j5qc3v9k4ym8rwbn2tqe6f7d";
2793        let yaml = format!(
2794            "type: profile\nmeta-type: conclusion\nid: {ulid}\ncreated: 2026-05-27T08:00:00-07:00\nupdated: 2026-05-27T08:00:00-07:00\nsummary: x\n"
2795        );
2796        let fm = Frontmatter::parse(&yaml, Path::new("rt.md")).unwrap();
2797        assert_eq!(
2798            fm.id.as_deref(),
2799            Some(ulid),
2800            "id must parse into the typed field"
2801        );
2802
2803        let emitted = fm.to_yaml();
2804        let keys: Vec<&str> = emitted
2805            .lines()
2806            .filter(|l| !l.starts_with(['-', ' ']) && l.contains(':'))
2807            .map(|l| l.split(':').next().unwrap())
2808            .collect();
2809        assert_eq!(
2810            keys,
2811            vec!["type", "meta-type", "id", "created", "updated", "summary"],
2812            "id must sit in the universal head; got:\n{emitted}"
2813        );
2814        assert!(
2815            emitted.contains(&format!("id: {ulid}")),
2816            "ULID must emit unquoted and verbatim; got:\n{emitted}"
2817        );
2818        let reparsed = Frontmatter::parse(&emitted, Path::new("rt.md")).unwrap();
2819        assert_eq!(reparsed.id.as_deref(), Some(ulid));
2820        assert_eq!(reparsed, fm, "round-trip must be lossless");
2821    }
2822
2823    #[test]
2824    fn to_yaml_omits_absent_optional_fields() {
2825        let fm = Frontmatter {
2826            type_: Some("note".into()),
2827            ..Default::default()
2828        };
2829        let yaml = fm.to_yaml();
2830        assert!(yaml.contains("type: note"));
2831        assert!(!yaml.contains("status"));
2832        assert!(!yaml.contains("tags"));
2833        assert!(!yaml.contains("summary"));
2834    }
2835
2836    // ── Regression: non-string scalar universal fields round-trip (finding #1) ─
2837
2838    #[test]
2839    fn regression_parse_preserves_non_string_scalar_universal_fields() {
2840        // A hand/externally-authored file whose universal fields are bare
2841        // scalars YAML reads as Number/Bool — `id: 100`, `summary: 2026`,
2842        // `status: 0`, `type: 42` — must be PRESERVED as their string form, not
2843        // read as None. Before the fix, `v.as_str()` returned None for these and
2844        // the matched arm discarded the value entirely (never reaching `extra`).
2845        let yaml = "type: 42\nid: 100\nsummary: 2026\nstatus: 0";
2846        let fm = Frontmatter::parse(yaml, Path::new("x.md")).unwrap();
2847        assert_eq!(fm.type_.as_deref(), Some("42"), "type scalar dropped");
2848        assert_eq!(fm.id.as_deref(), Some("100"), "id scalar dropped");
2849        assert_eq!(
2850            fm.summary.as_deref(),
2851            Some("2026"),
2852            "summary scalar dropped"
2853        );
2854        assert_eq!(fm.status.as_deref(), Some("0"), "status scalar dropped");
2855        // The values must surface through the public `get` accessor too.
2856        assert_eq!(
2857            fm.get("summary")
2858                .and_then(|v| v.as_str().map(str::to_string)),
2859            Some("2026".to_string())
2860        );
2861    }
2862
2863    #[test]
2864    fn regression_format_round_trip_does_not_delete_numeric_frontmatter() {
2865        // The exact finding-#1 trigger: `dbmd format` is read_file -> write_file.
2866        // A file whose `id`/`summary`/`status` are bare numeric scalars must
2867        // still carry those fields after the canonical re-emit. Before the fix,
2868        // the lines were silently deleted from disk (only `type` survived).
2869        let dir = tempdir().unwrap();
2870        let path = dir.path().join("x.md");
2871        let original = "---\ntype: contact\nid: 100\nsummary: 2026\nstatus: 0\n---\nbody\n";
2872        std::fs::write(&path, original).unwrap();
2873
2874        // Re-emit through the canonical writer, exactly as `dbmd format` does.
2875        let (fm, body) = read_file(&path).unwrap();
2876        write_file(&path, &fm, &body).unwrap();
2877
2878        let after = std::fs::read_to_string(&path).unwrap();
2879        // None of the four fields may vanish; they survive as string scalars.
2880        let reparsed = Frontmatter::parse(
2881            &split_frontmatter(&after, &path).unwrap().frontmatter_yaml,
2882            &path,
2883        )
2884        .unwrap();
2885        assert_eq!(reparsed.type_.as_deref(), Some("contact"));
2886        assert_eq!(reparsed.id.as_deref(), Some("100"), "id deleted by format");
2887        assert_eq!(
2888            reparsed.summary.as_deref(),
2889            Some("2026"),
2890            "summary deleted by format"
2891        );
2892        assert_eq!(
2893            reparsed.status.as_deref(),
2894            Some("0"),
2895            "status deleted by format"
2896        );
2897        // The body is preserved verbatim.
2898        assert_eq!(body, "body\n");
2899    }
2900
2901    #[test]
2902    fn regression_format_round_trip_preserves_oversized_integer_frontmatter() {
2903        // Adversarial review #6: a bare integer literal beyond i64/u64 range must
2904        // survive `dbmd format` (read_file -> write_file) byte-for-byte. Before
2905        // the fix, serde_norway silently truncated `> u128::MAX` to f64 (`999…9`
2906        // -> `1e39`) and hard-rejected `(u64::MAX, u128::MAX]` — corrupting an
2907        // imported numeric ID and breaking the unknown-field round-trip contract.
2908        let dir = tempdir().unwrap();
2909        let path = dir.path().join("x.md");
2910        let big = "999999999999999999999999999999999999999"; // 39 digits, > u128::MAX
2911        let mid = "99999999999999999999"; // 20 digits, in (u64::MAX, u128::MAX]
2912        let original = format!(
2913            "---\ntype: contact\nsummary: x\naccount_number: {big}\nid_num: {mid}\n---\nbody\n"
2914        );
2915        std::fs::write(&path, &original).unwrap();
2916
2917        // Two round-trips: the value must survive verbatim AND be idempotent.
2918        for _ in 0..2 {
2919            let (fm, body) = read_file(&path).expect("oversized-int frontmatter must parse");
2920            write_file(&path, &fm, &body).unwrap();
2921            let after = std::fs::read_to_string(&path).unwrap();
2922            assert!(
2923                after.contains(big),
2924                "39-digit integer corrupted by format:\n{after}"
2925            );
2926            assert!(
2927                after.contains(mid),
2928                "20-digit integer corrupted by format:\n{after}"
2929            );
2930            assert!(
2931                !after.to_lowercase().contains("1e39"),
2932                "integer was truncated to a float:\n{after}"
2933            );
2934            assert_eq!(body, "body\n", "body must be preserved verbatim");
2935        }
2936    }
2937
2938    #[test]
2939    fn oversized_int_literal_detection_is_precise() {
2940        // In range (serde_norway handles losslessly) → never quoted.
2941        for ok in [
2942            "0",
2943            "42",
2944            "-17",
2945            "9223372036854775807",
2946            "18446744073709551615",
2947            "12.5",
2948            "007",
2949            "abc",
2950            "",
2951        ] {
2952            assert!(
2953                !is_oversized_int_literal(ok),
2954                "must NOT be flagged oversized: {ok:?}"
2955            );
2956        }
2957        // Beyond i64/u64 → quoted to preserve the literal.
2958        for big in [
2959            "18446744073709551616",                    // u64::MAX + 1
2960            "99999999999999999999",                    // 20 digits
2961            "999999999999999999999999999999999999999", // 39 digits
2962            "-9999999999999999999999",                 // very negative
2963        ] {
2964            assert!(
2965                is_oversized_int_literal(big),
2966                "must be flagged oversized: {big:?}"
2967            );
2968        }
2969    }
2970
2971    #[test]
2972    fn regression_oversized_int_in_flow_sequence_round_trips() {
2973        // The single-line flow SEQUENCE form regressed: an oversized int inside
2974        // `ids: [123…]` reached serde_norway un-quoted and hard-failed the whole
2975        // block as MalformedYaml (`as u128`), making every read surface
2976        // (format / fm get/set / link / validate) unable to read the file at all.
2977        // It must now parse, preserve the literal verbatim, and be idempotent.
2978        let dir = tempdir().unwrap();
2979        let path = dir.path().join("f.md");
2980        let big = "123456789012345678901234567890"; // 30 digits, > u128::MAX
2981        let original = format!("---\ntype: note\nsummary: x\nids: [{big}]\n---\nbody\n");
2982        std::fs::write(&path, &original).unwrap();
2983
2984        for _ in 0..2 {
2985            let (fm, body) = read_file(&path).expect("flow-sequence oversized int must parse");
2986            // The list value survives in `extra`, holding the literal as a string.
2987            let ids = fm.extra.get("ids").expect("ids field preserved");
2988            assert!(
2989                matches!(ids, Value::Sequence(_)),
2990                "ids should stay a sequence, got: {ids:?}"
2991            );
2992            write_file(&path, &fm, &body).unwrap();
2993            let after = std::fs::read_to_string(&path).unwrap();
2994            assert!(
2995                after.contains(big),
2996                "30-digit integer in flow sequence corrupted by format:\n{after}"
2997            );
2998            assert!(
2999                !after.to_lowercase().contains("1.234"),
3000                "integer was truncated to a float:\n{after}"
3001            );
3002            assert_eq!(body, "body\n", "body must be preserved verbatim");
3003        }
3004    }
3005
3006    #[test]
3007    fn regression_oversized_int_in_flow_mapping_round_trips() {
3008        // The single-line flow MAPPING form regressed identically:
3009        // `meta: {ext: 123…}` hard-failed the block. It must now parse and the
3010        // oversized value must survive verbatim.
3011        let dir = tempdir().unwrap();
3012        let path = dir.path().join("m.md");
3013        let big = "123456789012345678901234567890";
3014        let original = format!("---\ntype: note\nsummary: x\nmeta: {{ext: {big}}}\n---\nbody\n");
3015        std::fs::write(&path, &original).unwrap();
3016
3017        for _ in 0..2 {
3018            let (fm, body) = read_file(&path).expect("flow-mapping oversized int must parse");
3019            let meta = fm.extra.get("meta").expect("meta field preserved");
3020            assert!(
3021                matches!(meta, Value::Mapping(_)),
3022                "meta should stay a mapping, got: {meta:?}"
3023            );
3024            write_file(&path, &fm, &body).unwrap();
3025            let after = std::fs::read_to_string(&path).unwrap();
3026            assert!(
3027                after.contains(big),
3028                "oversized integer in flow mapping corrupted by format:\n{after}"
3029            );
3030            assert_eq!(body, "body\n", "body must be preserved verbatim");
3031        }
3032    }
3033
3034    #[test]
3035    fn regression_oversized_int_in_mixed_flow_collection_round_trips() {
3036        // A flow collection mixing an oversized int with an in-range int and a
3037        // string: only the oversized int is quoted; the in-range int stays a
3038        // number, the string stays a string, and the whole thing parses.
3039        let dir = tempdir().unwrap();
3040        let path = dir.path().join("mix.md");
3041        let big = "123456789012345678901234567890";
3042        let original = format!(
3043            "---\ntype: note\nsummary: x\nvals: [{big}, 42, hello, \"world\"]\n---\nbody\n"
3044        );
3045        std::fs::write(&path, &original).unwrap();
3046
3047        let (fm, body) = read_file(&path).expect("mixed flow collection must parse");
3048        let Value::Sequence(seq) = fm.extra.get("vals").expect("vals preserved") else {
3049            panic!("vals should be a sequence");
3050        };
3051        assert_eq!(seq.len(), 4, "all four entries preserved");
3052        // The oversized literal narrows to a string; the in-range int stays a
3053        // number; the bare and quoted strings stay strings.
3054        assert_eq!(seq[0].as_str(), Some(big), "oversized int -> string");
3055        assert_eq!(seq[1].as_i64(), Some(42), "in-range int stays a number");
3056        assert_eq!(seq[2].as_str(), Some("hello"));
3057        assert_eq!(seq[3].as_str(), Some("world"));
3058
3059        write_file(&path, &fm, &body).unwrap();
3060        let after = std::fs::read_to_string(&path).unwrap();
3061        assert!(after.contains(big), "oversized int lost:\n{after}");
3062        assert_eq!(body, "body\n");
3063    }
3064
3065    #[test]
3066    fn regression_multiple_oversized_ints_in_one_flow_line_round_trip() {
3067        // Two oversized literals on the same flow line — and a nested collection —
3068        // must each be quoted in the single left-to-right pass.
3069        let dir = tempdir().unwrap();
3070        let path = dir.path().join("multi.md");
3071        let a = "99999999999999999999"; // 20 digits
3072        let b = "123456789012345678901234567890"; // 30 digits
3073        let original =
3074            format!("---\ntype: note\nsummary: x\nm: {{a: {a}, nested: [{b}, 7]}}\n---\nbody\n");
3075        std::fs::write(&path, &original).unwrap();
3076
3077        let (fm, body) = read_file(&path).expect("multi oversized flow must parse");
3078        write_file(&path, &fm, &body).unwrap();
3079        let after = std::fs::read_to_string(&path).unwrap();
3080        assert!(after.contains(a), "first oversized int lost:\n{after}");
3081        assert!(after.contains(b), "second oversized int lost:\n{after}");
3082        assert_eq!(body, "body\n");
3083    }
3084
3085    #[test]
3086    fn regression_flow_with_only_in_range_and_strings_is_byte_exact() {
3087        // A flow collection with NO oversized int must round-trip byte-for-byte:
3088        // the pre-quoter must not touch in-range ints, strings, or floats. We
3089        // assert on the prepared-YAML stage so an unaffected line is left as the
3090        // borrowed input (no rewrite, no quoting drift).
3091        let yaml = "type: note\nids: [1, 2, 3]\nmeta: {ext: 42, name: bob}\nf: [1.5, 2.5]\n";
3092        let prepared = quote_oversized_integers(yaml);
3093        assert_eq!(
3094            prepared.as_ref(),
3095            yaml,
3096            "in-range flow collections must be left byte-exact"
3097        );
3098        // And it still parses cleanly with the expected numeric types intact.
3099        let fm = Frontmatter::parse(yaml, Path::new("n.md")).unwrap();
3100        let Value::Sequence(ids) = fm.extra.get("ids").unwrap() else {
3101            panic!("ids should be a sequence");
3102        };
3103        assert_eq!(ids[0].as_i64(), Some(1));
3104    }
3105
3106    #[test]
3107    fn quote_oversized_ints_in_flow_skips_quoted_and_digit_strings() {
3108        // A quoted scalar whose contents happen to be a long digit run must NOT
3109        // be re-quoted or otherwise altered — it is already a string. A flow with
3110        // only such strings yields no change (None).
3111        let flow = "[\"123456789012345678901234567890\", '99999999999999999999']";
3112        assert_eq!(
3113            quote_oversized_ints_in_flow(flow),
3114            None,
3115            "already-quoted digit strings must be left untouched"
3116        );
3117        // A bare oversized int alongside a quoted one: only the bare one is quoted.
3118        let flow2 = "[123456789012345678901234567890, \"already\"]";
3119        let out = quote_oversized_ints_in_flow(flow2).expect("bare int should be quoted");
3120        assert_eq!(out, "['123456789012345678901234567890', \"already\"]");
3121    }
3122
3123    // ── Regression: BOM-prefixed files parse like store/index (finding #19) ────
3124
3125    #[test]
3126    fn regression_split_frontmatter_tolerates_leading_utf8_bom() {
3127        // A BOM-prefixed file (EF BB BF + `---\n...`) is walked and indexed by
3128        // `dbmd index` (store/index strip the BOM) but, before the fix, every
3129        // write/edit surface routed through `read_file` hard-failed with
3130        // MissingFrontmatter. `split_frontmatter` must now strip a single leading
3131        // U+FEFF and emit a BOM-free body.
3132        let text = "\u{feff}---\ntype: note\nsummary: x\n---\nbody\n";
3133        let parsed = split_frontmatter(text, Path::new("note.md")).unwrap();
3134        assert_eq!(parsed.frontmatter_yaml, "type: note\nsummary: x\n");
3135        // Body never carries the BOM forward into the canonical writer.
3136        assert_eq!(parsed.body, "body\n");
3137        assert!(!parsed.body.starts_with('\u{feff}'));
3138    }
3139
3140    #[test]
3141    fn regression_read_file_parses_bom_prefixed_file() {
3142        // End-to-end through the same `read_file` path `dbmd fm get/set`,
3143        // `format`, `link`, and `write` use. Before the fix this returned
3144        // Err(MissingFrontmatter) on a file the catalog had already indexed.
3145        let dir = tempdir().unwrap();
3146        let path = dir.path().join("note.md");
3147        std::fs::write(&path, "\u{feff}---\ntype: note\nsummary: x\n---\nbody\n").unwrap();
3148
3149        let (fm, body) = read_file(&path).expect("BOM-prefixed file must parse");
3150        assert_eq!(fm.type_.as_deref(), Some("note"));
3151        assert_eq!(fm.summary.as_deref(), Some("x"));
3152        assert_eq!(body, "body\n");
3153    }
3154
3155    #[test]
3156    fn to_yaml_preserves_unquoted_scalar_wiki_link_round_trip() {
3157        // Regression (PRIMARY): the SPEC-canonical scalar wiki-link is the
3158        // *unquoted* inline `company: [[records/companies/northstar]]`
3159        // (SPEC § Linking, the worked `contact` example). YAML parses it to the
3160        // nested `Seq[Seq[String]]` shape. Before the fix, `to_yaml` re-emitted
3161        // it block-style as
3162        //     company:
3163        //     - - records/companies/northstar
3164        // — the `[[ ]]` brackets GONE — so a no-op re-emit (`dbmd format`, and
3165        // any `fm set` / `link` write) silently destroyed the link.
3166        let yaml = "type: contact\ncompany: [[records/companies/northstar]]";
3167        let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
3168        // Sanity: `parse` now disambiguates the inline-link source form at read
3169        // time (the genuine `Seq[Seq[String]]` of a 2D array no longer gets
3170        // collapsed at emit), so the inline link is stored as the canonical
3171        // scalar `String("[[x]]")`.
3172        assert_eq!(
3173            fm.extra.get("company").and_then(|v| v.as_str()),
3174            Some("[[records/companies/northstar]]")
3175        );
3176
3177        let out = fm.to_yaml();
3178        // The link must survive as a quoted inline scalar — brackets intact, and
3179        // never the bracket-less block sequence `- - records/...`.
3180        assert!(
3181            out.contains("[[records/companies/northstar]]"),
3182            "canonical writer dropped the wiki-link brackets; got:\n{out}"
3183        );
3184        assert!(
3185            !out.contains("- - "),
3186            "canonical writer emitted a nested block sequence (link corrupted); got:\n{out}"
3187        );
3188
3189        // And it round-trips: re-parsing the emitted YAML still surfaces exactly
3190        // one link with the right target (the edge graph/backlinks rely on).
3191        let reparsed = Frontmatter::parse(&out, Path::new("c.md")).unwrap();
3192        let fields = reparsed.link_fields();
3193        let links: Vec<(&str, &str, Option<&str>)> = fields
3194            .iter()
3195            .map(|(k, l)| (k.as_str(), l.target.as_str(), l.display.as_deref()))
3196            .collect();
3197        assert_eq!(
3198            links,
3199            vec![("company", "records/companies/northstar", None)]
3200        );
3201
3202        // A second re-emit is a fixed point — no progressive corruption across
3203        // repeated curator-loop writes.
3204        assert_eq!(
3205            reparsed.to_yaml(),
3206            out,
3207            "to_yaml is not idempotent on links"
3208        );
3209    }
3210
3211    #[test]
3212    fn to_yaml_preserves_unquoted_scalar_link_with_display() {
3213        // The `|display` segment must survive the unquoted-inline round-trip too.
3214        let yaml = "type: contact\ncompany: [[records/companies/northstar|Northstar]]";
3215        let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
3216        let out = fm.to_yaml();
3217        assert!(
3218            out.contains("[[records/companies/northstar|Northstar]]"),
3219            "display segment lost on round-trip; got:\n{out}"
3220        );
3221        let reparsed = Frontmatter::parse(&out, Path::new("c.md")).unwrap();
3222        let f = reparsed.link_fields();
3223        assert_eq!(f.len(), 1);
3224        assert_eq!(f[0].1.target, "records/companies/northstar");
3225        assert_eq!(f[0].1.display.as_deref(), Some("Northstar"));
3226    }
3227
3228    #[test]
3229    fn to_yaml_does_not_mangle_link_list_or_plain_nested_sequence() {
3230        // A genuine quoted block list of links round-trips as a clean string
3231        // list — never collapsed to a scalar — and a plain nested sequence that
3232        // is NOT a wiki-link is left exactly as written (no false conversion).
3233        let yaml = "type: meeting\nattendees:\n  - \"[[records/contacts/elena]]\"\n  - \"[[records/contacts/sarah]]\"\nmatrix:\n  - - 1\n    - 2";
3234        let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
3235        let out = fm.to_yaml();
3236
3237        // Both attendee links survive as quoted strings.
3238        assert!(out.contains("[[records/contacts/elena]]"), "got:\n{out}");
3239        assert!(out.contains("[[records/contacts/sarah]]"), "got:\n{out}");
3240
3241        let reparsed = Frontmatter::parse(&out, Path::new("m.md")).unwrap();
3242        let fields = reparsed.link_fields();
3243        let attendees: Vec<&str> = fields
3244            .iter()
3245            .filter(|(k, _)| k == "attendees")
3246            .map(|(_, l)| l.target.as_str())
3247            .collect();
3248        assert_eq!(
3249            attendees,
3250            vec!["records/contacts/elena", "records/contacts/sarah"]
3251        );
3252        // The non-link nested sequence is preserved verbatim, not touched.
3253        assert_eq!(reparsed.extra.get("matrix"), fm.extra.get("matrix"));
3254    }
3255
3256    // ── read_file / write_file round-trip ────────────────────────────────────
3257
3258    #[test]
3259    fn write_then_read_roundtrips_and_preserves_body_verbatim() {
3260        let dir = tempdir().unwrap();
3261        let path = dir.path().join("sources/emails/x.md");
3262        let body = "# Subject\n\nHello,\n\nSee [[records/contacts/sarah-chen]].\n";
3263        let mut fm = Frontmatter {
3264            type_: Some("email".into()),
3265            summary: Some("renewal note".into()),
3266            created: Some(DateTime::parse_from_rfc3339("2026-05-27T08:00:00-07:00").unwrap()),
3267            ..Default::default()
3268        };
3269        fm.extra
3270            .insert("from".into(), Value::String("elena@northstar.io".into()));
3271
3272        write_file(&path, &fm, body).unwrap();
3273
3274        let (read_fm, read_body) = read_file(&path).unwrap();
3275        assert_eq!(read_body, body, "body must be preserved byte-for-byte");
3276        assert_eq!(read_fm.type_.as_deref(), Some("email"));
3277        assert_eq!(read_fm.summary.as_deref(), Some("renewal note"));
3278        assert_eq!(
3279            read_fm.extra.get("from").and_then(|v| v.as_str()),
3280            Some("elena@northstar.io")
3281        );
3282        // The on-disk file starts with a fence and ends with the verbatim body.
3283        let raw = std::fs::read_to_string(&path).unwrap();
3284        assert!(raw.starts_with("---\n"));
3285        assert!(raw.ends_with(body));
3286    }
3287
3288    #[test]
3289    fn roundtrip_modify_summary_then_write_changes_only_summary() {
3290        let dir = tempdir().unwrap();
3291        let path = dir.path().join("records/contacts/sarah.md");
3292        let body = "Long-form operator notes about Sarah.\n";
3293        let fm = Frontmatter {
3294            type_: Some("contact".into()),
3295            summary: Some("old summary".into()),
3296            ..Default::default()
3297        };
3298        write_file(&path, &fm, body).unwrap();
3299
3300        // Read → modify summary → write back.
3301        let (mut fm2, body2) = read_file(&path).unwrap();
3302        fm2.summary = Some("new summary".into());
3303        write_file(&path, &fm2, &body2).unwrap();
3304
3305        let (fm3, body3) = read_file(&path).unwrap();
3306        assert_eq!(fm3.summary.as_deref(), Some("new summary"));
3307        assert_eq!(fm3.type_.as_deref(), Some("contact"));
3308        assert_eq!(body3, body, "body unchanged across the round-trip");
3309    }
3310
3311    #[test]
3312    fn roundtrip_preserves_handwritten_unquoted_scalar_wiki_link_on_disk() {
3313        // End-to-end analog of `dbmd format` on the verbatim SPEC worked example:
3314        // a hand-written file carrying the canonical UNQUOTED scalar link
3315        // `company: [[records/companies/northstar]]`, read from disk then written
3316        // back unchanged. Before the fix this no-op re-emit rewrote the on-disk
3317        // value to the bracket-less block sequence `company:\n- - records/...`,
3318        // and every reader (validate/graph/backlinks) then lost the edge.
3319        let dir = tempdir().unwrap();
3320        let path = dir.path().join("records/contacts/sarah-chen.md");
3321        let file = "---\ntype: contact\nid: sarah-chen\nsummary: Director of Ops\ncompany: [[records/companies/northstar]]\n---\n# Sarah Chen\n\nNotes.\n";
3322        std::fs::create_dir_all(path.parent().unwrap()).unwrap();
3323        std::fs::write(&path, file).unwrap();
3324
3325        // Read → write back unchanged (the canonical no-op re-emit).
3326        let (fm, body) = read_file(&path).unwrap();
3327        write_file(&path, &fm, &body).unwrap();
3328
3329        // On-disk bytes still carry the bracketed link, never `- - records/...`.
3330        let raw = std::fs::read_to_string(&path).unwrap();
3331        assert!(
3332            raw.contains("[[records/companies/northstar]]"),
3333            "on-disk wiki-link brackets were destroyed; got:\n{raw}"
3334        );
3335        assert!(
3336            !raw.contains("- - "),
3337            "on-disk value became a nested block sequence; got:\n{raw}"
3338        );
3339
3340        // And the edge is still readable after the round-trip.
3341        let (fm2, _) = read_file(&path).unwrap();
3342        let fields = fm2.link_fields();
3343        let links: Vec<(&str, &str)> = fields
3344            .iter()
3345            .map(|(k, l)| (k.as_str(), l.target.as_str()))
3346            .collect();
3347        assert_eq!(links, vec![("company", "records/companies/northstar")]);
3348    }
3349
3350    #[test]
3351    fn write_file_does_not_leave_temp_files_behind() {
3352        let dir = tempdir().unwrap();
3353        let path = dir.path().join("records/x.md");
3354        let fm = Frontmatter {
3355            type_: Some("note".into()),
3356            ..Default::default()
3357        };
3358        write_file(&path, &fm, "body\n").unwrap();
3359        // The directory should contain only the target file, no `.x.md.tmp.*`.
3360        let entries: Vec<String> = std::fs::read_dir(path.parent().unwrap())
3361            .unwrap()
3362            .map(|e| e.unwrap().file_name().to_string_lossy().into_owned())
3363            .collect();
3364        assert_eq!(entries, vec!["x.md".to_string()]);
3365    }
3366
3367    // ── is_content_file ──────────────────────────────────────────────────────
3368
3369    #[test]
3370    fn is_content_file_recognizes_layers_and_excludes_meta() {
3371        assert!(Frontmatter::is_content_file(Path::new(
3372            "sources/emails/2026-05-22.md"
3373        )));
3374        assert!(Frontmatter::is_content_file(Path::new(
3375            "records/contacts/sarah-chen.md"
3376        )));
3377        // A synthesis profile the agent authored lives under `records/` (the
3378        // old `wiki/` layer is gone, so a `wiki/...` path is NOT content).
3379        assert!(Frontmatter::is_content_file(Path::new(
3380            "records/profiles/sarah-chen.md"
3381        )));
3382        assert!(!Frontmatter::is_content_file(Path::new(
3383            "wiki/people/sarah-chen.md"
3384        )));
3385        // Absolute paths under a layer are still content.
3386        assert!(Frontmatter::is_content_file(Path::new(
3387            "/home/db/records/companies/northstar.md"
3388        )));
3389        // index.md at any level is meta.
3390        assert!(!Frontmatter::is_content_file(Path::new(
3391            "records/contacts/index.md"
3392        )));
3393        assert!(!Frontmatter::is_content_file(Path::new("index.md")));
3394        // Root meta files.
3395        assert!(!Frontmatter::is_content_file(Path::new("DB.md")));
3396        assert!(!Frontmatter::is_content_file(Path::new("log.md")));
3397    }
3398
3399    // ── effective_id ─────────────────────────────────────────────────────────
3400
3401    #[test]
3402    fn effective_id_prefers_explicit_then_derives_from_path() {
3403        let with_id = Frontmatter {
3404            id: Some("explicit-id".into()),
3405            ..Default::default()
3406        };
3407        assert_eq!(
3408            with_id.effective_id(Path::new("records/profiles/sarah-chen.md")),
3409            "explicit-id"
3410        );
3411        let no_id = Frontmatter::default();
3412        assert_eq!(
3413            no_id.effective_id(Path::new("records/profiles/sarah-chen.md")),
3414            "sarah-chen"
3415        );
3416    }
3417
3418    // ── get / set ────────────────────────────────────────────────────────────
3419
3420    #[test]
3421    fn set_routes_universal_and_custom_keys() {
3422        let mut fm = Frontmatter::default();
3423        fm.set("type", "contact").unwrap();
3424        fm.set("summary", "hi").unwrap();
3425        fm.set("company", "[[records/companies/northstar]]")
3426            .unwrap();
3427        assert_eq!(fm.type_.as_deref(), Some("contact"));
3428        assert_eq!(fm.summary.as_deref(), Some("hi"));
3429        // Custom key landed in extra, not a typed slot.
3430        assert_eq!(
3431            fm.extra.get("company").and_then(|v| v.as_str()),
3432            Some("[[records/companies/northstar]]")
3433        );
3434        // get reads from both typed fields and extra.
3435        assert_eq!(
3436            fm.get("type").and_then(|v| v.as_str().map(String::from)),
3437            Some("contact".into())
3438        );
3439        assert_eq!(
3440            fm.get("company").and_then(|v| v.as_str().map(String::from)),
3441            Some("[[records/companies/northstar]]".into())
3442        );
3443        assert!(fm.get("nonexistent").is_none());
3444    }
3445
3446    #[test]
3447    fn set_timestamp_validates_rfc3339() {
3448        let mut fm = Frontmatter::default();
3449        fm.set("created", "2026-05-27T08:00:00-07:00").unwrap();
3450        assert!(fm.created.is_some());
3451        let err = fm.set("updated", "not-a-date").unwrap_err();
3452        assert!(matches!(err, ParseError::BadTimestamp { .. }));
3453    }
3454
3455    // ── extract_wiki_links ───────────────────────────────────────────────────
3456
3457    #[test]
3458    fn extract_wiki_links_flags_full_path_short_form_and_extension() {
3459        let body = "See [[records/contacts/sarah-chen]] and [[sarah-chen]].\nAlso [[records/profiles/sarah-chen.md|Sarah]].\n";
3460        let links = extract_wiki_links(body, Path::new("doc.md"));
3461        assert_eq!(links.len(), 3);
3462
3463        // Full path, no extension, no display.
3464        assert_eq!(links[0].target, "records/contacts/sarah-chen");
3465        assert!(links[0].is_full_path);
3466        assert!(!links[0].has_md_extension);
3467        assert_eq!(links[0].display, None);
3468        assert_eq!(links[0].location.1, 1, "first link on line 1");
3469
3470        // Short form: not a full path.
3471        assert_eq!(links[1].target, "sarah-chen");
3472        assert!(!links[1].is_full_path, "bare target is short-form");
3473
3474        // Full path WITH .md extension and a display override on line 2.
3475        assert_eq!(links[2].target, "records/profiles/sarah-chen.md");
3476        assert!(links[2].is_full_path);
3477        assert!(links[2].has_md_extension);
3478        assert_eq!(links[2].display.as_deref(), Some("Sarah"));
3479        assert_eq!(links[2].location.1, 2);
3480    }
3481
3482    #[test]
3483    fn extract_wiki_links_reports_1_based_column_counting_chars() {
3484        // A multi-byte prefix (é is 2 bytes) must not skew the char column.
3485        let body = "café [[records/x/y]]";
3486        let links = extract_wiki_links(body, Path::new("d.md"));
3487        assert_eq!(links.len(), 1);
3488        // "café " is 5 chars, so the `[[` starts at char column 6 (1-based).
3489        assert_eq!(links[0].location.2, 6);
3490    }
3491
3492    #[test]
3493    fn extract_wiki_links_columns_are_correct_for_multiple_links_on_one_line() {
3494        // Locks the single-pass column cursor (the O(n²)→O(n) fix): each `[[`
3495        // reports the right 1-based CHAR column even with multi-byte prefixes and
3496        // several links per line.
3497        let body = "café [[a]] · [[records/x/y]] end";
3498        let links = extract_wiki_links(body, Path::new("d.md"));
3499        assert_eq!(links.len(), 2);
3500        // "café " = 5 chars → first `[[` at col 6.
3501        assert_eq!(links[0].location.2, 6);
3502        // "café [[a]] · " = 5 + 5 (`[[a]]`) + 3 (` · `, `·` is 1 char) = 13 chars
3503        // → second `[[` at col 14.
3504        assert_eq!(links[1].location.2, 14);
3505    }
3506
3507    #[test]
3508    fn extract_wiki_links_ignores_a_lone_path_without_brackets() {
3509        let links = extract_wiki_links(
3510            "records/contacts/sarah-chen is not a link",
3511            Path::new("d.md"),
3512        );
3513        assert!(links.is_empty());
3514    }
3515
3516    // ── extract_markdown_links ───────────────────────────────────────────────
3517
3518    #[test]
3519    fn extract_markdown_links_captures_external_and_not_wiki_links() {
3520        let body =
3521            "See [the thread](https://x.com/a) and [[records/contacts/sarah-chen]] internally.\n";
3522        let md = extract_markdown_links(body, Path::new("d.md"));
3523        assert_eq!(
3524            md.len(),
3525            1,
3526            "wiki-link must not be captured as a markdown link"
3527        );
3528        assert_eq!(md[0].text, "the thread");
3529        assert_eq!(md[0].url, "https://x.com/a");
3530        assert_eq!(md[0].location.1, 1);
3531
3532        // And the wiki-link extractor must not pick up the markdown link.
3533        let wl = extract_wiki_links(body, Path::new("d.md"));
3534        assert_eq!(wl.len(), 1);
3535        assert_eq!(wl[0].target, "records/contacts/sarah-chen");
3536    }
3537
3538    // ── link_fields ──────────────────────────────────────────────────────────
3539
3540    #[test]
3541    fn link_fields_extracts_scalar_list_and_summary_links() {
3542        // The canonical list form quotes each item so YAML parses it as clean
3543        // strings; a scalar field may be quoted OR written in the canonical
3544        // unquoted inline form `company: [[x]]` (SPEC § Linking).
3545        let yaml = "type: meeting\nsummary: with [[records/contacts/elena]]\ncompany: \"[[records/companies/northstar]]\"\nattendees:\n  - \"[[records/contacts/elena]]\"\n  - \"[[records/contacts/sarah]]\"\nnotes: just plain text";
3546        let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
3547        // Sanity: company really did parse as a scalar string here.
3548        assert!(fm.extra.get("company").and_then(|v| v.as_str()).is_some());
3549        let fields = fm.link_fields();
3550
3551        // company (scalar) once, with the right target.
3552        let company: Vec<&str> = fields
3553            .iter()
3554            .filter(|(k, _)| k == "company")
3555            .map(|(_, l)| l.target.as_str())
3556            .collect();
3557        assert_eq!(company, vec!["records/companies/northstar"]);
3558        // attendees (block list) twice.
3559        let attendees: Vec<&str> = fields
3560            .iter()
3561            .filter(|(k, _)| k == "attendees")
3562            .map(|(_, l)| l.target.as_str())
3563            .collect();
3564        assert_eq!(
3565            attendees,
3566            vec!["records/contacts/elena", "records/contacts/sarah"]
3567        );
3568        // summary link surfaced.
3569        assert_eq!(fields.iter().filter(|(k, _)| k == "summary").count(), 1);
3570        // Plain-text field is not a link.
3571        assert_eq!(fields.iter().filter(|(k, _)| k == "notes").count(), 0);
3572    }
3573
3574    #[test]
3575    fn link_fields_surfaces_canonical_unquoted_scalar_link() {
3576        // Regression: the canonical scalar wiki-link form is the *unquoted*
3577        // inline `company: [[records/companies/northstar]]` (SPEC § Linking).
3578        // YAML parses `[[x]]` as a flow-list-in-a-list (`Seq[Seq[String]]`), so
3579        // a naive `as_str()`-only walk drops it. link_fields() must still
3580        // surface exactly one link with the correct target.
3581        let yaml = "type: meeting\ncompany: [[records/companies/northstar]]";
3582        let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
3583        // Sanity: `parse` disambiguates the inline-link source form at read time,
3584        // storing it as the canonical scalar `String("[[x]]")` (so a genuine
3585        // `Seq[Seq[String]]` 2D array is never collapsed/retyped). link_fields()
3586        // reads either spelling back as the same link.
3587        assert_eq!(
3588            fm.extra.get("company").and_then(|v| v.as_str()),
3589            Some("[[records/companies/northstar]]")
3590        );
3591
3592        let fields = fm.link_fields();
3593        let links: Vec<(&str, &str, Option<&str>)> = fields
3594            .iter()
3595            .map(|(k, l)| (k.as_str(), l.target.as_str(), l.display.as_deref()))
3596            .collect();
3597        assert_eq!(
3598            links,
3599            vec![("company", "records/companies/northstar", None)]
3600        );
3601
3602        // The `|display` segment survives the unquoted inline form too.
3603        let fm2 = Frontmatter::parse(
3604            "type: meeting\ncompany: [[records/companies/northstar|Northstar]]",
3605            Path::new("m.md"),
3606        )
3607        .unwrap();
3608        let f2 = fm2.link_fields();
3609        assert_eq!(f2.len(), 1);
3610        assert_eq!(f2[0].0, "company");
3611        assert_eq!(f2[0].1.target, "records/companies/northstar");
3612        assert_eq!(f2[0].1.display.as_deref(), Some("Northstar"));
3613    }
3614
3615    #[test]
3616    fn link_fields_ignores_plain_one_item_flow_list() {
3617        // A plain one-item flow list `aliases: [foo]` parses to `Seq[String]`
3618        // — one nesting level shallower than an unquoted `[[foo]]` — and must
3619        // NOT be mistaken for a wiki-link.
3620        let yaml = "type: contact\naliases: [foo]";
3621        let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
3622        assert_eq!(fm.link_fields(), Vec::new());
3623    }
3624
3625    // ── detect_flow_form_link_lists ──────────────────────────────────────────
3626
3627    #[test]
3628    fn detect_flow_form_flags_list_misencodings_not_scalars() {
3629        // The flow-form list mis-encoding (triple-nested) IS flagged; a scalar
3630        // inline wiki-link (double-nested) is NOT.
3631        let bad = "attendees: [[[records/x]], [[records/y]]]\nscalar_inline: [[records/z]]";
3632        let flagged = detect_flow_form_link_lists(bad);
3633        assert_eq!(flagged, vec!["attendees".to_string()]);
3634
3635        // An UNquoted block list is also a mis-encoding (parses triple-nested).
3636        let unquoted_block = "attendees:\n  - [[records/x]]\n  - [[records/y]]";
3637        assert_eq!(
3638            detect_flow_form_link_lists(unquoted_block),
3639            vec!["attendees".to_string()]
3640        );
3641
3642        // The canonical QUOTED block form parses to clean strings — NOT flagged.
3643        let good = "attendees:\n  - \"[[records/x]]\"\n  - \"[[records/y]]\"";
3644        assert!(detect_flow_form_link_lists(good).is_empty());
3645
3646        // A plain scalar list of strings is not flagged.
3647        let plain = "tags: [a, b, c]";
3648        assert!(detect_flow_form_link_lists(plain).is_empty());
3649    }
3650
3651    // ── extract_sections ─────────────────────────────────────────────────────
3652
3653    #[test]
3654    fn extract_sections_levels_nesting_and_boundaries() {
3655        let body = "intro text\n## First\nalpha\n### Sub\nbeta\n## Second\ngamma\n";
3656        let secs = extract_sections(body);
3657        let headings: Vec<(&str, u8)> =
3658            secs.iter().map(|s| (s.heading.as_str(), s.level)).collect();
3659        assert_eq!(headings, vec![("First", 2), ("Sub", 3), ("Second", 2)]);
3660
3661        // "First" (H2) body extends through its H3 child, stopping at "Second".
3662        let first = &secs[0];
3663        assert!(first.body.contains("alpha"));
3664        assert!(first.body.contains("### Sub"));
3665        assert!(first.body.contains("beta"));
3666        assert!(!first.body.contains("Second"));
3667
3668        // "Sub" (H3) stops at the next equal-or-shallower heading ("Second").
3669        let sub = &secs[1];
3670        assert!(sub.body.contains("beta"));
3671        assert!(!sub.body.contains("gamma"));
3672
3673        // 1-based line numbers within the body.
3674        assert_eq!(first.line, 2);
3675        assert_eq!(secs[2].line, 6);
3676    }
3677
3678    #[test]
3679    fn extract_sections_ignores_headings_in_fenced_code() {
3680        let body = "## Real\n```\n## Fake heading in code\n```\nafter\n";
3681        let secs = extract_sections(body);
3682        assert_eq!(secs.len(), 1);
3683        assert_eq!(secs[0].heading, "Real");
3684        // The fenced "## Fake" is part of Real's body, not its own section.
3685        assert!(secs[0].body.contains("## Fake heading in code"));
3686    }
3687
3688    // ── parse_field_spec ─────────────────────────────────────────────────────
3689
3690    #[test]
3691    fn parse_field_spec_required_and_shape() {
3692        let f = parse_field_spec("- email (required, email)");
3693        assert_eq!(f.name, "email");
3694        assert!(f.required);
3695        assert_eq!(f.shape, Some(Shape::Email));
3696        assert!(f.unknown_modifiers.is_empty());
3697    }
3698
3699    #[test]
3700    fn parse_field_spec_link_prefix_strips_trailing_slash() {
3701        let f = parse_field_spec("- company (required, link to records/companies/)");
3702        assert!(f.required);
3703        assert_eq!(f.link_prefix, Some(PathBuf::from("records/companies")));
3704        assert_eq!(f.shape, None);
3705    }
3706
3707    #[test]
3708    fn parse_field_spec_default_preserves_case_and_value() {
3709        let f = parse_field_spec("- currency (default USD)");
3710        assert_eq!(f.name, "currency");
3711        assert_eq!(f.default, Some(Value::String("USD".into())));
3712    }
3713
3714    #[test]
3715    fn parse_field_spec_enum_captures_comma_list_as_last_modifier() {
3716        let f = parse_field_spec("- status (required, enum: open, closed, pending)");
3717        assert!(f.required);
3718        assert_eq!(
3719            f.enum_values,
3720            Some(vec![
3721                "open".to_string(),
3722                "closed".to_string(),
3723                "pending".to_string()
3724            ])
3725        );
3726    }
3727
3728    #[test]
3729    fn parse_field_spec_bare_enum_keyword_is_not_itself_a_value() {
3730        // `enum` with no colon: the values are the remaining tokens; the keyword
3731        // itself must NOT leak in as an allowed value.
3732        let f = parse_field_spec("- status (required, enum, open, closed)");
3733        assert!(f.required);
3734        assert_eq!(
3735            f.enum_values,
3736            Some(vec!["open".to_string(), "closed".to_string()])
3737        );
3738    }
3739
3740    #[test]
3741    fn parse_field_spec_unknown_modifier_is_captured_not_errored() {
3742        let f = parse_field_spec("- weird (required, frobnicate, string)");
3743        assert!(f.required);
3744        assert_eq!(f.shape, Some(Shape::String));
3745        assert_eq!(f.unknown_modifiers, vec!["frobnicate".to_string()]);
3746    }
3747
3748    #[test]
3749    fn parse_field_spec_no_parens_is_freeform_optional() {
3750        let f = parse_field_spec("- nickname");
3751        assert_eq!(f.name, "nickname");
3752        assert!(!f.required);
3753        assert_eq!(f.shape, None);
3754        assert!(f.link_prefix.is_none());
3755        assert!(f.enum_values.is_none());
3756        assert!(f.unknown_modifiers.is_empty());
3757    }
3758
3759    // ── parse_schema_bullet (directives) ─────────────────────────────────────
3760
3761    #[test]
3762    fn schema_bullet_unique_single_field() {
3763        match parse_schema_bullet("- unique: email") {
3764            SchemaBullet::Unique(fields) => assert_eq!(fields, vec!["email".to_string()]),
3765            other => panic!("expected Unique, got {other:?}"),
3766        }
3767    }
3768
3769    #[test]
3770    fn schema_bullet_unique_compound_trims_and_splits() {
3771        match parse_schema_bullet("- unique: date, amount , vendor") {
3772            SchemaBullet::Unique(fields) => assert_eq!(
3773                fields,
3774                vec![
3775                    "date".to_string(),
3776                    "amount".to_string(),
3777                    "vendor".to_string()
3778                ]
3779            ),
3780            other => panic!("expected Unique, got {other:?}"),
3781        }
3782    }
3783
3784    #[test]
3785    fn schema_bullet_summary_template_keeps_braces_and_inner_colons() {
3786        match parse_schema_bullet("- summary_template: {role} at {company} (x: y)") {
3787            SchemaBullet::SummaryTemplate(t) => assert_eq!(t, "{role} at {company} (x: y)"),
3788            other => panic!("expected SummaryTemplate, got {other:?}"),
3789        }
3790    }
3791
3792    #[test]
3793    fn schema_bullet_field_with_enum_modifier_is_not_a_directive() {
3794        // A field whose modifiers contain a colon (`enum:`) parses as a field, not
3795        // a directive — its head has a `(` before any `:`.
3796        match parse_schema_bullet("- status (enum: open, closed)") {
3797            SchemaBullet::Field(f) => {
3798                assert_eq!(f.name, "status");
3799                assert_eq!(
3800                    f.enum_values,
3801                    Some(vec!["open".to_string(), "closed".to_string()])
3802                );
3803            }
3804            other => panic!("expected Field, got {other:?}"),
3805        }
3806    }
3807
3808    #[test]
3809    fn parse_db_md_schema_captures_unique_and_summary_template() {
3810        let db = "---\ntype: db-md\nscope: x\nowner: y\n---\n\n## Schemas\n\n### contact\n- email (required, email)\n- unique: email\n- summary_template: {role} at {company}\n";
3811        let config = parse_db_md(db, Path::new("DB.md")).unwrap();
3812        let s = config.schemas.get("contact").expect("contact schema");
3813        assert_eq!(s.fields.len(), 1, "directives are not parsed as fields");
3814        assert_eq!(s.unique_keys, vec![vec!["email".to_string()]]);
3815        assert_eq!(s.summary_template.as_deref(), Some("{role} at {company}"));
3816    }
3817
3818    #[test]
3819    fn schema_bullet_shard_directive_parses_values() {
3820        assert!(matches!(
3821            parse_schema_bullet("- shard: by-date"),
3822            SchemaBullet::Shard(Some(true))
3823        ));
3824        assert!(matches!(
3825            parse_schema_bullet("- shard: flat"),
3826            SchemaBullet::Shard(Some(false))
3827        ));
3828        // An unrecognized value is ignored (None), like an unknown modifier.
3829        assert!(matches!(
3830            parse_schema_bullet("- shard: weekly"),
3831            SchemaBullet::Shard(None)
3832        ));
3833        // A field whose name has a `(` before any `:` is still a field — the same
3834        // guard that keeps `- status (enum: a, b)` a field, not a directive.
3835        assert!(matches!(
3836            parse_schema_bullet("- shardiness (string)"),
3837            SchemaBullet::Field(_)
3838        ));
3839    }
3840
3841    #[test]
3842    fn parse_db_md_schema_captures_shard_directive() {
3843        let db = "---\ntype: db-md\nscope: x\nowner: y\n---\n\n## Schemas\n\n### shipment\n- carrier (string)\n- shard: by-date\n\n### contact\n- shard: flat\n";
3844        let config = parse_db_md(db, Path::new("DB.md")).unwrap();
3845        let shipment = config.schemas.get("shipment").expect("shipment schema");
3846        assert_eq!(shipment.shard, Some(true));
3847        assert_eq!(
3848            shipment.fields.len(),
3849            1,
3850            "`shard:` is a directive, not a field"
3851        );
3852        assert_eq!(config.schemas.get("contact").unwrap().shard, Some(false));
3853    }
3854
3855    // ── parse_db_md ──────────────────────────────────────────────────────────
3856
3857    const CANONICAL_DB_MD: &str = "---\ntype: db-md\nscope: company\nowner: Sarah Chen\n---\n\n# Acme operations knowledge base\n\nCompany-scale institutional memory for Acme.\n\n## Agent instructions\n\nPrioritize creating `contact` records from new-sender emails. Use British English.\n\n## Policies\n\n### Frozen pages\n- `records/decisions/2026-q1-strategy.md` — finalized, do not modify.\n- `records/synthesis/2026-annual-plan.md` — signed-off plan.\n\n### Ignored types\n- `test`, `temp` — read but never synthesize.\n\n## Schemas\n\n### contact\n- name (required)\n- email (required, email)\n- company (required, link to records/companies/)\n- role (string)\n\n### expense\n- date (required, date)\n- amount (required)\n- currency (default USD)\n";
3858
3859    #[test]
3860    fn parse_db_md_extracts_all_canonical_sections() {
3861        let config = parse_db_md(CANONICAL_DB_MD, Path::new("DB.md")).unwrap();
3862
3863        // Agent instructions: free-form prose, heading line stripped.
3864        let ai = config
3865            .agent_instructions
3866            .expect("agent instructions present");
3867        assert!(ai.starts_with("Prioritize creating"));
3868        assert!(!ai.contains("## Agent instructions"));
3869
3870        // Frozen pages: paths extracted from backticked bullets, comments dropped.
3871        assert_eq!(
3872            config.frozen_pages,
3873            vec![
3874                PathBuf::from("records/decisions/2026-q1-strategy.md"),
3875                PathBuf::from("records/synthesis/2026-annual-plan.md"),
3876            ]
3877        );
3878
3879        // Ignored types: comma list, backticks/comment stripped.
3880        assert_eq!(
3881            config.ignored_types,
3882            vec!["test".to_string(), "temp".to_string()]
3883        );
3884
3885        // Schemas: two types, each with its fields in source order.
3886        assert_eq!(config.schemas.len(), 2);
3887        let contact = config.schemas.get("contact").expect("contact schema");
3888        let names: Vec<&str> = contact.fields.iter().map(|f| f.name.as_str()).collect();
3889        assert_eq!(names, vec!["name", "email", "company", "role"]);
3890        assert!(contact.fields[0].required); // name
3891        assert_eq!(contact.fields[1].shape, Some(Shape::Email)); // email
3892        assert_eq!(
3893            contact.fields[2].link_prefix,
3894            Some(PathBuf::from("records/companies"))
3895        ); // company
3896
3897        let expense = config.schemas.get("expense").expect("expense schema");
3898        let cur = expense
3899            .fields
3900            .iter()
3901            .find(|f| f.name == "currency")
3902            .unwrap();
3903        assert_eq!(cur.default, Some(Value::String("USD".into())));
3904    }
3905
3906    #[test]
3907    fn parse_db_md_handles_malformed_and_unknown_modifiers() {
3908        // corpus-b shape: a `## Schemas` section with a malformed bullet, an
3909        // unknown modifier, and bullets that appear with NO `### <type>`
3910        // heading (so they belong to no schema and are dropped).
3911        let text = "---\ntype: db-md\n---\n\n## Schemas\n- orphan (required)\n\n### ticket\n- priority (required, mystery, enum: low, high)\n- broken (\n";
3912        let config = parse_db_md(text, Path::new("DB.md")).unwrap();
3913
3914        // The orphan bullet under `## Schemas` with no `### type` heading is not
3915        // captured as a schema.
3916        assert_eq!(config.schemas.len(), 1);
3917        let ticket = config.schemas.get("ticket").expect("ticket schema");
3918        assert_eq!(ticket.fields.len(), 2);
3919
3920        let priority = &ticket.fields[0];
3921        assert!(priority.required);
3922        assert_eq!(priority.unknown_modifiers, vec!["mystery".to_string()]);
3923        assert_eq!(
3924            priority.enum_values,
3925            Some(vec!["low".to_string(), "high".to_string()])
3926        );
3927
3928        // A bullet with an unclosed paren still yields a usable name.
3929        let broken = &ticket.fields[1];
3930        assert_eq!(broken.name, "broken");
3931    }
3932
3933    #[test]
3934    fn parse_db_md_missing_frontmatter_errors() {
3935        let text = "# No frontmatter\n\n## Agent instructions\nhi\n";
3936        let err = parse_db_md(text, Path::new("DB.md")).unwrap_err();
3937        assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
3938    }
3939
3940    #[test]
3941    fn parse_db_md_absent_sections_default_empty() {
3942        let text = "---\ntype: db-md\n---\n\n# Title only\n";
3943        let config = parse_db_md(text, Path::new("DB.md")).unwrap();
3944        assert_eq!(config, Config::default());
3945    }
3946
3947    // ── fm set / --fm list-valued link fields (meeting.attendees & friends) ──
3948
3949    /// `Frontmatter::set` is the value path every write surface (`fm set`,
3950    /// `write --fm`) funnels through. A list-of-wiki-links value (the SPEC's
3951    /// `meeting.attendees` shape) must serialize as a YAML **block sequence** of
3952    /// quoted links — readable back by [`links_in_field_value`] and accepted by
3953    /// `dbmd validate` — never the flow-form scalar string that trips
3954    /// `WIKI_LINK_FLOW_FORM_LIST`. Both the unquoted (`[[[a]], [[b]]]`) and
3955    /// quoted (`["[[a]]", "[[b]]"]`) spellings an agent types must normalize.
3956    #[test]
3957    fn set_list_of_wiki_links_becomes_block_sequence_both_spellings() {
3958        for value in [
3959            "[[[records/contacts/a]], [[records/contacts/b]]]",
3960            r#"["[[records/contacts/a]]", "[[records/contacts/b]]"]"#,
3961        ] {
3962            let mut fm = Frontmatter::default();
3963            fm.set("attendees", value).unwrap();
3964
3965            // Stored as a 2-element sequence of clean quoted links.
3966            let stored = fm.extra.get("attendees").expect("attendees set");
3967            let Value::Sequence(items) = stored else {
3968                panic!("attendees must be a Sequence, got {stored:?} for input {value}");
3969            };
3970            assert_eq!(items.len(), 2, "input {value}");
3971            assert_eq!(items[0], Value::String("[[records/contacts/a]]".into()));
3972            assert_eq!(items[1], Value::String("[[records/contacts/b]]".into()));
3973
3974            // The edge enumerator reads exactly the two links back (no stray
3975            // bracket targets, the flow-form-string symptom).
3976            let links: Vec<_> = links_in_field_value(stored)
3977                .into_iter()
3978                .map(|l| l.target)
3979                .collect();
3980            assert_eq!(
3981                links,
3982                vec!["records/contacts/a", "records/contacts/b"],
3983                "input {value}"
3984            );
3985
3986            // And the canonical writer renders it block-style, not as a scalar.
3987            let yaml = fm.to_yaml();
3988            assert!(
3989                yaml.contains("attendees:\n"),
3990                "expected block list in:\n{yaml}"
3991            );
3992            assert!(
3993                !yaml.contains("attendees: '[["),
3994                "must not be a flow-form scalar string in:\n{yaml}"
3995            );
3996        }
3997    }
3998
3999    /// A *single* inline wiki-link stays a scalar string (renders inline
4000    /// `field: [[x]]`), and a single link must never be widened to a one-item
4001    /// list — preserving the common `contact.company` / `expense.vendor` shape.
4002    #[test]
4003    fn set_single_inline_wiki_link_stays_scalar() {
4004        let mut fm = Frontmatter::default();
4005        fm.set("company", "[[records/companies/tideform]]").unwrap();
4006        assert_eq!(
4007            fm.extra.get("company"),
4008            Some(&Value::String("[[records/companies/tideform]]".into())),
4009        );
4010        // Still recognized as one link.
4011        let links: Vec<_> = links_in_field_value(fm.extra.get("company").unwrap())
4012            .into_iter()
4013            .map(|l| l.target)
4014            .collect();
4015        assert_eq!(links, vec!["records/companies/tideform"]);
4016    }
4017
4018    /// Plain text and a non-link flow list are left as verbatim scalar strings —
4019    /// the list normalization only triggers when every item is a clean wiki-link.
4020    #[test]
4021    fn set_non_link_values_stay_scalar_strings() {
4022        let mut fm = Frontmatter::default();
4023        fm.set("location", "Video call (remote)").unwrap();
4024        assert_eq!(
4025            fm.extra.get("location"),
4026            Some(&Value::String("Video call (remote)".into())),
4027        );
4028
4029        // A flow list whose items are NOT wiki-links must not be reinterpreted as
4030        // a link sequence; it stays the scalar string the agent passed.
4031        fm.set("note", "[draft, wip]").unwrap();
4032        assert_eq!(
4033            fm.extra.get("note"),
4034            Some(&Value::String("[draft, wip]".into()))
4035        );
4036    }
4037
4038    // ── Regression: non-string YAML keys round-trip (no Rust Debug corruption) ─
4039
4040    #[test]
4041    fn regression_non_string_yaml_keys_keep_their_text_on_round_trip() {
4042        // A numeric/bool/null/float frontmatter key is valid YAML and must NOT be
4043        // rewritten to its Rust `Debug` form (`Number(2026)`, `Bool(true)`,
4044        // `'Null'`). After the fix the key text survives (the key narrows to a
4045        // string-typed key, but the operator's data is no longer corrupted).
4046        let yaml = "type: note\n2026: planning notes\ntrue: yes-key\n3.14: f\n";
4047        let fm = Frontmatter::parse(yaml, Path::new("x.md")).unwrap();
4048        // Keys are stored as their scalar text, not the Debug string.
4049        assert!(fm.extra.contains_key("2026"), "numeric key text lost");
4050        assert!(fm.extra.contains_key("true"), "bool key text lost");
4051        assert!(fm.extra.contains_key("3.14"), "float key text lost");
4052        assert!(!fm.extra.keys().any(|k| k.starts_with("Number(")));
4053        assert!(!fm.extra.keys().any(|k| k.starts_with("Bool(")));
4054
4055        // And a re-emit never produces the Debug forms on disk.
4056        let out = fm.to_yaml();
4057        assert!(!out.contains("Number("), "Debug-form key emitted:\n{out}");
4058        assert!(!out.contains("Bool("), "Debug-form key emitted:\n{out}");
4059        // The key text is still present (quoted, since it now reads as a string).
4060        assert!(out.contains("2026"), "numeric key dropped:\n{out}");
4061        assert!(out.contains("planning notes"), "value dropped:\n{out}");
4062    }
4063
4064    // ── Regression: universal-key sequence/mapping values are preserved (#2) ───
4065
4066    #[test]
4067    fn regression_universal_key_non_scalar_value_is_preserved_not_deleted() {
4068        // A universal key carrying a sequence/mapping (`status: [active, draft]`)
4069        // is not a valid scalar for that field. Before the fix, the matched arm
4070        // consumed-and-dropped it (scalar_string -> None) and `to_yaml` then
4071        // omitted the field — `dbmd format` silently DELETED it. It must now pass
4072        // through `extra` and re-emit verbatim.
4073        let yaml = "type: note\nstatus:\n  - active\n  - draft\nsummary:\n  a: 1\n  b: 2\n";
4074        let fm = Frontmatter::parse(yaml, Path::new("x.md")).unwrap();
4075        // The typed accessors stay None (no valid scalar), but the data lives in
4076        // extra so nothing is lost.
4077        assert!(fm.status.is_none());
4078        assert!(fm.summary.is_none());
4079        assert!(fm.extra.contains_key("status"), "status value destroyed");
4080        assert!(fm.extra.contains_key("summary"), "summary value destroyed");
4081
4082        // A re-emit keeps both fields' data on disk.
4083        let out = fm.to_yaml();
4084        assert!(out.contains("status"), "status deleted on re-emit:\n{out}");
4085        assert!(out.contains("active"), "status items deleted:\n{out}");
4086        assert!(
4087            out.contains("summary"),
4088            "summary deleted on re-emit:\n{out}"
4089        );
4090
4091        // Round-trips as a fixed point — repeated curator-loop writes don't lose
4092        // the data.
4093        let reparsed = Frontmatter::parse(&out, Path::new("x.md")).unwrap();
4094        assert!(reparsed.extra.contains_key("status"));
4095        assert!(reparsed.extra.contains_key("summary"));
4096    }
4097
4098    // ── Regression: non-scalar tags items don't erase the tags field (#5) ──────
4099
4100    #[test]
4101    fn regression_non_scalar_tags_value_is_preserved_not_erased() {
4102        // `tags: [[vip]]` (an authoring slip — wiki-link brackets around a tag)
4103        // parses to a nested sequence; before the fix `parse_tags` filtered the
4104        // non-scalar item out and `to_yaml` then omitted the now-empty tags vec,
4105        // silently DELETING the tags line. It must now survive the re-emit (the
4106        // key data is preserved; the field is never dropped).
4107        let yaml = "type: note\ntags: [[vip]]\n";
4108        let fm = Frontmatter::parse(yaml, Path::new("x.md")).unwrap();
4109        // The typed tags vec is empty (no clean scalar list), but the raw value
4110        // is preserved in extra so nothing is destroyed.
4111        assert!(fm.tags.is_empty());
4112        assert!(fm.extra.contains_key("tags"), "tags value destroyed");
4113
4114        let out = fm.to_yaml();
4115        assert!(out.contains("tags"), "tags deleted on re-emit:\n{out}");
4116        // The `vip` text survives on disk in some form (never erased).
4117        assert!(out.contains("vip"), "tag content erased:\n{out}");
4118
4119        // A clean tag list still parses to the typed vec (not regressed).
4120        let clean =
4121            Frontmatter::parse("type: note\ntags: [vip, renewal]\n", Path::new("x.md")).unwrap();
4122        assert_eq!(clean.tags, vec!["vip".to_string(), "renewal".to_string()]);
4123        assert!(!clean.extra.contains_key("tags"));
4124    }
4125
4126    // ── Regression: plain nested string lists are NOT fabricated into links (#3) ─
4127
4128    #[test]
4129    fn regression_plain_nested_string_list_is_not_turned_into_wiki_links() {
4130        // `groups: [[alpha], [beta]]` is the data [["alpha"],["beta"]] — an
4131        // unknown nested string list that must pass through verbatim. Before the
4132        // fix, canonicalize_extra_value fabricated `- '[[alpha]]'` / `- '[[beta]]'`
4133        // (short-form links the tool then flagged), changing the field's type.
4134        let yaml = "type: note\ngroups: [[alpha], [beta]]\n";
4135        let fm = Frontmatter::parse(yaml, Path::new("x.md")).unwrap();
4136        let before = fm.extra.get("groups").cloned();
4137
4138        let out = fm.to_yaml();
4139        // No fabricated wiki-link brackets in the emitted YAML.
4140        assert!(!out.contains("[[alpha]]"), "fabricated a wiki-link:\n{out}");
4141        assert!(!out.contains("[[beta]]"), "fabricated a wiki-link:\n{out}");
4142
4143        // The value is unchanged across the canonical re-emit.
4144        let reparsed = Frontmatter::parse(&out, Path::new("x.md")).unwrap();
4145        assert_eq!(
4146            reparsed.extra.get("groups"),
4147            before.as_ref(),
4148            "nested string list mutated by canonicalize_extra_value"
4149        );
4150        // And it surfaces no links.
4151        assert!(reparsed.link_fields().is_empty());
4152    }
4153
4154    #[test]
4155    fn regression_genuine_nested_array_is_not_retyped_to_scalar_string() {
4156        // BUG: `dbmd format` silently retyped a genuine 2D array
4157        //     matrix:
4158        //     - - cell
4159        // (data `[["cell"]]`) into the scalar string `matrix: '[[cell]]'`. The
4160        // root cause is the irreducible YAML ambiguity: serde parses BOTH the
4161        // inline scalar wiki-link `field: [[x]]` AND the block nested-seq
4162        // `field:`\n`- - x` to the identical `Seq[Seq[String]]`. The old
4163        // `canonicalize_extra_value` collapsed every one-element `Seq[Seq[String]]`
4164        // to a string, destroying the array. The fix resolves the inline-link
4165        // case from the SOURCE text at parse time and leaves a genuine block
4166        // array verbatim.
4167        let yaml = "type: note\nsummary: nested\nmatrix:\n- - cell\n";
4168        let fm = Frontmatter::parse(yaml, Path::new("nested.md")).unwrap();
4169
4170        // The block source form stays a nested sequence, NOT a string — the
4171        // inline-link disambiguation only fires for source written `key: [[x]]`.
4172        let stored = fm.extra.get("matrix").expect("matrix preserved");
4173        assert!(
4174            matches!(stored, Value::Sequence(items)
4175                if items.len() == 1 && matches!(&items[0], Value::Sequence(_))),
4176            "genuine 2D array was retyped at parse time; got {stored:?}"
4177        );
4178
4179        let out = fm.to_yaml();
4180        // Emit must keep the array (a block nested sequence), never the bogus
4181        // scalar string `'[[cell]]'`.
4182        assert!(
4183            !out.contains("'[[cell]]'") && !out.contains("[[cell]]"),
4184            "genuine nested array retyped to a scalar wiki-link string; got:\n{out}"
4185        );
4186        assert!(
4187            out.contains("- - cell"),
4188            "nested array lost its 2D shape on emit; got:\n{out}"
4189        );
4190
4191        // Full round-trip: re-parsing the emitted YAML yields the identical value
4192        // — the file's bytes are preserved, which is what BUG 2 was about. (The
4193        // read-side `link_fields` still treats a one-element `Seq[Seq[String]]` as
4194        // the inline-link shape it is indistinguishable from on disk; that is the
4195        // same irreducible ambiguity and is out of scope here — the fix's job is
4196        // that `format` no longer silently RETYPES the array to a string.)
4197        let reparsed = Frontmatter::parse(&out, Path::new("nested.md")).unwrap();
4198        assert_eq!(
4199            reparsed.extra.get("matrix"),
4200            fm.extra.get("matrix"),
4201            "nested array did not round-trip through format"
4202        );
4203        // The stored value is still a sequence after round-trip (never a string).
4204        assert!(
4205            matches!(reparsed.extra.get("matrix"), Some(Value::Sequence(_))),
4206            "nested array became a non-sequence after round-trip"
4207        );
4208    }
4209
4210    #[test]
4211    fn inline_scalar_wiki_link_still_round_trips_after_nested_array_fix() {
4212        // The companion guarantee to the test above: the SPEC-canonical inline
4213        // scalar wiki-link `field: [[x]]` (SPEC.md:383) must still format to a
4214        // canonical inline `[[x]]` that round-trips and surfaces as one link —
4215        // the nested-array fix must not regress it.
4216        let yaml = "type: contact\ncompany: [[records/companies/northstar]]\n";
4217        let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
4218        // Disambiguated at parse time to the canonical scalar string.
4219        assert_eq!(
4220            fm.extra.get("company").and_then(|v| v.as_str()),
4221            Some("[[records/companies/northstar]]")
4222        );
4223
4224        let out = fm.to_yaml();
4225        assert!(
4226            out.contains("[[records/companies/northstar]]") && !out.contains("- - "),
4227            "inline wiki-link not canonical after the nested-array fix; got:\n{out}"
4228        );
4229
4230        let reparsed = Frontmatter::parse(&out, Path::new("c.md")).unwrap();
4231        let fields = reparsed.link_fields();
4232        let links: Vec<(&str, &str)> = fields
4233            .iter()
4234            .map(|(k, l)| (k.as_str(), l.target.as_str()))
4235            .collect();
4236        assert_eq!(links, vec![("company", "records/companies/northstar")]);
4237        // Idempotent across repeated curator-loop writes.
4238        assert_eq!(
4239            reparsed.to_yaml(),
4240            out,
4241            "inline link is not a format fixed point"
4242        );
4243    }
4244
4245    // ── Regression: fence-line trailing whitespace is tolerated (#4) ───────────
4246
4247    #[test]
4248    fn regression_split_frontmatter_tolerates_trailing_whitespace_on_fences() {
4249        // A fence written `--- ` (trailing space — invisible in editors) is
4250        // indexed/validated clean by index.rs/validate.rs (both use `trim_end()`)
4251        // but, before the fix, hard-failed every read/edit surface routed through
4252        // `split_frontmatter`. All three must now agree.
4253        let text = "--- \ntype: note\nsummary: x\n---\t\nbody\n";
4254        let parsed = split_frontmatter(text, Path::new("f.md")).unwrap();
4255        assert_eq!(parsed.frontmatter_yaml, "type: note\nsummary: x\n");
4256        assert_eq!(parsed.body, "body\n");
4257
4258        // End to end through read_file's parse.
4259        let fm = Frontmatter::parse(&parsed.frontmatter_yaml, Path::new("f.md")).unwrap();
4260        assert_eq!(fm.type_.as_deref(), Some("note"));
4261    }
4262
4263    // ── Regression: CommonMark trailing-'#' heading rule (#6) ──────────────────
4264
4265    #[test]
4266    fn regression_heading_text_keeps_abutting_hash_drops_closing_sequence() {
4267        // `## C#` → `C#` (the `#` abuts content, not a closing sequence).
4268        assert_eq!(heading_text("## C#", 2), "C#");
4269        assert_eq!(heading_text("## F#", 2), "F#");
4270        assert_eq!(heading_text("## issue-123#", 2), "issue-123#");
4271        // A genuine ATX closing sequence (space before the `#` run) is dropped.
4272        assert_eq!(heading_text("## Title ##", 2), "Title");
4273        assert_eq!(heading_text("## Title #", 2), "Title");
4274        // All-hashes content collapses to empty.
4275        assert_eq!(heading_text("## ##", 2), "");
4276        // No trailing hashes — unchanged.
4277        assert_eq!(heading_text("## Plain", 2), "Plain");
4278    }
4279
4280    #[test]
4281    fn regression_extract_sections_keeps_csharp_heading_and_schema_type_binds() {
4282        // `dbmd sections` must report `C#`, not `C`.
4283        let secs = extract_sections("## C#\nbody\n");
4284        assert_eq!(secs.len(), 1);
4285        assert_eq!(secs[0].heading, "C#");
4286
4287        // And a `### c#` schema must register under `c#`, not `c`.
4288        let db = "---\ntype: db-md\n---\n\n## Schemas\n\n### c#\n- name (required)\n";
4289        let config = parse_db_md(db, Path::new("DB.md")).unwrap();
4290        assert!(
4291            config.schemas.contains_key("c#"),
4292            "schema bound to wrong key"
4293        );
4294        assert!(!config.schemas.contains_key("c"));
4295    }
4296
4297    // ── Regression: section line numbers offset by the frontmatter block (#7) ──
4298
4299    #[test]
4300    fn regression_extract_sections_in_file_reports_source_line_numbers() {
4301        // A heading on file line 6 (after a 4-line frontmatter block + 1 body
4302        // line) must be reported as L6, not the body-relative L2.
4303        let text = "---\ntype: note\nsummary: x\n---\nbody line\n## Heading\nmore\n";
4304        let secs = extract_sections_in_file(text);
4305        assert_eq!(secs.len(), 1);
4306        assert_eq!(secs[0].heading, "Heading");
4307        assert_eq!(secs[0].line, 6, "section line not offset by frontmatter");
4308
4309        // The body-relative helper is unchanged (validate relies on that frame).
4310        let body_secs = extract_sections("body line\n## Heading\nmore\n");
4311        assert_eq!(body_secs[0].line, 2);
4312
4313        // No frontmatter: whole text is body, no offset.
4314        let plain = extract_sections_in_file("## Top\nx\n## Next\n");
4315        assert_eq!(plain[0].line, 1);
4316        assert_eq!(plain[1].line, 3);
4317    }
4318
4319    // ── Regression: colon-form schema field bullet parses modifiers (#8) ───────
4320
4321    #[test]
4322    fn regression_colon_form_field_bullet_parses_modifiers() {
4323        // `- title: string, required` is the natural mis-spelling of
4324        // `- title (string, required)`; before the fix the whole text became the
4325        // field name and every modifier was silently lost.
4326        let f = parse_field_spec("- title: string, required");
4327        assert_eq!(f.name, "title");
4328        assert!(f.required, "required modifier lost on colon-form");
4329        assert_eq!(f.shape, Some(Shape::String));
4330
4331        // Through the schema-bullet classifier (the real path), it is a Field.
4332        match parse_schema_bullet("- title: string, required") {
4333            SchemaBullet::Field(f) => {
4334                assert_eq!(f.name, "title");
4335                assert!(f.required);
4336                assert_eq!(f.shape, Some(Shape::String));
4337            }
4338            other => panic!("expected Field, got {other:?}"),
4339        }
4340
4341        // A paren form whose modifiers contain a colon still parses by parens.
4342        let g = parse_field_spec("- status (enum: open, closed)");
4343        assert_eq!(g.name, "status");
4344        assert_eq!(
4345            g.enum_values,
4346            Some(vec!["open".to_string(), "closed".to_string()])
4347        );
4348    }
4349
4350    // ── Regression: comma inside a `default` value is preserved (#9) ───────────
4351
4352    #[test]
4353    fn regression_default_value_preserves_internal_commas() {
4354        let f = parse_field_spec("- title (default Director, Operations)");
4355        assert_eq!(
4356            f.default,
4357            Some(Value::String("Director, Operations".into())),
4358            "comma-bearing default truncated"
4359        );
4360
4361        let g = parse_field_spec("- region (default North America, EMEA fallback)");
4362        assert_eq!(
4363            g.default,
4364            Some(Value::String("North America, EMEA fallback".into()))
4365        );
4366
4367        // A single-token default still works (no regression).
4368        let h = parse_field_spec("- currency (default USD)");
4369        assert_eq!(h.default, Some(Value::String("USD".into())));
4370    }
4371
4372    // ── Regression: a `default` after `enum` is parsed, not swallowed (#10) ────
4373
4374    #[test]
4375    fn regression_default_after_enum_is_parsed_not_an_enum_member() {
4376        let f = parse_field_spec("- status (enum: open, closed, default open)");
4377        assert_eq!(
4378            f.enum_values,
4379            Some(vec!["open".to_string(), "closed".to_string()]),
4380            "`default open` leaked into the enum list"
4381        );
4382        assert_eq!(
4383            f.default,
4384            Some(Value::String("open".into())),
4385            "default after enum was dropped"
4386        );
4387
4388        // The bare `enum` keyword form, with a trailing default.
4389        let g = parse_field_spec("- status (enum, open, closed, default open)");
4390        assert_eq!(
4391            g.enum_values,
4392            Some(vec!["open".to_string(), "closed".to_string()])
4393        );
4394        assert_eq!(g.default, Some(Value::String("open".into())));
4395    }
4396
4397    // ── Regression: frozen-page policy does not fail open (#11) ────────────────
4398
4399    #[test]
4400    fn regression_frozen_match_handles_leading_slash() {
4401        let cfg = Config {
4402            frozen_pages: vec![PathBuf::from("/records/decisions/q1.md")],
4403            ..Config::default()
4404        };
4405        assert!(
4406            cfg.is_frozen(Path::new("records/decisions/q1.md")),
4407            "leading-slash entry failed open"
4408        );
4409        assert!(cfg.is_frozen(Path::new("records/decisions/q1")));
4410    }
4411
4412    #[test]
4413    fn regression_frozen_match_supports_globs() {
4414        let cfg = Config {
4415            frozen_pages: vec![PathBuf::from("records/decisions/*")],
4416            ..Config::default()
4417        };
4418        assert!(
4419            cfg.is_frozen(Path::new("records/decisions/q1.md")),
4420            "glob entry failed to protect a concrete file"
4421        );
4422        assert!(cfg.is_frozen(Path::new("records/decisions/q2.md")));
4423        // The glob does not cross a `/` segment.
4424        assert!(!cfg.is_frozen(Path::new("records/decisions/sub/q1.md")));
4425        // `**` crosses segments.
4426        let deep = Config {
4427            frozen_pages: vec![PathBuf::from("records/**")],
4428            ..Config::default()
4429        };
4430        assert!(deep.is_frozen(Path::new("records/decisions/sub/q1.md")));
4431        assert!(deep.is_frozen(Path::new("records/x.md")));
4432        assert!(!deep.is_frozen(Path::new("sources/x.md")));
4433        // A `*.md`-style intra-segment glob.
4434        let suffix = Config {
4435            frozen_pages: vec![PathBuf::from("records/decisions/q*")],
4436            ..Config::default()
4437        };
4438        assert!(suffix.is_frozen(Path::new("records/decisions/q1.md")));
4439        assert!(!suffix.is_frozen(Path::new("records/decisions/draft.md")));
4440    }
4441
4442    #[test]
4443    fn regression_frozen_glob_many_double_stars_does_not_backtrack_exponentially() {
4444        use std::time::Instant;
4445
4446        // A DB.md frozen-page bullet with many consecutive `**` segments and a
4447        // literal tail (`zzz`), matched against a deep target that ends in a
4448        // DIFFERENT segment (`file.md`), is the catastrophic-backtracking case:
4449        // the old two-way `glob_segments` recursion explored an exponential
4450        // number of (star, path) splits before concluding "no match" — ~119s for
4451        // 15 stars — hanging the store's entire write path (every write/rename/
4452        // fm-set funnels through `frozen_match`). The two-pointer matcher + `**`
4453        // collapse make this polynomial.
4454        let pat = format!("{}/zzz", vec!["**"; 30].join("/"));
4455        let target_path = format!("records/{}/file.md", vec!["a"; 40].join("/"));
4456        let cfg = Config {
4457            frozen_pages: vec![PathBuf::from(&pat)],
4458            ..Config::default()
4459        };
4460
4461        let start = Instant::now();
4462        let frozen = cfg.is_frozen(Path::new(&target_path));
4463        let elapsed = start.elapsed();
4464
4465        // The tail `zzz` never matches the target's `file.md`, so it is NOT frozen…
4466        assert!(
4467            !frozen,
4468            "non-matching deep target wrongly reported frozen (semantics changed)"
4469        );
4470        // …and the decision must be near-instant, not exponential. The pre-fix
4471        // code took tens of seconds here; a generous ceiling still fails loudly
4472        // if the blow-up ever returns.
4473        assert!(
4474            elapsed.as_secs() < 1,
4475            "frozen glob took {elapsed:?} — catastrophic backtracking is back"
4476        );
4477
4478        // Semantics preserved: the same many-`**` pattern with a tail that DOES
4479        // match still freezes the file (a real match still refuses the write).
4480        let pat_hit = format!("{}/file.md", vec!["**"; 30].join("/"));
4481        let cfg_hit = Config {
4482            frozen_pages: vec![PathBuf::from(&pat_hit)],
4483            ..Config::default()
4484        };
4485        assert!(
4486            cfg_hit.is_frozen(Path::new(&target_path)),
4487            "many-`**` pattern failed to freeze a genuinely-matching deep target"
4488        );
4489    }
4490
4491    #[test]
4492    fn frozen_glob_double_star_collapse_preserves_match_set() {
4493        // Collapsing consecutive `**` must not change which paths match: `**/**`
4494        // matches exactly what `**` does. Interleaved `**` and literals still
4495        // match across segments, and a non-matching literal tail still fails.
4496        let collapsed = Config {
4497            frozen_pages: vec![PathBuf::from("records/**/**/**/q1.md")],
4498            ..Config::default()
4499        };
4500        assert!(collapsed.is_frozen(Path::new("records/decisions/q1.md")));
4501        assert!(collapsed.is_frozen(Path::new("records/a/b/c/q1.md")));
4502        assert!(collapsed.is_frozen(Path::new("records/q1.md")));
4503        assert!(!collapsed.is_frozen(Path::new("records/a/b/c/q2.md")));
4504        assert!(!collapsed.is_frozen(Path::new("sources/a/q1.md")));
4505
4506        // `**` between two literals spans zero or more intermediate segments.
4507        let between = Config {
4508            frozen_pages: vec![PathBuf::from("records/**/draft.md")],
4509            ..Config::default()
4510        };
4511        assert!(between.is_frozen(Path::new("records/draft.md")));
4512        assert!(between.is_frozen(Path::new("records/a/b/draft.md")));
4513        assert!(!between.is_frozen(Path::new("records/a/b/final.md")));
4514    }
4515
4516    #[test]
4517    fn regression_frozen_entry_single_hyphen_comment_is_stripped() {
4518        // `records/decisions/q3.md - finalized` (single ASCII hyphen comment, no
4519        // backticks): the comment must be stripped so the entry is just the path.
4520        let path = extract_path_bullet("- records/decisions/q3.md - finalized");
4521        assert_eq!(path, "records/decisions/q3.md");
4522
4523        // End to end: such a bullet freezes the file.
4524        let cfg = Config {
4525            frozen_pages: vec![PathBuf::from(extract_path_bullet(
4526                "- records/decisions/q3.md - finalized",
4527            ))],
4528            ..Config::default()
4529        };
4530        assert!(
4531            cfg.is_frozen(Path::new("records/decisions/q3.md")),
4532            "single-hyphen-comment entry failed open"
4533        );
4534    }
4535}
dbmd_core/parser.rs

dbmd_core/
parser.rs