dbmd_core/parser.rs
1//! `parser` — read and write db.md markdown files.
2//!
3//! Parses the YAML frontmatter block, the markdown body, wiki-links, standard
4//! markdown links, `##` sections, and the structured sections of the `DB.md`
5//! config file. Also the atomic writer that round-trips a file while
6//! preserving the operator-edited body verbatim and emitting frontmatter in
7//! canonical key order.
8//!
9//! Strict on required fields, lenient on unknowns: any frontmatter key the
10//! spec doesn't recognize is preserved in [`Frontmatter::extra`] as ambient
11//! context and round-tripped untouched.
12
13use std::collections::BTreeMap;
14use std::path::{Path, PathBuf};
15
16use chrono::{DateTime, FixedOffset};
17use serde_norway::{Mapping, Value};
18
/// Folder names of the three canonical store layers.
///
/// A path counts as "content", and a wiki-link as "full-path", only when it
/// resolves under one of these directories.
const LAYER_DIRS: [&str; 3] = ["sources", "records", "wiki"];
22
23/// Errors produced while parsing a markdown file or the `DB.md` config.
24#[derive(Debug, thiserror::Error)]
25pub enum ParseError {
26 /// The frontmatter block was not valid YAML. Maps to validate code
27 /// `FM_MALFORMED_YAML`.
28 #[error("malformed YAML frontmatter in {file}: {source}")]
29 MalformedYaml {
30 /// The file whose frontmatter failed to parse.
31 file: PathBuf,
32 /// The underlying YAML error.
33 source: serde_norway::Error,
34 },
35
36 /// The file has no `---`-delimited frontmatter block at its very start.
37 #[error("missing frontmatter block in {file}")]
38 MissingFrontmatter {
39 /// The offending file.
40 file: PathBuf,
41 },
42
43 /// A required field was absent. Maps to validate code `FM_MISSING_TYPE`
44 /// (for `type`) and the per-type required-field codes.
45 #[error("missing required field '{key}' in {file}")]
46 MissingField {
47 /// The file missing the field.
48 file: PathBuf,
49 /// The required key.
50 key: String,
51 },
52
53 /// A timestamp field was not ISO-8601 / RFC3339. Maps to `FM_BAD_TIMESTAMP`.
54 #[error("bad timestamp in field '{key}' of {file}: {value}")]
55 BadTimestamp {
56 /// The file.
57 file: PathBuf,
58 /// The frontmatter key.
59 key: String,
60 /// The unparseable value.
61 value: String,
62 },
63
64 /// An I/O error reading the file.
65 #[error(transparent)]
66 Io(#[from] std::io::Error),
67}
68
69/// The parsed YAML frontmatter of a db.md file.
70///
71/// The universal-contract fields are typed accessors; everything else lands in
72/// [`extra`](Frontmatter::extra) as ambient context (unknown-field passthrough)
73/// and is round-tripped verbatim. The atomic writer re-emits keys in canonical
74/// order: `type`, `id`, `created`, `updated`, `summary` first, then
75/// type-specific fields, then `status` / `tags`.
76#[derive(Debug, Clone, Default, PartialEq)]
77pub struct Frontmatter {
78 /// `type` — required on content files; the primary query key.
79 pub type_: Option<String>,
80 /// `id` — optional; derived from the file path when absent.
81 pub id: Option<String>,
82 /// `created` — RFC3339; required and auto-set on content-file create.
83 pub created: Option<DateTime<FixedOffset>>,
84 /// `updated` — RFC3339; required and auto-maintained on content files.
85 pub updated: Option<DateTime<FixedOffset>>,
86 /// `summary` — the one-line catalog line; required on every content file.
87 pub summary: Option<String>,
88 /// `status` — optional lifecycle state.
89 pub status: Option<String>,
90 /// `tags` — optional flat list of short scalar labels.
91 pub tags: Vec<String>,
92 /// All other frontmatter keys (type-specific + custom), preserved verbatim
93 /// in insertion-stable sorted order. Wiki-link-valued fields keep their raw
94 /// YAML form here; [`Frontmatter::link_fields`] surfaces them as
95 /// [`WikiLink`]s.
96 pub extra: BTreeMap<String, Value>,
97}
98
99impl Frontmatter {
100 /// Parse a YAML frontmatter block (the text between the opening and closing
101 /// `---` fences, exclusive) into a [`Frontmatter`].
102 ///
103 /// Lenient on unknown keys (they go to [`extra`](Frontmatter::extra));
104 /// returns [`ParseError::MalformedYaml`] only on YAML that doesn't parse.
105 pub fn parse(yaml: &str, file: &Path) -> Result<Self, ParseError> {
106 // An empty (or whitespace-only) frontmatter block is a valid, empty
107 // mapping — not a YAML error.
108 let value: Value = if yaml.trim().is_empty() {
109 Value::Mapping(Mapping::new())
110 } else {
111 serde_norway::from_str(yaml).map_err(|source| ParseError::MalformedYaml {
112 file: file.to_path_buf(),
113 source,
114 })?
115 };
116
117 // Top-level frontmatter must be a mapping. A scalar or sequence at the
118 // top level is malformed for our purposes; surface it as such.
119 let map = match value {
120 Value::Mapping(m) => m,
121 Value::Null => Mapping::new(),
122 other => {
123 // serde_norway::Error has no public constructor, so let the
124 // deserializer decide: a value that coerces to a Mapping (e.g. a
125 // YAML-tagged mapping `!tag\n k: v`, where the tag is ambient) is
126 // accepted as that mapping; a genuine scalar or sequence top
127 // level fails to coerce and IS the malformed case. (Using a
128 // match here, not `expect_err`, avoids a panic on the
129 // tagged-mapping case, which deserializes to a Mapping just
130 // fine.)
131 match serde_norway::from_value::<Mapping>(other) {
132 Ok(m) => m,
133 Err(source) => {
134 return Err(ParseError::MalformedYaml {
135 file: file.to_path_buf(),
136 source,
137 });
138 }
139 }
140 }
141 };
142
143 let mut fm = Frontmatter::default();
144 for (k, v) in map {
145 let key = match k.as_str() {
146 Some(s) => s.to_string(),
147 // Non-string keys (`2026:`, `true:`, `3.14:`) are unusual but
148 // valid YAML; per SPEC § "Unknown fields pass through" they must
149 // not be corrupted on re-emit. Stringify them through the YAML
150 // scalar emitter — `2026`, `true`, `3.14` — NOT the Rust `Debug`
151 // formatter (which produced `Number(2026)`, `Bool(true)`, …), so
152 // the key text survives. `extra` is `String`-keyed, so on the
153 // write side the key re-emits as a quoted-string key carrying that
154 // text (e.g. `'2026':`) — the type narrows from number to string,
155 // but the data is no longer destroyed and ordinary string keys are
156 // wholly unaffected.
157 None => yaml_scalar_key(&k),
158 };
159 match key.as_str() {
160 // Coerce scalar values rather than `v.as_str()` (which is None
161 // for Number/Bool/Null). A bare scalar that YAML reads as a
162 // non-string — `summary: 2026`, `id: 100`, `status: 0` — would
163 // otherwise be set to None AND dropped (it is a matched arm, so
164 // the raw value never reaches `extra`), and `to_yaml` then omits
165 // the None field, so `dbmd format` (read_file -> write_file)
166 // silently deletes the line from disk. `scalar_string` mirrors
167 // the coercion `validate`/`store` already apply to these fields,
168 // so a numeric/bool-looking scalar is preserved as its string
169 // form and round-trips instead of being destroyed.
170 //
171 // A sequence/mapping value on a universal key (`status: [a, b]`,
172 // a nested-mapping `summary:`) is NOT a valid scalar; rather than
173 // let the matched arm consume-and-drop it (silent data loss on
174 // the next re-emit), `scalar_string` returns None and we fall
175 // through to preserving the raw value in `extra` so `to_yaml`
176 // re-emits it verbatim. The universal accessors stay None (the
177 // value was never a valid scalar for that field), but the
178 // operator's bytes are never destroyed.
179 "type" => match scalar_string(&v) {
180 Some(s) => fm.type_ = Some(s),
181 None => {
182 fm.extra.insert(key, v);
183 }
184 },
185 "id" => match scalar_string(&v) {
186 Some(s) => fm.id = Some(s),
187 None => {
188 fm.extra.insert(key, v);
189 }
190 },
191 "created" => fm.created = parse_timestamp(&v, "created", file)?,
192 "updated" => fm.updated = parse_timestamp(&v, "updated", file)?,
193 "summary" => match scalar_string(&v) {
194 Some(s) => fm.summary = Some(s),
195 None => {
196 fm.extra.insert(key, v);
197 }
198 },
199 "status" => match scalar_string(&v) {
200 Some(s) => fm.status = Some(s),
201 None => {
202 fm.extra.insert(key, v);
203 }
204 },
205 "tags" => match parse_tags_preserving(&v) {
206 Ok(tags) => fm.tags = tags,
207 // A `tags` value with a non-scalar item (`tags: [[vip]]`,
208 // `tags: [a, [b]]`) is preserved verbatim in `extra` rather
209 // than silently filtered down / erased on re-emit. The typed
210 // `tags` vec stays empty (no valid scalar list was present),
211 // so `to_yaml` won't ALSO emit a `tags:` from the vec.
212 Err(raw) => {
213 fm.extra.insert(key, raw);
214 }
215 },
216 _ => {
217 fm.extra.insert(key, v);
218 }
219 }
220 }
221 Ok(fm)
222 }
223
224 /// Serialize the frontmatter back to a YAML block (no `---` fences) in
225 /// canonical key order. Round-trips [`extra`](Frontmatter::extra) verbatim.
226 pub fn to_yaml(&self) -> String {
227 // Build an order-preserving mapping in canonical key order:
228 // type, id, created, updated, summary (universal head)
229 // <type-specific extra, BTreeMap-sorted>
230 // status, tags (universal tail)
231 // serde_norway::Mapping preserves insertion order, so one serialize call
232 // emits the block in exactly this order with correct YAML quoting.
233 let mut map = Mapping::new();
234
235 if let Some(t) = &self.type_ {
236 map.insert(Value::String("type".into()), Value::String(t.clone()));
237 }
238 if let Some(id) = &self.id {
239 map.insert(Value::String("id".into()), Value::String(id.clone()));
240 }
241 if let Some(created) = &self.created {
242 map.insert(
243 Value::String("created".into()),
244 Value::String(created.to_rfc3339()),
245 );
246 }
247 if let Some(updated) = &self.updated {
248 map.insert(
249 Value::String("updated".into()),
250 Value::String(updated.to_rfc3339()),
251 );
252 }
253 if let Some(summary) = &self.summary {
254 map.insert(
255 Value::String("summary".into()),
256 Value::String(summary.clone()),
257 );
258 }
259
260 // Type-specific + custom fields, in BTreeMap (sorted) order. Each value
261 // is canonicalized so a wiki-link round-trips to the form the writer and
262 // `dbmd validate` agree on — critically, the SPEC-canonical *unquoted*
263 // scalar `field: [[x]]` (which YAML parses to a nested `Seq[Seq[String]]`)
264 // is re-emitted as a quoted scalar `'[[x]]'` instead of the bracket-less
265 // block sequence `- - x` that a verbatim re-emit would produce and that
266 // destroys the link. See [`canonicalize_extra_value`].
267 for (k, v) in &self.extra {
268 map.insert(Value::String(k.clone()), canonicalize_extra_value(v));
269 }
270
271 if let Some(status) = &self.status {
272 map.insert(
273 Value::String("status".into()),
274 Value::String(status.clone()),
275 );
276 }
277 if !self.tags.is_empty() {
278 map.insert(
279 Value::String("tags".into()),
280 Value::Sequence(self.tags.iter().cloned().map(Value::String).collect()),
281 );
282 }
283
284 if map.is_empty() {
285 return String::new();
286 }
287 serde_norway::to_string(&Value::Mapping(map)).unwrap_or_default()
288 }
289
290 /// True if the file is content (under `sources/`, `records/`, or `wiki/`)
291 /// and not an `index.md`. Used by validate to decide which files require a
292 /// `summary`. Meta files (`DB.md`, `index.md`, `log.md`) return false.
293 pub fn is_content_file(path: &Path) -> bool {
294 // index.md is a meta file at every level, never content.
295 if path.file_name().and_then(|n| n.to_str()) == Some("index.md") {
296 return false;
297 }
298 // Content iff some path component is one of the three layer dirs. This
299 // works for both store-relative (`sources/emails/x.md`) and absolute
300 // (`/home/db/sources/emails/x.md`) paths. DB.md / log.md sit at the
301 // root, under no layer, so they fall through to false.
302 path.components().any(|c| {
303 c.as_os_str()
304 .to_str()
305 .is_some_and(|s| LAYER_DIRS.contains(&s))
306 })
307 }
308
309 /// Resolve the file's effective `id`: the explicit `id` field if present,
310 /// otherwise derived from the store-relative path (filename without `.md`).
311 pub fn effective_id(&self, store_relative_path: &Path) -> String {
312 if let Some(id) = &self.id {
313 if !id.is_empty() {
314 return id.clone();
315 }
316 }
317 // Derived id = filename without the `.md` extension.
318 store_relative_path
319 .file_stem()
320 .and_then(|s| s.to_str())
321 .unwrap_or_default()
322 .to_string()
323 }
324
325 /// Read a single frontmatter key as a raw YAML [`Value`], looking in the
326 /// typed fields first and then [`extra`](Frontmatter::extra).
327 pub fn get(&self, key: &str) -> Option<Value> {
328 match key {
329 "type" => self.type_.clone().map(Value::String),
330 "id" => self.id.clone().map(Value::String),
331 "created" => self.created.map(|d| Value::String(d.to_rfc3339())),
332 "updated" => self.updated.map(|d| Value::String(d.to_rfc3339())),
333 "summary" => self.summary.clone().map(Value::String),
334 "status" => self.status.clone().map(Value::String),
335 "tags" => {
336 if self.tags.is_empty() {
337 None
338 } else {
339 Some(Value::Sequence(
340 self.tags.iter().cloned().map(Value::String).collect(),
341 ))
342 }
343 }
344 _ => self.extra.get(key).cloned(),
345 }
346 }
347
348 /// Set a single frontmatter key from a string value, routing universal-
349 /// contract keys to their typed fields and everything else to
350 /// [`extra`](Frontmatter::extra). Used by `dbmd fm set`.
351 pub fn set(&mut self, key: &str, value: &str) -> Result<(), ParseError> {
352 match key {
353 "type" => self.type_ = Some(value.to_string()),
354 "id" => self.id = Some(value.to_string()),
355 "created" => {
356 self.created = Some(parse_rfc3339(value, "created", Path::new("<fm set>"))?)
357 }
358 "updated" => {
359 self.updated = Some(parse_rfc3339(value, "updated", Path::new("<fm set>"))?)
360 }
361 "summary" => self.summary = Some(value.to_string()),
362 "status" => self.status = Some(value.to_string()),
363 "tags" => {
364 // Accept either a YAML flow list (`[a, b]`) or a single scalar
365 // tag. Anything that parses to a sequence becomes the tag list;
366 // otherwise the whole string is one tag.
367 self.tags = match serde_norway::from_str::<Value>(value) {
368 Ok(Value::Sequence(seq)) => parse_tags(&Value::Sequence(seq)),
369 _ => vec![value.to_string()],
370 };
371 }
372 _ => {
373 // A custom / type-specific field. The value is a scalar string by
374 // default, but the spec's list-valued link fields (e.g.
375 // `meeting.attendees`, SPEC § Linking) must serialize as a YAML
376 // block sequence of quoted wiki-links — never the flow-form string
377 // `"[[[a]], [[b]]]"`, which `dbmd validate` rejects as
378 // `WIKI_LINK_FLOW_FORM_LIST`. When the value parses as a YAML
379 // sequence whose every item is a clean single wiki-link, store the
380 // canonical sequence so `to_yaml` emits block form. Everything else
381 // — plain text, and a single inline `[[x]]` (which YAML reads as a
382 // nested `Seq[Seq[String]]`, not a list of link strings) — stays a
383 // verbatim scalar string, preserving the prior behavior.
384 let stored = parse_link_list_value(value)
385 .unwrap_or_else(|| Value::String(value.to_string()));
386 self.extra.insert(key.to_string(), stored);
387 }
388 }
389 Ok(())
390 }
391
392 /// Extract every frontmatter field whose value is a wiki-link (scalar
393 /// inline form or a block-sequence list), pairing each with its key. The
394 /// validate engine checks these against `(link)` schema annotations.
395 pub fn link_fields(&self) -> Vec<(String, WikiLink)> {
396 let mut out = Vec::new();
397 // `summary` may carry navigational wiki-links (spec encourages it).
398 if let Some(summary) = &self.summary {
399 for link in extract_wiki_links(summary, Path::new("")) {
400 out.push(("summary".to_string(), link));
401 }
402 }
403 // Every type-specific / custom field: a scalar wiki-link or a list of
404 // wiki-links, in either the quoted (`"[[x]]"`) or the canonical unquoted
405 // (`[[x]]`) form. See [`links_in_field_value`] for the YAML shapes.
406 for (key, value) in &self.extra {
407 for link in links_in_field_value(value) {
408 out.push((key.clone(), link));
409 }
410 }
411 out
412 }
413}
414
/// An in-store wiki-link reference, written `[[target]]` or
/// `[[target|display]]`.
///
/// The `target` is stored exactly as written.
/// [`is_full_path`](WikiLink::is_full_path) tells a full store-relative path
/// (the doctrine) apart from a short-form target (a validation error).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WikiLink {
    /// Link target as written: no surrounding `[[ ]]`, no `|display` part.
    pub target: String,
    /// The `|display` text override, when one was given.
    pub display: Option<String>,
    /// `true` if `target` is a full store-relative path (it contains a `/`
    /// and resolves under a known layer). `false` for a short-form target
    /// such as `sarah-chen`, which validate reports as
    /// `WIKI_LINK_SHORT_FORM`.
    pub is_full_path: bool,
    /// `true` if `target` ends in `.md`. Validate warns
    /// `WIKI_LINK_HAS_EXTENSION`; the canonical writers emit the bare form.
    pub has_md_extension: bool,
    /// Position of the link as `(file, line, col)`; line and column are
    /// 1-based.
    pub location: (PathBuf, u32, u32),
}
436
/// A standard markdown link, `[text](url)`: an external reference.
///
/// These are kept in a stream of their own, apart from [`WikiLink`], so that
/// external targets stay visible to the toolkit without being mixed up with
/// in-store edges. They are not graph-validated.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownLink {
    /// Text found between `[` and `]`.
    pub text: String,
    /// URL or path found between `(` and `)`.
    pub url: String,
    /// Position of the link as `(file, line, col)`, 1-based.
    pub location: (PathBuf, u32, u32),
}
449
/// One `##`/`###` section of a markdown body: its heading together with the
/// text it covers (from the heading line up to the line before the next
/// heading of equal or shallower depth).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
    /// Heading text with the leading `#`s removed.
    pub heading: String,
    /// Depth of the heading, i.e. how many `#`s introduce it.
    pub level: u8,
    /// 1-based line number of the heading.
    pub line: u32,
    /// Text of the section, starting at the heading line and ending just
    /// before the next sibling-or-shallower heading, copied from the
    /// original body.
    pub body: String,
}
465
466/// The parsed structured content of a store's `DB.md` config file.
467///
468/// All four parts are optional in the source; absent parts fall back to spec
469/// defaults. Produced by [`parse_db_md`].
470#[derive(Debug, Clone, Default, PartialEq)]
471pub struct Config {
472 /// Body of the `## Agent instructions` section — free-form prose passed to
473 /// the agent's system prompt.
474 pub agent_instructions: Option<String>,
475 /// `## Policies` → `### Frozen pages`: store-relative paths the toolkit
476 /// refuses to write (`POLICY_FROZEN_PAGE`).
477 pub frozen_pages: Vec<PathBuf>,
478 /// `## Policies` → `### Ignored types`: type names the curator never
479 /// synthesizes (still readable as ambient context).
480 pub ignored_types: Vec<String>,
481 /// `## Schemas` → one entry per `### <type>` sub-section.
482 pub schemas: BTreeMap<String, Schema>,
483}
484
485impl Config {
486 /// The `### Frozen pages` entry that matches a store-relative `target`, if
487 /// any. The **single** frozen-page matcher every write surface must funnel
488 /// through so the policy is enforced identically on `write` / `fm set` /
489 /// `fm init` / `link` / `rename` / `format`.
490 ///
491 /// Comparison is normalized so a policy line and a write target match
492 /// regardless of incidental spelling differences:
493 /// - `/` path separators on every OS,
494 /// - a single leading `./` dropped,
495 /// - a trailing `.md` dropped on **both** sides — `parse_db_md` stores
496 /// frozen entries verbatim, so an operator who writes the natural
497 /// extensionless spelling (`records/decisions/q1`) must protect the file
498 /// (`records/decisions/q1.md`) exactly as the `.md` spelling does.
499 ///
500 /// Returns the matched config entry verbatim (its original spelling) so the
501 /// caller can name it in the `POLICY_FROZEN_PAGE` refusal.
502 pub fn frozen_match(&self, target: &Path) -> Option<PathBuf> {
503 let want = normalize_frozen_path(target);
504 self.frozen_pages
505 .iter()
506 .find(|frozen| {
507 let pat = normalize_frozen_path(frozen);
508 // A literal entry matches by exact normalized equality; an entry
509 // carrying a `*`/`**` glob matches by segment-wise glob so a
510 // pattern like `records/decisions/*` actually protects the
511 // concrete files under it instead of silently failing open.
512 if pat.contains('*') {
513 frozen_glob_matches(&pat, &want)
514 } else {
515 pat == want
516 }
517 })
518 .cloned()
519 }
520
521 /// True if `target` (store-relative) is a frozen page. Convenience wrapper
522 /// over [`Config::frozen_match`] for callers that only need presence.
523 pub fn is_frozen(&self, target: &Path) -> bool {
524 self.frozen_match(target).is_some()
525 }
526}
527
/// Normalize a path for frozen-page comparison.
///
/// The result is the path's segments joined with `/`, with
/// - a leading `/` or drive prefix dropped,
/// - `.` components dropped,
/// - `..` components resolved lexically (each one removes the segment before
///   it; a `..` with nothing left to remove is discarded),
/// - a trailing `.md` dropped.
///
/// Both the policy entry and the write target pass through this before the
/// equality / glob test, so the match is separator-, `./`-, leading-`/`-,
/// `..`- and `.md`-insensitive. Resolving `..` matters for enforcement: if it
/// were merely discarded, `wiki/../records/decisions/q1.md` would normalize
/// to `wiki/records/decisions/q1`, never equal the frozen
/// `records/decisions/q1`, and the freeze would silently fail OPEN.
///
/// Segments that are not valid UTF-8 are skipped.
fn normalize_frozen_path(p: &Path) -> String {
    use std::path::Component;

    let mut segments: Vec<&str> = Vec::new();
    for component in p.components() {
        match component {
            Component::Normal(segment) => {
                if let Some(segment) = segment.to_str() {
                    segments.push(segment);
                }
            }
            // `a/b/../c` names `a/c`; popping an empty stack is a no-op, so a
            // leading `..` is discarded.
            Component::ParentDir => {
                segments.pop();
            }
            // A leading `/`, a drive prefix, and `.` carry no segment text.
            Component::RootDir | Component::Prefix(_) | Component::CurDir => {}
        }
    }

    let joined = segments.join("/");
    match joined.strip_suffix(".md") {
        Some(stem) => stem.to_string(),
        None => joined,
    }
}
553
554/// Match a normalized frozen-page glob `pat` against a normalized target `path`,
555/// segment by segment. `*` matches any run of characters *within a single path
556/// segment* (never crossing `/`); `**` as a whole segment matches zero or more
557/// whole segments. Both sides are already `normalize_frozen_path`-normalized, so
558/// this only deals with `/`-joined segment text. Keeps the substrate dependency-
559/// free (no glob crate) while making `records/decisions/*` actually freeze the
560/// files beneath it instead of failing open.
561fn frozen_glob_matches(pat: &str, path: &str) -> bool {
562 let pat_segs: Vec<&str> = pat.split('/').collect();
563 let path_segs: Vec<&str> = path.split('/').collect();
564 glob_segments(&pat_segs, &path_segs)
565}
566
567/// Recursive segment matcher for [`frozen_glob_matches`]. `**` consumes any
568/// number of path segments; every other pattern segment must match exactly one
569/// path segment (with `*` wildcards inside it).
570fn glob_segments(pat: &[&str], path: &[&str]) -> bool {
571 match pat.split_first() {
572 None => path.is_empty(),
573 Some((&"**", rest_pat)) => {
574 // `**` matches zero segments here, or one-or-more by consuming a path
575 // segment and recursing on the same `**`.
576 if glob_segments(rest_pat, path) {
577 return true;
578 }
579 !path.is_empty() && glob_segments(pat, &path[1..])
580 }
581 Some((&first_pat, rest_pat)) => match path.split_first() {
582 Some((&first_path, rest_path)) => {
583 glob_segment_text(first_pat, first_path) && glob_segments(rest_pat, rest_path)
584 }
585 None => false,
586 },
587 }
588}
589
/// Test one glob segment against one path segment. A `*` stands for any run
/// of characters (possibly empty) inside the segment; every other character
/// is taken literally.
fn glob_segment_text(pat: &str, seg: &str) -> bool {
    if !pat.contains('*') {
        return pat == seg;
    }

    // The literal fragments between the `*`s must occur in order: the first
    // anchored at the start, the last anchored at the end, and the ones in
    // between anywhere in the text not yet consumed.
    let fragments: Vec<&str> = pat.split('*').collect();
    let last = fragments.len() - 1;
    let mut rest = seg;

    for (index, fragment) in fragments.iter().copied().enumerate() {
        if fragment.is_empty() {
            // An empty fragment comes from a `*` at an edge or from `**`.
            continue;
        }
        if index == 0 {
            match rest.strip_prefix(fragment) {
                Some(after) => rest = after,
                None => return false,
            }
        } else if index == last {
            // Must close the segment without reusing consumed text.
            return rest.ends_with(fragment);
        } else {
            match rest.find(fragment) {
                Some(at) => rest = &rest[at + fragment.len()..],
                None => return false,
            }
        }
    }
    true
}
623
624/// A user-declared type schema parsed from a `DB.md` `### <type>` sub-section.
625/// The store's `## Schemas` is the **only** source of schema enforcement — the
626/// toolkit ships no built-in or implicit per-type schema (see SPEC § Schemas).
627#[derive(Debug, Clone, Default, PartialEq)]
628pub struct Schema {
629 /// One [`FieldSpec`] per bulleted field line, in source order.
630 pub fields: Vec<FieldSpec>,
631 /// `- unique: <field>[, <field> …]` directives — each inner vec is one
632 /// uniqueness constraint over the listed field(s) (compound when >1). Two
633 /// records of this type whose listed values collide warn as
634 /// `DUP_UNIQUE_KEY`.
635 pub unique_keys: Vec<Vec<String>>,
636 /// `- summary_template: <template>` directive — the `{field}` interpolation
637 /// pattern `dbmd fm init` / `dbmd write` use to compose a default `summary`
638 /// for this type. `None` falls back to the body's first paragraph.
639 pub summary_template: Option<String>,
640 /// `- shard: by-date | flat` directive — whether records of this type are
641 /// date-sharded on disk (`records/<type>/<YYYY>/<MM>/…`) or kept flat.
642 /// `None` = no directive declared, so the store's built-in default for the
643 /// type applies ([`crate::store::Store::type_shards`]); `Some(true)` forces
644 /// date-sharding (e.g. a custom event type the toolkit has no built-in for);
645 /// `Some(false)` forces flat. This is the v0.2 generic-model way to declare
646 /// sharding — the toolkit ships no implicit per-type behavior beyond the
647 /// example-type defaults.
648 pub shard: Option<bool>,
649}
650
651/// One field declaration inside a [`Schema`]: `- <name> (<modifiers>)`.
652///
653/// Modifiers are comma-separated inside the parens; this captures the
654/// recognized ones as typed fields and stashes anything unrecognized in
655/// [`unknown_modifiers`](FieldSpec::unknown_modifiers) (surfaced as `Info`).
656#[derive(Debug, Clone, Default, PartialEq)]
657pub struct FieldSpec {
658 /// The field name.
659 pub name: String,
660 /// `required` modifier present.
661 pub required: bool,
662 /// The shape modifier (`string`/`int`/`bool`/`date`/`email`/`currency`/
663 /// `url`), if any.
664 pub shape: Option<Shape>,
665 /// `link to <prefix>/` — the store-relative prefix a wiki-link target must
666 /// start with. The trailing slash is required in the source syntax.
667 pub link_prefix: Option<PathBuf>,
668 /// `default <value>` — the value written when the field is absent.
669 pub default: Option<Value>,
670 /// `enum: <v1>, <v2>, ...` — the allowed values (must be the last modifier
671 /// on the line because of its own commas).
672 pub enum_values: Option<Vec<String>>,
673 /// Any modifiers not in the recognized vocabulary, preserved verbatim;
674 /// validate surfaces these as `Info`, never errors.
675 pub unknown_modifiers: Vec<String>,
676}
677
/// The shape modifiers a schema field may carry. Validate enforces the
/// matching value shape and reports `SCHEMA_SHAPE_MISMATCH` on violation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Shape {
    /// Any scalar string (`string`).
    String,
    /// An integer (`int`).
    Int,
    /// A boolean (`bool`).
    Bool,
    /// An RFC3339 / ISO-8601 date (`date`).
    Date,
    /// An email address of the form `<local>@<domain>` (`email`).
    Email,
    /// A currency amount (`currency`).
    Currency,
    /// A URL (`url`).
    Url,
}
697
/// A raw file separated into its frontmatter block and its body.
///
/// `body` is whatever follows the closing `---` fence, untouched; the writer
/// keeps it byte-for-byte so operator edits are never reflowed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFile {
    /// Raw frontmatter YAML: the text between the two fences, fences not
    /// included.
    pub frontmatter_yaml: String,
    /// Verbatim body: everything after the closing `---`.
    pub body: String,
}
709
710/// Split a file's full text into its frontmatter block and body. The
711/// frontmatter block must be the very first thing in the file, delimited by
712/// `---` on its own line at start and end. Returns
713/// [`ParseError::MissingFrontmatter`] if absent.
714pub fn split_frontmatter(text: &str, file: &Path) -> Result<ParsedFile, ParseError> {
715 // Tolerate a single leading UTF-8 BOM (U+FEFF) before the opening fence,
716 // matching `store::frontmatter_block` and `index::extract_frontmatter_block`
717 // which already strip it. Without this, a BOM-prefixed file (common from
718 // Windows / exported markdown dropped into `sources/`) gets walked and
719 // indexed by `dbmd index` yet hard-fails every write/edit surface that
720 // routes through `read_file` (`fm get/set`, `format`, `link`, `write`). The
721 // BOM is dropped from the emitted body so the canonical writer never carries
722 // it forward.
723 let text = text.strip_prefix('\u{feff}').unwrap_or(text);
724
725 // The opening fence must be the very first line: `---`, no leading
726 // whitespace, nothing before it. Trailing whitespace on the fence line is
727 // tolerated via `trim_end()` (which strips spaces/tabs as well as CR/LF) so
728 // this matches `index::extract_frontmatter_block` and
729 // `validate::split_frontmatter`, both of which use `trim_end()`. Without this
730 // agreement a fence written `--- ` (a single trailing space — invisible in an
731 // editor, easily produced by hand edits or exporters) was indexed and
732 // validated clean by those scanners yet hard-failed every write/edit surface
733 // routed through `read_file` (`fm get/set`, `format`, `link`, `write`) — the
734 // same cross-scanner drift class already fixed for the UTF-8 BOM above.
735 let mut lines = text.split_inclusive('\n');
736 let first = lines.next().unwrap_or("");
737 if first.trim_end() != "---" {
738 return Err(ParseError::MissingFrontmatter {
739 file: file.to_path_buf(),
740 });
741 }
742
743 // Scan for the closing fence line. Track byte offsets so we can slice the
744 // YAML (between fences, exclusive) and the body (verbatim, after the
745 // closing fence's line terminator).
746 let opening_len = first.len();
747 let mut offset = opening_len;
748 for line in lines {
749 if line.trim_end() == "---" {
750 let yaml = &text[opening_len..offset];
751 let body_start = offset + line.len();
752 let body = &text[body_start..];
753 return Ok(ParsedFile {
754 frontmatter_yaml: yaml.to_string(),
755 body: body.to_string(),
756 });
757 }
758 offset += line.len();
759 }
760
761 // Opening fence present but no closing fence: malformed frontmatter block.
762 Err(ParseError::MissingFrontmatter {
763 file: file.to_path_buf(),
764 })
765}
766
767/// Read a file from disk and parse it into typed [`Frontmatter`] plus the
768/// verbatim body string.
769pub fn read_file(path: &Path) -> Result<(Frontmatter, String), ParseError> {
770 let text = std::fs::read_to_string(path)?;
771 let parsed = split_frontmatter(&text, path)?;
772 let fm = Frontmatter::parse(&parsed.frontmatter_yaml, path)?;
773 Ok((fm, parsed.body))
774}
775
776/// Atomically write a markdown file from frontmatter + body: emit the
777/// frontmatter in canonical key order, then the body verbatim, via a
778/// temp-file-rename so a reader never sees a half-written file. Preserves the
779/// operator-edited body exactly as given.
780pub fn write_file(path: &Path, frontmatter: &Frontmatter, body: &str) -> Result<(), ParseError> {
781 let contents = render_file(frontmatter, body);
782
783 // One durable, atomic write for all primary data (see `crate::fsx`):
784 // temp-file + fsync + rename + parent-fsync. Content records are primary
785 // data, so they get the durable path (unlike the rebuildable index).
786 crate::fsx::write_atomic(path, contents.as_bytes())?;
787 Ok(())
788}
789
790/// Atomically create a markdown file from frontmatter + body, refusing with
791/// [`std::io::ErrorKind::AlreadyExists`] if the destination already exists.
792///
793/// This is the create-new sibling of [`write_file`]: same canonical rendering
794/// and durable temp-file path, but backed by [`crate::fsx::write_atomic_new`] so
795/// two concurrent creators for the same path cannot both succeed.
796pub fn write_file_new(
797 path: &Path,
798 frontmatter: &Frontmatter,
799 body: &str,
800) -> Result<(), ParseError> {
801 let contents = render_file(frontmatter, body);
802 crate::fsx::write_atomic_new(path, contents.as_bytes())?;
803 Ok(())
804}
805
806fn render_file(frontmatter: &Frontmatter, body: &str) -> String {
807 let yaml = frontmatter.to_yaml();
808 // `to_yaml` already terminates each block with a newline. Compose the file
809 // as: opening fence, frontmatter YAML, closing fence, then body verbatim.
810 let mut contents = String::with_capacity(yaml.len() + body.len() + 8);
811 contents.push_str("---\n");
812 contents.push_str(&yaml);
813 contents.push_str("---\n");
814 contents.push_str(body);
815 contents
816}
817
818/// Extract every wiki-link from a body (and inline frontmatter), returning the
819/// structured [`WikiLink`] stream with short-form / `.md`-extension flags and
820/// `(file, line, col)` locations set.
821pub fn extract_wiki_links(body: &str, file: &Path) -> Vec<WikiLink> {
822 static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
823 let re = RE.get_or_init(|| {
824 // [[target]] or [[target|display]]; target/display exclude brackets and
825 // (for target) the `|` separator so nested forms don't over-match.
826 regex::Regex::new(r"\[\[([^\[\]|]+?)(?:\|([^\[\]]*))?\]\]").expect("valid wiki-link regex")
827 });
828
829 let mut out = Vec::new();
830 for (line_idx, line) in body.lines().enumerate() {
831 for caps in re.captures_iter(line) {
832 let whole = caps.get(0).expect("group 0 always present");
833 let target = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
834 let display = caps.get(2).map(|m| m.as_str().to_string());
835 out.push(WikiLink {
836 is_full_path: target_is_full_path(&target),
837 has_md_extension: target_has_md_extension(&target),
838 target,
839 display,
840 location: (
841 file.to_path_buf(),
842 (line_idx as u32) + 1,
843 char_column(line, whole.start()),
844 ),
845 });
846 }
847 }
848 out
849}
850
851/// Extract every standard markdown link `[text](url)` from a body into a
852/// separate stream, kept distinct from wiki-links.
853pub fn extract_markdown_links(body: &str, file: &Path) -> Vec<MarkdownLink> {
854 static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
855 let re = RE.get_or_init(|| {
856 // [text](url). `text` excludes brackets so a wiki-link `[[x]]` (which
857 // has `]]`, not `](`) never matches; `url` excludes `)` and whitespace.
858 regex::Regex::new(r"\[([^\[\]]*)\]\(([^)\s]*)\)").expect("valid markdown-link regex")
859 });
860
861 let mut out = Vec::new();
862 for (line_idx, line) in body.lines().enumerate() {
863 for caps in re.captures_iter(line) {
864 let whole = caps.get(0).expect("group 0 always present");
865 out.push(MarkdownLink {
866 text: caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string(),
867 url: caps.get(2).map(|m| m.as_str()).unwrap_or("").to_string(),
868 location: (
869 file.to_path_buf(),
870 (line_idx as u32) + 1,
871 char_column(line, whole.start()),
872 ),
873 });
874 }
875 }
876 out
877}
878
879/// Detect the frontmatter wiki-link-list mis-encoding: a wiki-link *list*
880/// written so YAML parses it as nested sequences instead of a clean list of
881/// strings. Returns the offending keys so validate can emit
882/// `WIKI_LINK_FLOW_FORM_LIST`.
883///
884/// The subtlety is that `[[x]]` is YAML for "a list containing `[x]`", so the
885/// shapes nest:
886///
887/// - **Scalar inline** `company: [[records/x]]` → `Seq[ Seq[String] ]`
888/// (double-nested). This is the spec's scalar wiki-link form — NOT flagged.
889/// - **Flow list** `attendees: [[[a]], [[b]]]` → `Seq[ Seq[Seq[String]], … ]`
890/// (triple-nested). The list mis-encoding — flagged.
891/// - **Unquoted block list** (`- [[a]]` per line) → also triple-nested, so it
892/// is flagged too; the canonical list form must quote each item
893/// (`- "[[a]]"`), which parses to a clean `Seq[String, …]` and is NOT flagged.
894///
895/// So the discriminator is nesting depth: a *list* mis-encoding has at least one
896/// item that is itself a sequence-of-sequences, whereas a scalar inline link's
897/// single item is a sequence-of-scalars.
898pub fn detect_flow_form_link_lists(frontmatter_yaml: &str) -> Vec<String> {
899 let value: Value = match serde_norway::from_str(frontmatter_yaml) {
900 Ok(v) => v,
901 // Malformed YAML is FM_MALFORMED_YAML's job, not ours; report nothing.
902 Err(_) => return Vec::new(),
903 };
904 let Value::Mapping(map) = value else {
905 return Vec::new();
906 };
907
908 let mut out = Vec::new();
909 for (k, v) in &map {
910 if let Value::Sequence(items) = v {
911 // Triple-nesting: some outer item is a sequence that itself holds a
912 // sequence. Scalar inline `[[x]]` is only double-nested, so it
913 // never matches.
914 let is_link_list = items.iter().any(|item| match item {
915 Value::Sequence(inner) => inner.iter().any(|x| matches!(x, Value::Sequence(_))),
916 _ => false,
917 });
918 if is_link_list {
919 if let Some(key) = k.as_str() {
920 out.push(key.to_string());
921 }
922 }
923 }
924 }
925 out
926}
927
928/// Extract the `##`/`###` sections of a markdown body into a flat list with
929/// body slices.
930pub fn extract_sections(body: &str) -> Vec<Section> {
931 // Keep each line's start so we can slice the body verbatim (exact newlines).
932 let lines: Vec<&str> = body.split_inclusive('\n').collect();
933
934 // First pass: classify heading levels (0 = not a heading), honoring fenced
935 // code blocks so a `## x` inside a ``` fence is not treated as a heading.
936 let mut levels: Vec<u8> = Vec::with_capacity(lines.len());
937 let mut fence: Option<(u8, usize)> = None;
938 for line in &lines {
939 let content = line.trim_end_matches(['\n', '\r']);
940 if let Some(f) = fence {
941 if is_closing_fence(content, f) {
942 fence = None;
943 }
944 levels.push(0);
945 continue;
946 }
947 if let Some(opened) = opening_fence(content) {
948 fence = Some(opened);
949 levels.push(0);
950 continue;
951 }
952 levels.push(heading_level(content));
953 }
954
955 // Second pass: emit `##`+ headings; each section body runs from its heading
956 // line to the next heading at an equal-or-shallower level (exclusive).
957 let mut sections = Vec::new();
958 for (i, &lvl) in levels.iter().enumerate() {
959 if lvl < 2 {
960 continue;
961 }
962 let heading_line = lines[i].trim_end_matches(['\n', '\r']);
963 let heading = heading_text(heading_line, lvl);
964
965 let mut end = lines.len();
966 for (j, &other) in levels.iter().enumerate().skip(i + 1) {
967 if other != 0 && other <= lvl {
968 end = j;
969 break;
970 }
971 }
972
973 sections.push(Section {
974 heading,
975 level: lvl,
976 line: (i + 1) as u32,
977 body: lines[i..end].concat(),
978 });
979 }
980 sections
981}
982
983/// Extract the `##`/`###` sections of a **whole file** (frontmatter + body),
984/// returning each [`Section`] with `line` numbered against the *source file*,
985/// not the body.
986///
987/// [`extract_sections`] numbers headings 1-based within the body it is handed —
988/// the right frame for callers that already track the frontmatter offset
989/// (`validate` adds `fm_end_line`). But the single-file views (`dbmd sections`,
990/// `dbmd outline`) present `Section::line` as a source line an agent can jump to;
991/// because every db.md file opens with a frontmatter block, the body-relative
992/// number is off by the block's length (`opening fence + frontmatter lines +
993/// closing fence`) for every file. This helper does the offset once, in the
994/// parser, so those surfaces report true file lines. A file with no leading
995/// frontmatter block is treated as all-body (offset 0), so the function never
996/// fails just because a file lacks frontmatter.
997pub fn extract_sections_in_file(text: &str) -> Vec<Section> {
998 // Tolerate a leading BOM the same way `split_frontmatter` does, so the line
999 // count and the body slice agree with the read path.
1000 let text = text.strip_prefix('\u{feff}').unwrap_or(text);
1001
1002 // Find the body and how many source lines precede it. The body begins right
1003 // after the closing fence; the number of lines consumed by the frontmatter
1004 // block (both fences + the YAML between) is the offset to add to each
1005 // body-relative heading line.
1006 let (body, offset) = match split_frontmatter(text, Path::new("<sections>")) {
1007 Ok(parsed) => {
1008 // Lines before the body = total lines in `text` minus lines in body.
1009 let total_lines = count_lines(text);
1010 let body_lines = count_lines(&parsed.body);
1011 (parsed.body, total_lines.saturating_sub(body_lines))
1012 }
1013 // No frontmatter block: the whole text is body, no offset.
1014 Err(_) => (text.to_string(), 0),
1015 };
1016
1017 let mut sections = extract_sections(&body);
1018 for s in &mut sections {
1019 s.line += offset;
1020 }
1021 sections
1022}
1023
/// Number of lines `s` spans, for line-number offsetting.
///
/// Every `\n` ends a line, and trailing text without a final `\n` counts as
/// one more line. The empty string spans zero lines.
fn count_lines(s: &str) -> u32 {
    // `split_inclusive` yields one piece per newline-terminated line plus one
    // for an unterminated tail, and nothing at all for an empty string.
    s.split_inclusive('\n').count() as u32
}
1038
1039/// Parse a store's `DB.md` file into a [`Config`]: the `## Agent instructions`
1040/// prose, `## Policies` (`### Frozen pages` + `### Ignored types`), and
1041/// `## Schemas` (`### <type>` field-bullet blocks). Unrecognized sections are
1042/// ignored; absent sections leave their [`Config`] fields at default.
1043pub fn parse_db_md(text: &str, file: &Path) -> Result<Config, ParseError> {
1044 // The structured sections live in the body (after frontmatter). DB.md must
1045 // still start with a valid `---` block (`type: db-md`); if it's missing we
1046 // surface MissingFrontmatter like any other file.
1047 let parsed = split_frontmatter(text, file)?;
1048 let _frontmatter = Frontmatter::parse(&parsed.frontmatter_yaml, file)?;
1049 let sections = extract_sections(&parsed.body);
1050
1051 let mut config = Config::default();
1052 // Track which H2 region each H3 belongs to as we walk the flat list.
1053 let mut current_h2: Option<String> = None;
1054
1055 for section in §ions {
1056 match section.level {
1057 2 => {
1058 let name = section.heading.trim().to_ascii_lowercase();
1059 current_h2 = Some(name.clone());
1060 if name == "agent instructions" {
1061 let prose = section_prose(§ion.body);
1062 if !prose.is_empty() {
1063 config.agent_instructions = Some(prose);
1064 }
1065 }
1066 }
1067 3 => {
1068 let h2 = current_h2.as_deref().unwrap_or("");
1069 let h3 = section.heading.trim().to_ascii_lowercase();
1070 match (h2, h3.as_str()) {
1071 ("policies", "frozen pages") => {
1072 config.frozen_pages = bullet_lines(§ion.body)
1073 .into_iter()
1074 .map(|b| PathBuf::from(extract_path_bullet(&b)))
1075 .collect();
1076 }
1077 ("policies", "ignored types") => {
1078 config.ignored_types = bullet_lines(§ion.body)
1079 .into_iter()
1080 .flat_map(|b| extract_type_list_bullet(&b))
1081 .collect();
1082 }
1083 ("schemas", _) => {
1084 // The H3 heading text (as written) is the type name.
1085 let type_name = section.heading.trim().to_string();
1086 let mut schema = Schema::default();
1087 for b in bullet_lines(§ion.body) {
1088 match parse_schema_bullet(&b) {
1089 SchemaBullet::Field(f) => schema.fields.push(f),
1090 SchemaBullet::Unique(k) if !k.is_empty() => {
1091 schema.unique_keys.push(k)
1092 }
1093 SchemaBullet::SummaryTemplate(t) if !t.is_empty() => {
1094 schema.summary_template = Some(t)
1095 }
1096 SchemaBullet::Shard(Some(b)) => schema.shard = Some(b),
1097 // Empty `unique:`/`summary_template:`, or a `shard:`
1098 // with an unrecognized value — ignored.
1099 SchemaBullet::Unique(_)
1100 | SchemaBullet::SummaryTemplate(_)
1101 | SchemaBullet::Shard(None) => {}
1102 }
1103 }
1104 config.schemas.insert(type_name, schema);
1105 }
1106 _ => {}
1107 }
1108 }
1109 _ => {}
1110 }
1111 }
1112
1113 Ok(config)
1114}
1115
/// One parsed bullet inside a `### <type>` schema block: an ordinary field, or a
/// reserved directive (`unique:` / `summary_template:` / `shard:`). The names
/// `unique`, `summary_template`, and `shard` are reserved and cannot be used as
/// field names.
///
/// Produced by `parse_schema_bullet`; the directive keyword is matched
/// case-insensitively there.
#[derive(Debug)]
enum SchemaBullet {
    /// An ordinary `- <name> (<modifiers>)` field.
    Field(FieldSpec),
    /// `- unique: <field>[, <field> …]` — a (possibly compound) uniqueness key.
    /// Holds the comma-separated field names, trimmed, with empty entries
    /// dropped; the list itself may be empty when nothing follows the colon.
    Unique(Vec<String>),
    /// `- summary_template: <template>` — the default-`summary` pattern, stored
    /// trimmed (possibly empty).
    SummaryTemplate(String),
    /// `- shard: by-date | flat` — date-shard records of this type, or keep them
    /// flat. `Some(true)` = date-sharded, `Some(false)` = flat, `None` = an
    /// unrecognized value, ignored like an unknown modifier.
    Shard(Option<bool>),
}
1132
1133/// Classify one `## Schemas` bullet as a directive or a field. The directive
1134/// forms are `- unique: a, b, …` and `- summary_template: …`; the keyword check
1135/// guards against false positives — a field like `- status (enum: a, b)` has a
1136/// `(` before any `:`, so its head isn't a bare reserved keyword and it parses
1137/// as a [`FieldSpec`].
1138fn parse_schema_bullet(bullet_line: &str) -> SchemaBullet {
1139 let line = bullet_line.trim();
1140 let line = line
1141 .strip_prefix("- ")
1142 .or_else(|| line.strip_prefix("* "))
1143 .or_else(|| line.strip_prefix("+ "))
1144 .or_else(|| line.strip_prefix('-'))
1145 .unwrap_or(line)
1146 .trim();
1147
1148 if let Some((head, rest)) = line.split_once(':') {
1149 match head.trim().to_ascii_lowercase().as_str() {
1150 "unique" => {
1151 let fields = rest
1152 .split(',')
1153 .map(|f| f.trim().to_string())
1154 .filter(|f| !f.is_empty())
1155 .collect();
1156 return SchemaBullet::Unique(fields);
1157 }
1158 "summary_template" => {
1159 return SchemaBullet::SummaryTemplate(rest.trim().to_string());
1160 }
1161 "shard" => {
1162 // `by-date` (synonyms: date/sharded/true) enables date-sharding;
1163 // `flat` (none/false) forces flat; anything else is ignored.
1164 let v = match rest.trim().to_ascii_lowercase().as_str() {
1165 "by-date" | "date" | "sharded" | "true" => Some(true),
1166 "flat" | "none" | "false" => Some(false),
1167 _ => None,
1168 };
1169 return SchemaBullet::Shard(v);
1170 }
1171 _ => {}
1172 }
1173 }
1174
1175 SchemaBullet::Field(parse_field_spec(bullet_line))
1176}
1177
1178/// Parse a single `## Schemas` field-bullet line — `- <name> (<modifiers>)` —
1179/// into a [`FieldSpec`], capturing recognized modifiers and stashing the rest
1180/// in [`FieldSpec::unknown_modifiers`].
1181pub fn parse_field_spec(bullet_line: &str) -> FieldSpec {
1182 // Strip the leading bullet marker (`- ` / `* ` / `+ `) and surrounding ws.
1183 let line = bullet_line.trim();
1184 let line = line
1185 .strip_prefix("- ")
1186 .or_else(|| line.strip_prefix("* "))
1187 .or_else(|| line.strip_prefix("+ "))
1188 .or_else(|| line.strip_prefix('-'))
1189 .unwrap_or(line)
1190 .trim();
1191
1192 // Split `<name> (<modifiers>)` — the canonical paren form — OR the natural
1193 // mis-spelling `<name>: <modifiers>` (colon instead of parens). The two
1194 // delimiters are interchangeable for the field head; whichever appears FIRST
1195 // wins, so a paren form whose modifiers contain a colon (`status (enum: a,
1196 // b)`) still parses by parens (the `(` precedes the `:`), while a bare
1197 // `title: string, required` parses by colon instead of being swallowed whole
1198 // into the field name with every modifier silently dropped.
1199 let paren = line.find('(');
1200 let colon = line.find(':');
1201 // Choose the head delimiter. The paren form wins when its `(` precedes any
1202 // `:` (so `status (enum: a, b)` parses by parens, the colon being inside the
1203 // modifiers); otherwise a `:` before the paren — or with no paren at all —
1204 // selects the colon form `<name>: <modifiers>`, the natural mis-spelling that
1205 // must NOT be swallowed whole into the field name with every modifier lost.
1206 let use_paren = matches!((paren, colon), (Some(p), c) if c.is_none_or(|c| p < c));
1207 let (name, modifiers) = if use_paren {
1208 let open = paren.expect("use_paren implies a paren");
1209 let name = line[..open].trim().to_string();
1210 let after = &line[open + 1..];
1211 let mods = match after.rfind(')') {
1212 Some(close) => &after[..close],
1213 None => after, // tolerate a missing close paren
1214 };
1215 (name, mods.trim())
1216 } else if let Some(c) = colon {
1217 // Colon form: everything after the first colon is the modifier list,
1218 // parsed identically to the parenthesized modifiers below.
1219 let name = line[..c].trim().to_string();
1220 (name, line[c + 1..].trim())
1221 } else {
1222 // Neither delimiter: a free-form optional field of any shape — name only.
1223 (line.to_string(), "")
1224 };
1225
1226 let mut spec = FieldSpec {
1227 name,
1228 ..FieldSpec::default()
1229 };
1230
1231 if modifiers.is_empty() {
1232 return spec;
1233 }
1234
1235 // Modifiers are comma-separated. `enum` and `default` are special: their own
1236 // values may contain commas, so each is a *greedy* clause that runs from its
1237 // keyword to the start of the next recognized greedy clause (or end of line).
1238 // This lets `default North America, EMEA fallback` keep its comma and lets a
1239 // `default …` written after an `enum …` still be recognized, instead of the
1240 // value being truncated at the first comma or absorbed into the enum list.
1241 let raw: Vec<&str> = modifiers.split(',').collect();
1242 let mut i = 0;
1243 while i < raw.len() {
1244 let token = raw[i].trim();
1245 if token.is_empty() {
1246 i += 1;
1247 continue;
1248 }
1249 let lower = token.to_ascii_lowercase();
1250
1251 if lower == "required" {
1252 spec.required = true;
1253 i += 1;
1254 } else if let Some(shape) = shape_from_str(&lower) {
1255 spec.shape = Some(shape);
1256 i += 1;
1257 } else if let Some(rest) = lower.strip_prefix("link to ") {
1258 // The trailing slash is required in the source; store the prefix
1259 // without it so `Path::starts_with` comparisons are clean.
1260 let prefix = token["link to ".len()..].trim().trim_end_matches('/');
1261 let _ = rest; // lowercase form only used for the keyword match
1262 spec.link_prefix = Some(PathBuf::from(prefix));
1263 i += 1;
1264 } else if token.len() >= "default ".len() && lower.starts_with("default ") {
1265 // Greedy `default <value>`: the value is this token (after the
1266 // keyword) plus every following comma-token up to the next greedy
1267 // clause, rejoined with the commas the split removed — so a comma
1268 // inside the default value is preserved. Original case is kept.
1269 let end = next_greedy_clause(&raw, i + 1);
1270 let mut value = token["default ".len()..].to_string();
1271 for tok in &raw[i + 1..end] {
1272 value.push(',');
1273 value.push_str(tok);
1274 }
1275 spec.default = Some(Value::String(value.trim().to_string()));
1276 i = end;
1277 } else if lower == "enum" || lower.starts_with("enum:") {
1278 // Greedy `enum` (bare `enum, a, b` or `enum: a, b`): the values run
1279 // from here to the next greedy clause (e.g. a trailing `default …`),
1280 // NOT unconditionally to end-of-line — so a `default` after `enum` is
1281 // parsed instead of swallowed as a bogus enum member.
1282 let end = next_greedy_clause(&raw, i + 1);
1283 // Rejoin this clause's tokens (trimmed so the `enum` head sits at the
1284 // start), drop the leading `enum`/`enum:` head, then re-split the
1285 // remainder into values.
1286 let joined = raw[i..end].join(",");
1287 let joined = joined.trim();
1288 let after_kw = match joined.find(':') {
1289 // `enum: a, b` — values follow the colon.
1290 Some(colon) => &joined[colon + 1..],
1291 // bare `enum, a, b` — values follow the keyword itself.
1292 None => joined.get("enum".len()..).unwrap_or(""),
1293 };
1294 let values: Vec<String> = after_kw
1295 .split(',')
1296 .map(|v| v.trim().to_string())
1297 .filter(|v| !v.is_empty())
1298 .collect();
1299 spec.enum_values = Some(values);
1300 i = end;
1301 } else {
1302 // Unrecognized modifier — captured verbatim, surfaced as Info.
1303 spec.unknown_modifiers.push(token.to_string());
1304 i += 1;
1305 }
1306 }
1307
1308 spec
1309}
1310
1311// ── Private helpers ─────────────────────────────────────────────────────────
1312
1313/// Parse a frontmatter timestamp value into a `DateTime<FixedOffset>`. A `null`
1314/// is treated as absent; anything else must be an RFC3339 string.
1315fn parse_timestamp(
1316 value: &Value,
1317 key: &str,
1318 file: &Path,
1319) -> Result<Option<DateTime<FixedOffset>>, ParseError> {
1320 match value {
1321 Value::Null => Ok(None),
1322 Value::String(s) => parse_rfc3339(s, key, file).map(Some),
1323 other => Err(ParseError::BadTimestamp {
1324 file: file.to_path_buf(),
1325 key: key.to_string(),
1326 value: format!("{other:?}"),
1327 }),
1328 }
1329}
1330
1331/// Parse an RFC3339 timestamp string, mapping failure to [`ParseError::BadTimestamp`].
1332fn parse_rfc3339(s: &str, key: &str, file: &Path) -> Result<DateTime<FixedOffset>, ParseError> {
1333 DateTime::parse_from_rfc3339(s.trim()).map_err(|_| ParseError::BadTimestamp {
1334 file: file.to_path_buf(),
1335 key: key.to_string(),
1336 value: s.to_string(),
1337 })
1338}
1339
1340/// Coerce a YAML scalar value to its string form for the universal-contract
1341/// fields (`type`/`id`/`summary`/`status`). Mirrors `validate::scalar_string`
1342/// and `store::yaml_scalar_string` so the four modules agree on one coercion
1343/// rule: a bare numeric/bool scalar (`id: 100`, `summary: 2026`, `status: 0`)
1344/// is preserved as its string form rather than being read as None and silently
1345/// dropped on the next `to_yaml` re-emit. Returns `None` only for genuinely
1346/// non-scalar values (sequences, mappings, null), which were never a valid
1347/// shape for these fields.
1348fn scalar_string(value: &Value) -> Option<String> {
1349 match value {
1350 Value::String(s) => Some(s.clone()),
1351 Value::Number(n) => Some(n.to_string()),
1352 Value::Bool(b) => Some(b.to_string()),
1353 _ => None,
1354 }
1355}
1356
1357/// Read a `tags` value into a flat `Vec<String>`. Accepts a sequence of scalars
1358/// (the canonical form) or a single scalar (coerced to a one-element list).
1359fn parse_tags(value: &Value) -> Vec<String> {
1360 match value {
1361 Value::Sequence(items) => items
1362 .iter()
1363 .filter_map(|v| match v {
1364 Value::String(s) => Some(s.clone()),
1365 Value::Number(n) => Some(n.to_string()),
1366 Value::Bool(b) => Some(b.to_string()),
1367 _ => None,
1368 })
1369 .collect(),
1370 Value::String(s) => vec![s.clone()],
1371 _ => Vec::new(),
1372 }
1373}
1374
1375/// Read a `tags` value into a flat `Vec<String>` **without losing data**: a
1376/// sequence of clean scalars (the canonical form) or a single scalar coerce to a
1377/// string list. Any other shape — a sequence with a non-scalar item
1378/// (`tags: [[vip]]` → `Seq[Seq[String]]`, `tags: [a, [b]]`), or a mapping — is
1379/// rejected as `Err(value.clone())` so the caller preserves the raw value in
1380/// `extra` rather than silently filtering items out / erasing the field on the
1381/// next re-emit. This is the `tags` analog of routing a non-scalar universal
1382/// value to pass-through instead of the destroy path.
1383fn parse_tags_preserving(value: &Value) -> Result<Vec<String>, Value> {
1384 match value {
1385 Value::Sequence(items) => {
1386 let mut out = Vec::with_capacity(items.len());
1387 for item in items {
1388 match item {
1389 Value::String(s) => out.push(s.clone()),
1390 Value::Number(n) => out.push(n.to_string()),
1391 Value::Bool(b) => out.push(b.to_string()),
1392 // A non-scalar item (nested sequence/mapping/null) means this
1393 // is not a clean tag list; preserve the whole value verbatim.
1394 _ => return Err(value.clone()),
1395 }
1396 }
1397 Ok(out)
1398 }
1399 Value::String(s) => Ok(vec![s.clone()]),
1400 Value::Number(n) => Ok(vec![n.to_string()]),
1401 Value::Bool(b) => Ok(vec![b.to_string()]),
1402 // A mapping / null `tags` value is not a list; preserve it verbatim.
1403 _ => Err(value.clone()),
1404 }
1405}
1406
1407/// Render a non-string YAML mapping key as the scalar text YAML would emit for
1408/// it (`2026`, `true`, `3.14`, …), so a numeric/bool/float frontmatter key
1409/// preserves its key *text* on round-trip instead of being rewritten to its Rust
1410/// `Debug` form (`Number(2026)`, `Bool(true)`, `'Null'`). The key re-emits as a
1411/// string-typed key carrying the original text (`'2026':`) — the type narrows to
1412/// string, but the operator's data is no longer corrupted, and ordinary string
1413/// keys are wholly unaffected. Falls back to `Debug` only for a key shape that
1414/// cannot be a scalar (a sequence/mapping key — not expressible in our
1415/// `String`-keyed `extra`), which never occurs in practice.
1416fn yaml_scalar_key(key: &Value) -> String {
1417 match key {
1418 Value::String(s) => s.clone(),
1419 Value::Number(n) => n.to_string(),
1420 Value::Bool(b) => b.to_string(),
1421 Value::Null => "null".to_string(),
1422 // Non-scalar key: not representable as a plain `extra` string key; keep
1423 // the defensive Debug form so nothing panics (unreachable in practice).
1424 other => format!("{other:?}"),
1425 }
1426}
1427
1428/// Parse a single `[[target|display]]` string into a [`WikiLink`] with no
1429/// location, or `None` if the string is not a bare wiki-link. Used for
1430/// frontmatter-valued links where there is no body position to report.
1431fn parse_wiki_link_str(s: &str) -> Option<WikiLink> {
1432 let s = s.trim();
1433 let inner = s.strip_prefix("[[")?.strip_suffix("]]")?;
1434 // Reject anything with further brackets (e.g. the nested flow-form item),
1435 // which is not a clean single wiki-link.
1436 if inner.contains('[') || inner.contains(']') {
1437 return None;
1438 }
1439 let (target, display) = match inner.split_once('|') {
1440 Some((t, d)) => (t.to_string(), Some(d.to_string())),
1441 None => (inner.to_string(), None),
1442 };
1443 Some(WikiLink {
1444 is_full_path: target_is_full_path(&target),
1445 has_md_extension: target_has_md_extension(&target),
1446 target,
1447 display,
1448 location: (PathBuf::new(), 0, 0),
1449 })
1450}
1451
1452/// Extract every wiki-link from a single frontmatter field value, accepting the
1453/// two canonical forms the spec defines (SPEC § Linking):
1454///
1455/// - a **scalar** wiki-link field, in either the quoted (`f: "[[x]]"`) or the
1456/// canonical unquoted inline (`f: [[x]]`) form, and
1457/// - a **list** field whose items are quoted wiki-link strings
1458/// (`- "[[x]]"`).
1459///
1460/// YAML eats the brackets of an unquoted `[[x]]`, leaving a flow-list-in-a-list,
1461/// so the parsed [`Value`] shapes are not what one would naively expect:
1462///
1463/// | source | parsed `Value` | here |
1464/// |--------------------------------|------------------------------------|------|
1465/// | `f: "[[x]]"` (quoted) | `String("[[x]]")` | link |
1466/// | `f: [[x]]` (unquoted) | `Seq[ Seq[String("x")] ]` | link |
1467/// | `f:`\n` - "[[x]]"`(quoted) | `Seq[ String("[[x]]"), … ]` | link |
1468/// | `f:`\n` - [[x]]` (unquoted) | `Seq[ Seq[Seq[String("x")]], … ]` | — |
1469///
1470/// The last row — an *unquoted list* — parses identically to the flow-form list
1471/// `f: [[a], [b]]` and is a mis-encoding the canonical writer never emits;
1472/// `dbmd validate` reports it as `WIKI_LINK_FLOW_FORM_LIST` (see
1473/// [`detect_flow_form_link_lists`]). It is deliberately NOT surfaced here, so an
1474/// edge enumerator only ever sees the valid canonical forms.
1475///
1476/// The unquoted scalar (`Seq[Seq[String]]`, one element) is told apart from a
1477/// plain one-item flow list (`f: [x]` → `Seq[String]`, one fewer nesting level)
1478/// by [`unquoted_inline_link`] requiring its argument to be a `Sequence`.
1479fn links_in_field_value(value: &Value) -> Vec<WikiLink> {
1480 // Quoted scalar: `field: "[[x]]"`.
1481 if let Value::String(s) = value {
1482 return parse_wiki_link_str(s).into_iter().collect();
1483 }
1484 let Value::Sequence(items) = value else {
1485 return Vec::new();
1486 };
1487 // Unquoted scalar inline form `field: [[x]]` → `Seq[ Seq[String(x)] ]`.
1488 // (A quoted single-item list `["[[x]]"]` is `Seq[String]`, so its lone item
1489 // is a `String`, not a `Sequence`, and falls through to the list path below.)
1490 if items.len() == 1 {
1491 if let Some(link) = unquoted_inline_link(&items[0]) {
1492 return vec![link];
1493 }
1494 }
1495 // Otherwise a list of quoted wiki-link strings; non-string items (the
1496 // unquoted-list mis-encoding) are left for validate to flag.
1497 items
1498 .iter()
1499 .filter_map(|item| parse_wiki_link_str(item.as_str()?))
1500 .collect()
1501}
1502
1503/// Canonicalize one `extra` frontmatter value for emission by [`Frontmatter::to_yaml`].
1504///
1505/// The read path ([`Frontmatter::parse`]) stores every unknown key's raw parsed
1506/// [`Value`] verbatim, so a SPEC-canonical *unquoted* inline scalar wiki-link
1507/// (`company: [[records/companies/northstar]]`) lands in `extra` as the nested
1508/// shape YAML produces for it — `Seq[ Seq[String("records/companies/northstar")] ]`.
1509/// Re-emitting that verbatim yields the block sequence
1510///
1511/// ```text
1512/// company:
1513/// - - records/companies/northstar
1514/// ```
1515///
1516/// which has lost the `[[ ]]` brackets entirely: the link is destroyed, and every
1517/// reader (validate, graph, backlinks) stops seeing the edge. This normalizes such
1518/// a value back into the canonical emitted form before it is written:
1519///
1520/// - a **scalar** wiki-link (quoted `String("[[x]]")` or unquoted `Seq[Seq[String]]`,
1521/// one element) → a quoted scalar `Value::String("[[x]]")`, which serde_norway emits
1522/// inline as `'[[x]]'` — the form the finding confirms survives a round-trip and
1523/// that [`links_in_field_value`] reads back as the same scalar link;
1524/// - a **list** of wiki-links (in any spelling [`links_in_field_value`] accepts) →
1525/// a block `Value::Sequence` of quoted-link strings (`- "[[x]]"`), matching the
1526/// `set` write-in path and the canonical list form;
1527/// - everything else → returned verbatim (the common no-op for non-link values).
1528///
1529/// `|display` is preserved in both link branches. This is the single point that
1530/// keeps all three curator-loop writers (`format`, `fm set`, `link`) from
1531/// corrupting a pre-existing canonical link, since they all funnel through
1532/// `to_yaml`.
1533fn canonicalize_extra_value(value: &Value) -> Value {
1534 match value {
1535 // Scalar wiki-link, quoted form: `field: "[[x]]"` → `String("[[x]]")`.
1536 // Re-emit as a quoted scalar so it stays a string (never the brackets-as-
1537 // YAML nested sequence). Non-link strings are returned untouched.
1538 Value::String(s) => match parse_wiki_link_str(s) {
1539 Some(link) => Value::String(wiki_link_literal(&link)),
1540 None => value.clone(),
1541 },
1542 Value::Sequence(items) => {
1543 // Scalar wiki-link, unquoted inline form: `field: [[x]]` parses to a
1544 // one-element `Seq[ Seq[String(x)] ]`. Collapse back to the quoted
1545 // scalar string so the link is preserved rather than block-emitted.
1546 if items.len() == 1 {
1547 if let Some(link) = unquoted_inline_link(&items[0]) {
1548 return Value::String(wiki_link_literal(&link));
1549 }
1550 }
1551 // List of wiki-links: re-emit as a block sequence of quoted-link
1552 // strings, the canonical list form `to_yaml` renders block-style and
1553 // `links_in_field_value` accepts. Only canonicalize when *every* item
1554 // is a clean single wiki-link; a list with any non-link item is left
1555 // verbatim so unrelated sequences (and the unquoted-list mis-encoding
1556 // validate flags) are untouched.
1557 let mut links = Vec::with_capacity(items.len());
1558 for item in items {
1559 match link_from_flow_list_item(item) {
1560 Some(link) => links.push(link),
1561 None => return value.clone(),
1562 }
1563 }
1564 if links.is_empty() {
1565 return value.clone();
1566 }
1567 Value::Sequence(
1568 links
1569 .iter()
1570 .map(|l| Value::String(wiki_link_literal(l)))
1571 .collect(),
1572 )
1573 }
1574 // Mappings, scalars other than strings, nulls: nothing to canonicalize.
1575 _ => value.clone(),
1576 }
1577}
1578
1579/// Render a [`WikiLink`] back to its `[[target]]` / `[[target|display]]` literal,
1580/// the inner form the canonical writer emits and `links_in_field_value` accepts.
1581fn wiki_link_literal(link: &WikiLink) -> String {
1582 match &link.display {
1583 Some(d) => format!("[[{}|{}]]", link.target, d),
1584 None => format!("[[{}]]", link.target),
1585 }
1586}
1587
1588/// Recognize the inner token of an unquoted scalar `[[x]]`: after YAML strips the
1589/// outer brackets, the inner `[x]` is a single-element sequence `Seq[String(x)]`.
1590/// Reconstructs `[[x]]` (preserving any `|display`) and parses it, or returns
1591/// `None` when `v` is not that shape. Requiring a `Sequence` here is what keeps a
1592/// plain one-item flow list (`field: [x]` → `Seq[String]`, not `Seq[Seq[String]]`)
1593/// from being mistaken for a wiki-link.
1594fn unquoted_inline_link(v: &Value) -> Option<WikiLink> {
1595 let Value::Sequence(items) = v else {
1596 return None;
1597 };
1598 if items.len() != 1 {
1599 return None;
1600 }
1601 let s = items[0].as_str()?;
1602 // A clean unquoted wiki-link has no further brackets inside it.
1603 if s.contains('[') || s.contains(']') {
1604 return None;
1605 }
1606 parse_wiki_link_str(&format!("[[{s}]]"))
1607}
1608
1609/// Decide whether a `dbmd fm set` / `--fm` value string is a **list of
1610/// wiki-links** that should be stored as a YAML block sequence, returning the
1611/// canonical `Value::Sequence` of quoted-link strings when so.
1612///
1613/// The value path of every write surface stringifies its argument; without this
1614/// a required list-of-links field (`meeting.attendees`) was unwritable in valid
1615/// form — passing `[[[a]], [[b]]]` stored a single scalar string that mis-parses
1616/// and trips `WIKI_LINK_FLOW_FORM_LIST` / `WIKI_LINK_BROKEN`. This recognizes the
1617/// two list spellings an agent naturally types and normalizes both to the block
1618/// form the canonical writer emits and `dbmd validate` accepts:
1619///
1620/// - flow list of quoted links — `["[[a]]", "[[b]]"]`
1621/// - flow list of unquoted links — `[[[a]], [[b]]]` (YAML: `Seq[Seq[String], …]`)
1622///
1623/// Returns `None` (⇒ caller stores a verbatim scalar string) for everything that
1624/// is not unambiguously a list of clean wiki-links — plain text, a single inline
1625/// `[[x]]` (YAML reads it as a one-item `Seq[Seq[String]]`, kept scalar so it
1626/// renders inline), an empty list, or a list with any non-link item. A single
1627/// link must stay scalar; only genuine multi-item-or-explicit lists become
1628/// sequences, matching `links_in_field_value`'s acceptance rule so writer and
1629/// validator never disagree.
1630fn parse_link_list_value(value: &str) -> Option<Value> {
1631 let trimmed = value.trim();
1632 // Only a YAML *flow sequence* literal is a list candidate; anything not
1633 // wrapped in `[ … ]` is a scalar (a bare `[[x]]` is wrapped, and handled by
1634 // the single-inline-link guard below).
1635 if !(trimmed.starts_with('[') && trimmed.ends_with(']')) {
1636 return None;
1637 }
1638 let Ok(Value::Sequence(items)) = serde_norway::from_str::<Value>(trimmed) else {
1639 return None;
1640 };
1641 // A single inline `[[x]]` parses to `Seq[ Seq[String(x)] ]` (one item, itself
1642 // a sequence) — that is the unquoted *scalar* form, not a list. Keep it scalar
1643 // so it round-trips to the inline `field: [[x]]` rather than a one-item block
1644 // list. `links_in_field_value` reads it back as a scalar link either way.
1645 if items.len() == 1 && unquoted_inline_link(&items[0]).is_some() {
1646 return None;
1647 }
1648 // Every item must resolve to exactly one clean wiki-link, in any of the flow
1649 // spellings an agent types (see [`link_from_flow_list_item`]).
1650 let mut links = Vec::with_capacity(items.len());
1651 for item in &items {
1652 links.push(link_from_flow_list_item(item)?);
1653 }
1654 if links.is_empty() {
1655 return None;
1656 }
1657 // Normalize to a block sequence of quoted-link strings — the form `to_yaml`
1658 // renders block-style and `links_in_field_value` accepts. `|display` is
1659 // preserved.
1660 let normalized = links
1661 .iter()
1662 .map(|l| Value::String(wiki_link_literal(l)))
1663 .collect();
1664 Some(Value::Sequence(normalized))
1665}
1666
1667/// Recognize one clean wiki-link from a single **item** of a YAML flow sequence,
1668/// across the spellings an agent types for a list. After top-level flow parsing,
1669/// a list item arrives in one of:
1670///
1671/// - quoted — `"[[x]]"` ⇒ `String("[[x]]")`
1672/// - unquoted in a flow list — `[[x]]` inside `[…]` ⇒ `Seq[ Seq[String(x)] ]`
1673/// (one level deeper than a bare unquoted scalar, because the surrounding list
1674/// adds a wrapper); unwrap the single-element wrapper, then read the inline
1675/// `Seq[String(x)]` with [`unquoted_inline_link`].
1676///
1677/// Returns `None` for any item that is not exactly one clean wiki-link, so the
1678/// caller falls back to a scalar string and never fabricates a partial list.
1679fn link_from_flow_list_item(item: &Value) -> Option<WikiLink> {
1680 match item {
1681 Value::String(s) => parse_wiki_link_str(s),
1682 Value::Sequence(inner) => {
1683 // Unquoted list item `[[x]]` → `Seq[ Seq[String(x)] ]`: peel the lone
1684 // wrapper to expose the inline-link shape `Seq[String(x)]`.
1685 //
1686 // Only this triple-nested shape is a wiki-link. We deliberately do
1687 // NOT fall back to `unquoted_inline_link(item)` on the bare double
1688 // nesting `Seq[String(x)]` (a plain one-element string list `[x]`):
1689 // that fallback fabricated a wiki-link out of an ordinary nested
1690 // string list — `groups: [[alpha], [beta]]` (data `[["alpha"],
1691 // ["beta"]]`) was rewritten to `- '[[alpha]]'` / `- '[[beta]]'`,
1692 // silently changing the field's type and manufacturing short-form
1693 // links the tool then flags as `WIKI_LINK_SHORT_FORM`. An unknown
1694 // nested string list must pass through verbatim (SPEC § "Unknown
1695 // fields pass through").
1696 if inner.len() == 1 {
1697 if let Some(link) = unquoted_inline_link(&inner[0]) {
1698 return Some(link);
1699 }
1700 }
1701 None
1702 }
1703 _ => None,
1704 }
1705}
1706
1707/// A target is a full store-relative path when its first path segment is one of
1708/// the three canonical layer dirs and at least one `/` separator follows. A
1709/// trailing `.md` does not affect this classification.
1710fn target_is_full_path(target: &str) -> bool {
1711 let target = target.trim();
1712 match target.split_once('/') {
1713 Some((head, _rest)) => LAYER_DIRS.contains(&head),
1714 None => false,
1715 }
1716}
1717
/// Whether the trimmed target ends in a literal, case-sensitive `.md` (the
/// condition validate reports as `WIKI_LINK_HAS_EXTENSION`).
fn target_has_md_extension(target: &str) -> bool {
    target.trim().strip_suffix(".md").is_some()
}
1723
/// 1-based character (Unicode scalar) column of `byte_offset` within `line`.
///
/// Counts the characters that end at or before `byte_offset` and adds one. For
/// an offset on a character boundary this is the number of characters in
/// `line[..byte_offset]` plus one.
///
/// Unlike slicing `line[..byte_offset]`, this never panics:
///
/// - an offset that falls *inside* a multi-byte character reports the column of
///   that character (the offset is effectively rounded down to its start);
/// - an offset past the end of `line` reports the column just after the last
///   character.
///
/// The result saturates at `u32::MAX` rather than wrapping.
fn char_column(line: &str, byte_offset: usize) -> u32 {
    let preceding = line
        .char_indices()
        .take_while(|&(start, ch)| start + ch.len_utf8() <= byte_offset)
        .count();
    u32::try_from(preceding).map_or(u32::MAX, |n| n.saturating_add(1))
}
1728
/// Position of the first token at or after index `from` in `raw` that opens a
/// greedy modifier clause, or `raw.len()` if no such token remains (including
/// when `from` is already past the end).
///
/// A token opens a greedy clause when, trimmed and ASCII-lowercased, it is
/// exactly `enum`, starts with `enum:`, or starts with `default ` (note the
/// trailing space). Callers use the result as the exclusive upper bound of a
/// greedy `default`/`enum` value, so the value neither stops at the first comma
/// nor swallows the next greedy clause.
fn next_greedy_clause(raw: &[&str], from: usize) -> usize {
    raw.iter()
        .skip(from)
        .position(|token| {
            let token = token.trim().to_ascii_lowercase();
            token == "enum" || token.starts_with("enum:") || token.starts_with("default ")
        })
        // `position` is relative to the skipped iterator; rebase onto `raw`.
        .map_or(raw.len(), |offset| from + offset)
}
1745
1746/// Map a lowercase shape keyword to its [`Shape`].
1747fn shape_from_str(s: &str) -> Option<Shape> {
1748 match s {
1749 "string" => Some(Shape::String),
1750 "int" => Some(Shape::Int),
1751 "bool" => Some(Shape::Bool),
1752 "date" => Some(Shape::Date),
1753 "email" => Some(Shape::Email),
1754 "currency" => Some(Shape::Currency),
1755 "url" => Some(Shape::Url),
1756 _ => None,
1757 }
1758}
1759
/// ATX heading level of `line`: the length of its leading `#` run (1–6), or 0
/// when the line is not a heading.
///
/// A line is a heading when it has at most three leading spaces, then a run of
/// one to six `#`, and the run is followed by a space, a tab, or the end of the
/// line.
fn heading_level(line: &str) -> u8 {
    let unindented = line.trim_start_matches(' ');
    if line.len() - unindented.len() > 3 {
        return 0;
    }
    let after_run = unindented.trim_start_matches('#');
    let run = unindented.len() - after_run.len();
    let properly_terminated = after_run.is_empty() || after_run.starts_with([' ', '\t']);
    match run {
        // The arm bounds `run` to 1..=6, so the narrowing cast is lossless.
        1..=6 if properly_terminated => run as u8,
        _ => 0,
    }
}
1780
/// Text of an ATX heading: whatever follows the opening `#` run, trimmed, with
/// a trailing *closing* `#` sequence removed (`## Title ##` → `Title`).
///
/// `level` is the length of the opening `#` run, as reported by
/// [`heading_level`] for the same `line`; it is used to skip past the marker.
///
/// A trailing run of `#` counts as a closing sequence only when it is preceded
/// by a space or tab, or when nothing else remains. A `#` that touches the
/// preceding text is part of the heading and is kept: `## C#` → `C#`,
/// `## issue-123#` → `issue-123#`.
fn heading_text(line: &str, level: u8) -> String {
    let indent = line.len() - line.trim_start_matches(' ').len();
    let content = line[indent + usize::from(level)..].trim();

    let body = content.trim_end_matches('#');
    let had_trailing_hashes = body.len() < content.len();
    // Closing sequence: the `#` run stands alone or is set off by whitespace.
    let is_closing_sequence =
        had_trailing_hashes && (body.is_empty() || body.ends_with([' ', '\t']));

    if is_closing_sequence {
        // Drop the run together with the whitespace that separated it.
        body.trim_end().to_string()
    } else {
        // No trailing `#`, or the `#` abuts the text and is literal content.
        content.to_string()
    }
}
1813
/// Detect a fenced-code-block opener, returning `(fence byte, run length)`.
///
/// The line must have at most three leading spaces followed by a run of three
/// or more backticks or tildes. For a backtick fence, the rest of the line (the
/// info string) must not contain another backtick. A tilde fence has no such
/// restriction. Returns `None` for anything else.
fn opening_fence(line: &str) -> Option<(u8, usize)> {
    let rest = line.trim_start_matches(' ');
    if line.len() - rest.len() > 3 {
        return None;
    }
    let marker = match rest.as_bytes().first() {
        Some(&byte @ (b'`' | b'~')) => byte,
        _ => return None,
    };
    let info = rest.trim_start_matches(char::from(marker));
    let run = rest.len() - info.len();
    if run < 3 || (marker == b'`' && info.contains('`')) {
        return None;
    }
    Some((marker, run))
}
1835
/// Whether `line` closes the fence described by `fence`, given as
/// `(fence byte, opening run length)`.
///
/// All of the following must hold: at most three leading spaces; a run of the
/// same fence character at least as long as the opening run; and nothing but
/// whitespace after that run.
fn is_closing_fence(line: &str, fence: (u8, usize)) -> bool {
    let (marker, opened_with) = fence;
    let rest = line.trim_start_matches(' ');
    let indent = line.len() - rest.len();
    let trailing = rest.trim_start_matches(char::from(marker));
    let run = rest.len() - trailing.len();
    indent <= 3 && run >= opened_with && trailing.trim().is_empty()
}
1851
/// Prose of a section: the text after the first line (the heading), trimmed on
/// both ends. A body with no newline has no prose and yields an empty string.
fn section_prose(section_body: &str) -> String {
    section_body
        .split_once('\n')
        .map(|(_, after_heading)| after_heading.trim().to_string())
        .unwrap_or_default()
}
1859
/// Bullet lines of a section body, in order, with the first line (the heading)
/// skipped.
///
/// Each remaining line is trimmed on both ends and kept only if it then starts
/// with a bullet marker followed by a space: `- `, `* `, or `+ `. The returned
/// strings still include the marker.
fn bullet_lines(section_body: &str) -> Vec<String> {
    const MARKERS: [&str; 3] = ["- ", "* ", "+ "];

    let mut bullets = Vec::new();
    // `skip(1)` drops the heading line.
    for raw_line in section_body.lines().skip(1) {
        let line = raw_line.trim();
        if MARKERS.iter().any(|marker| line.starts_with(*marker)) {
            bullets.push(line.to_string());
        }
    }
    bullets
}
1871
/// Drop a trailing comment from a bullet's content, returning the trimmed text
/// that precedes the earliest comment separator.
///
/// Four space-delimited separators are recognized: em-dash (` — `), double
/// hyphen (` -- `), en-dash (` – `), and single ASCII hyphen (` - `). Whichever
/// occurs first in `content` wins. With no separator present, the whole content
/// is returned trimmed.
///
/// The single-hyphen spelling is included so that an entry such as
/// `records/decisions/q3.md - finalized` yields just the path. The surrounding
/// spaces are required, so hyphens inside a space-free path are never treated
/// as a separator.
fn strip_bullet_comment(content: &str) -> &str {
    let cut = [" — ", " -- ", " – ", " - "]
        .iter()
        .filter_map(|separator| content.find(*separator))
        .min()
        .unwrap_or(content.len());
    content[..cut].trim()
}
1889
/// Content of a bullet line with its marker removed.
///
/// The line is trimmed, the first matching marker (`- `, `* `, or `+ `) is
/// stripped from the front, and the remainder is trimmed again. A line with no
/// recognized marker is returned trimmed but otherwise unchanged.
fn bullet_content(bullet: &str) -> &str {
    let line = bullet.trim();
    ["- ", "* ", "+ "]
        .iter()
        .find_map(|marker| line.strip_prefix(*marker))
        .unwrap_or(line)
        .trim()
}
1899
1900/// Extract a store-relative path from a Frozen-pages bullet. The path may be
1901/// wrapped in backticks and followed by an em-dash comment.
1902fn extract_path_bullet(bullet: &str) -> String {
1903 let content = bullet_content(bullet);
1904 // Prefer a backtick-delimited span if present.
1905 if let Some(start) = content.find('`') {
1906 if let Some(end_rel) = content[start + 1..].find('`') {
1907 return content[start + 1..start + 1 + end_rel].trim().to_string();
1908 }
1909 }
1910 // Otherwise take the text up to a comment separator, stripping quotes.
1911 strip_bullet_comment(content)
1912 .trim_matches('"')
1913 .trim_matches('\'')
1914 .trim()
1915 .to_string()
1916}
1917
1918/// Extract a comma-separated type list from an Ignored-types bullet, stripping
1919/// backticks/quotes and any trailing em-dash comment.
1920fn extract_type_list_bullet(bullet: &str) -> Vec<String> {
1921 let content = strip_bullet_comment(bullet_content(bullet));
1922 content
1923 .split(',')
1924 .map(|t| {
1925 t.trim()
1926 .trim_matches('`')
1927 .trim_matches('"')
1928 .trim_matches('\'')
1929 .trim()
1930 .to_string()
1931 })
1932 .filter(|t| !t.is_empty())
1933 .collect()
1934}
1935
1936#[cfg(test)]
1937mod tests {
1938 use super::*;
1939 use std::path::Path;
1940 use tempfile::tempdir;
1941
1942 // ── Config::frozen_match (the single write-surface policy matcher) ───────
1943
1944 #[test]
1945 fn frozen_match_is_md_insensitive_both_directions() {
1946 // A policy entry stored WITHOUT `.md` (the natural extensionless
1947 // spelling `parse_db_md` keeps verbatim) must still match a `.md`
1948 // write target — the regression every write surface had.
1949 let cfg = Config {
1950 frozen_pages: vec![PathBuf::from("records/decisions/q1")],
1951 ..Config::default()
1952 };
1953 assert_eq!(
1954 cfg.frozen_match(Path::new("records/decisions/q1.md")),
1955 Some(PathBuf::from("records/decisions/q1")),
1956 "extensionless policy entry must freeze the .md file"
1957 );
1958 assert!(cfg.is_frozen(Path::new("records/decisions/q1.md")));
1959
1960 // The symmetric case: a policy entry WITH `.md` matches a bare target.
1961 let cfg = Config {
1962 frozen_pages: vec![PathBuf::from("records/decisions/q1.md")],
1963 ..Config::default()
1964 };
1965 assert_eq!(
1966 cfg.frozen_match(Path::new("records/decisions/q1")),
1967 Some(PathBuf::from("records/decisions/q1.md")),
1968 );
1969 // And the same-spelling cases still match.
1970 assert!(cfg.is_frozen(Path::new("records/decisions/q1.md")));
1971 }
1972
1973 #[test]
1974 fn frozen_match_drops_leading_dot_slash() {
1975 let cfg = Config {
1976 frozen_pages: vec![PathBuf::from("records/decisions/q1.md")],
1977 ..Config::default()
1978 };
1979 assert!(cfg.is_frozen(Path::new("./records/decisions/q1.md")));
1980 assert!(cfg.is_frozen(Path::new("./records/decisions/q1")));
1981 }
1982
1983 #[test]
1984 fn frozen_match_returns_none_for_unlisted_and_prefix_paths() {
1985 let cfg = Config {
1986 frozen_pages: vec![PathBuf::from("records/decisions/q1")],
1987 ..Config::default()
1988 };
1989 assert!(cfg
1990 .frozen_match(Path::new("records/decisions/q2.md"))
1991 .is_none());
1992 // A prefix is not a match: `q1` must not freeze `q1-draft`.
1993 assert!(cfg
1994 .frozen_match(Path::new("records/decisions/q1-draft.md"))
1995 .is_none());
1996 assert!(!cfg.is_frozen(Path::new("records/decisions/q11.md")));
1997 }
1998
1999 // ── split_frontmatter ───────────────────────────────────────────────────
2000
2001 #[test]
2002 fn split_frontmatter_separates_yaml_and_verbatim_body() {
2003 let text = "---\ntype: contact\nsummary: x\n---\n# Heading\n\nBody line.\n";
2004 let p = split_frontmatter(text, Path::new("f.md")).unwrap();
2005 assert_eq!(p.frontmatter_yaml, "type: contact\nsummary: x\n");
2006 // Body is everything after the closing fence's newline, byte-for-byte.
2007 assert_eq!(p.body, "# Heading\n\nBody line.\n");
2008 }
2009
2010 #[test]
2011 fn split_frontmatter_preserves_body_without_trailing_newline() {
2012 let text = "---\ntype: x\n---\nno trailing newline";
2013 let p = split_frontmatter(text, Path::new("f.md")).unwrap();
2014 assert_eq!(p.body, "no trailing newline");
2015 }
2016
2017 #[test]
2018 fn split_frontmatter_empty_body_when_nothing_after_fence() {
2019 let text = "---\ntype: x\n---\n";
2020 let p = split_frontmatter(text, Path::new("f.md")).unwrap();
2021 assert_eq!(p.body, "");
2022 }
2023
2024 #[test]
2025 fn split_frontmatter_missing_opening_fence_errors() {
2026 let text = "# No frontmatter here\ntype: x\n";
2027 let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
2028 assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
2029 }
2030
2031 #[test]
2032 fn split_frontmatter_leading_content_before_fence_rejected() {
2033 // The opening fence must be the very first line; a blank line first is
2034 // not allowed.
2035 let text = "\n---\ntype: x\n---\nbody";
2036 let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
2037 assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
2038 }
2039
2040 #[test]
2041 fn split_frontmatter_unterminated_block_errors() {
2042 let text = "---\ntype: x\nsummary: y\n";
2043 let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
2044 assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
2045 }
2046
2047 // ── Frontmatter::parse ───────────────────────────────────────────────────
2048
2049 #[test]
2050 fn parse_populates_typed_fields_and_routes_unknowns_to_extra() {
2051 let yaml = "type: contact\nid: sarah-chen\nsummary: Director of Ops\nstatus: active\ntags: [vip, renewal]\nemail: sarah@northstar.io\nrole: Director";
2052 let fm = Frontmatter::parse(yaml, Path::new("f.md")).unwrap();
2053 assert_eq!(fm.type_.as_deref(), Some("contact"));
2054 assert_eq!(fm.id.as_deref(), Some("sarah-chen"));
2055 assert_eq!(fm.summary.as_deref(), Some("Director of Ops"));
2056 assert_eq!(fm.status.as_deref(), Some("active"));
2057 assert_eq!(fm.tags, vec!["vip".to_string(), "renewal".to_string()]);
2058 // Type-specific fields are NOT promoted to typed slots.
2059 assert!(fm.type_.is_some() && !fm.extra.contains_key("type"));
2060 assert!(!fm.extra.contains_key("tags"));
2061 assert_eq!(
2062 fm.extra.get("email").and_then(|v| v.as_str()),
2063 Some("sarah@northstar.io")
2064 );
2065 assert_eq!(
2066 fm.extra.get("role").and_then(|v| v.as_str()),
2067 Some("Director")
2068 );
2069 }
2070
2071 #[test]
2072 fn parse_reads_rfc3339_timestamps() {
2073 let yaml =
2074 "type: email\ncreated: 2026-05-27T08:00:00-07:00\nupdated: 2026-05-28T09:30:00-07:00";
2075 let fm = Frontmatter::parse(yaml, Path::new("f.md")).unwrap();
2076 let created = fm.created.expect("created parsed");
2077 // -07:00 offset is 7 * 3600 seconds west.
2078 assert_eq!(created.offset().utc_minus_local(), 7 * 3600);
2079 assert_eq!(created.to_rfc3339(), "2026-05-27T08:00:00-07:00");
2080 assert!(fm.updated.is_some());
2081 }
2082
2083 #[test]
2084 fn parse_rejects_non_rfc3339_timestamp() {
2085 // A date-only value is not a full RFC3339 timestamp; created/updated
2086 // require the full form.
2087 let yaml = "type: email\ncreated: 2026-05-27";
2088 let err = Frontmatter::parse(yaml, Path::new("bad.md")).unwrap_err();
2089 match err {
2090 ParseError::BadTimestamp { key, value, .. } => {
2091 assert_eq!(key, "created");
2092 assert_eq!(value, "2026-05-27");
2093 }
2094 other => panic!("expected BadTimestamp, got {other:?}"),
2095 }
2096 }
2097
2098 #[test]
2099 fn parse_malformed_yaml_errors() {
2100 // Unclosed flow mapping is invalid YAML.
2101 let yaml = "type: contact\n bad: : :\n- nope";
2102 let err = Frontmatter::parse(yaml, Path::new("bad.md")).unwrap_err();
2103 assert!(matches!(err, ParseError::MalformedYaml { .. }));
2104 }
2105
2106 #[test]
2107 fn frontmatter_with_yaml_tag_on_mapping_does_not_panic() {
2108 // Regression: a YAML tag on the top-level mapping made the old
2109 // `expect_err` path PANIC, because a tagged mapping deserializes to a
2110 // `Mapping` just fine. It must now be handled — accepted as the inner
2111 // mapping, never a panic.
2112 let fm = Frontmatter::parse("!mytag\ntype: contact\nsummary: hi\n", Path::new("x.md"))
2113 .expect("tagged-mapping frontmatter must parse, not panic");
2114 assert_eq!(fm.type_.as_deref(), Some("contact"));
2115 // A genuine scalar/sequence top level is still malformed (and still
2116 // doesn't panic).
2117 assert!(Frontmatter::parse("- a\n- b\n", Path::new("x.md")).is_err());
2118 }
2119
2120 #[test]
2121 fn parse_empty_block_is_empty_frontmatter() {
2122 let fm = Frontmatter::parse("", Path::new("f.md")).unwrap();
2123 assert_eq!(fm, Frontmatter::default());
2124 }
2125
2126 #[test]
2127 fn parse_scalar_top_level_is_malformed() {
2128 // A bare scalar at the top level is not a frontmatter mapping.
2129 let err = Frontmatter::parse("just a string", Path::new("f.md")).unwrap_err();
2130 assert!(matches!(err, ParseError::MalformedYaml { .. }));
2131 }
2132
2133 // ── to_yaml canonical order ──────────────────────────────────────────────
2134
2135 #[test]
2136 fn to_yaml_emits_canonical_key_order() {
2137 let mut fm = Frontmatter {
2138 type_: Some("contact".into()),
2139 id: Some("sarah-chen".into()),
2140 summary: Some("Director of Ops".into()),
2141 status: Some("active".into()),
2142 tags: vec!["vip".into()],
2143 created: Some(DateTime::parse_from_rfc3339("2026-05-27T08:00:00-07:00").unwrap()),
2144 updated: Some(DateTime::parse_from_rfc3339("2026-05-28T09:30:00-07:00").unwrap()),
2145 ..Default::default()
2146 };
2147 // Two type-specific fields, inserted in NON-alphabetical order to prove
2148 // the writer sorts them (BTreeMap) between the universal head and tail.
2149 fm.extra
2150 .insert("role".into(), Value::String("Director".into()));
2151 fm.extra.insert(
2152 "company".into(),
2153 Value::String("[[records/companies/northstar]]".into()),
2154 );
2155
2156 let yaml = fm.to_yaml();
2157 let keys: Vec<&str> = yaml
2158 .lines()
2159 .filter(|l| !l.starts_with(['-', ' ']) && l.contains(':'))
2160 .map(|l| l.split(':').next().unwrap())
2161 .collect();
2162 assert_eq!(
2163 keys,
2164 vec![
2165 "type", "id", "created", "updated", "summary", // universal head
2166 "company", "role", // type-specific, sorted
2167 "status", // universal tail
2168 "tags",
2169 ],
2170 "canonical order violated; got:\n{yaml}"
2171 );
2172 // Timestamps round-trip as RFC3339 strings (YAML may quote them).
2173 assert!(
2174 yaml.contains("2026-05-27T08:00:00-07:00"),
2175 "created timestamp missing; got:\n{yaml}"
2176 );
2177 // The value re-parses to the same instant regardless of quoting.
2178 let reparsed = Frontmatter::parse(&yaml, Path::new("rt.md")).unwrap();
2179 assert_eq!(reparsed.created, fm.created);
2180 assert_eq!(reparsed.updated, fm.updated);
2181 }
2182
2183 #[test]
2184 fn to_yaml_omits_absent_optional_fields() {
2185 let fm = Frontmatter {
2186 type_: Some("note".into()),
2187 ..Default::default()
2188 };
2189 let yaml = fm.to_yaml();
2190 assert!(yaml.contains("type: note"));
2191 assert!(!yaml.contains("status"));
2192 assert!(!yaml.contains("tags"));
2193 assert!(!yaml.contains("summary"));
2194 }
2195
2196 // ── Regression: non-string scalar universal fields round-trip (finding #1) ─
2197
2198 #[test]
2199 fn regression_parse_preserves_non_string_scalar_universal_fields() {
2200 // A hand/externally-authored file whose universal fields are bare
2201 // scalars YAML reads as Number/Bool — `id: 100`, `summary: 2026`,
2202 // `status: 0`, `type: 42` — must be PRESERVED as their string form, not
2203 // read as None. Before the fix, `v.as_str()` returned None for these and
2204 // the matched arm discarded the value entirely (never reaching `extra`).
2205 let yaml = "type: 42\nid: 100\nsummary: 2026\nstatus: 0";
2206 let fm = Frontmatter::parse(yaml, Path::new("x.md")).unwrap();
2207 assert_eq!(fm.type_.as_deref(), Some("42"), "type scalar dropped");
2208 assert_eq!(fm.id.as_deref(), Some("100"), "id scalar dropped");
2209 assert_eq!(
2210 fm.summary.as_deref(),
2211 Some("2026"),
2212 "summary scalar dropped"
2213 );
2214 assert_eq!(fm.status.as_deref(), Some("0"), "status scalar dropped");
2215 // The values must surface through the public `get` accessor too.
2216 assert_eq!(
2217 fm.get("summary")
2218 .and_then(|v| v.as_str().map(str::to_string)),
2219 Some("2026".to_string())
2220 );
2221 }
2222
2223 #[test]
2224 fn regression_format_round_trip_does_not_delete_numeric_frontmatter() {
2225 // The exact finding-#1 trigger: `dbmd format` is read_file -> write_file.
2226 // A file whose `id`/`summary`/`status` are bare numeric scalars must
2227 // still carry those fields after the canonical re-emit. Before the fix,
2228 // the lines were silently deleted from disk (only `type` survived).
2229 let dir = tempdir().unwrap();
2230 let path = dir.path().join("x.md");
2231 let original = "---\ntype: contact\nid: 100\nsummary: 2026\nstatus: 0\n---\nbody\n";
2232 std::fs::write(&path, original).unwrap();
2233
2234 // Re-emit through the canonical writer, exactly as `dbmd format` does.
2235 let (fm, body) = read_file(&path).unwrap();
2236 write_file(&path, &fm, &body).unwrap();
2237
2238 let after = std::fs::read_to_string(&path).unwrap();
2239 // None of the four fields may vanish; they survive as string scalars.
2240 let reparsed = Frontmatter::parse(
2241 &split_frontmatter(&after, &path).unwrap().frontmatter_yaml,
2242 &path,
2243 )
2244 .unwrap();
2245 assert_eq!(reparsed.type_.as_deref(), Some("contact"));
2246 assert_eq!(reparsed.id.as_deref(), Some("100"), "id deleted by format");
2247 assert_eq!(
2248 reparsed.summary.as_deref(),
2249 Some("2026"),
2250 "summary deleted by format"
2251 );
2252 assert_eq!(
2253 reparsed.status.as_deref(),
2254 Some("0"),
2255 "status deleted by format"
2256 );
2257 // The body is preserved verbatim.
2258 assert_eq!(body, "body\n");
2259 }
2260
2261 // ── Regression: BOM-prefixed files parse like store/index (finding #19) ────
2262
2263 #[test]
2264 fn regression_split_frontmatter_tolerates_leading_utf8_bom() {
2265 // A BOM-prefixed file (EF BB BF + `---\n...`) is walked and indexed by
2266 // `dbmd index` (store/index strip the BOM) but, before the fix, every
2267 // write/edit surface routed through `read_file` hard-failed with
2268 // MissingFrontmatter. `split_frontmatter` must now strip a single leading
2269 // U+FEFF and emit a BOM-free body.
2270 let text = "\u{feff}---\ntype: note\nsummary: x\n---\nbody\n";
2271 let parsed = split_frontmatter(text, Path::new("note.md")).unwrap();
2272 assert_eq!(parsed.frontmatter_yaml, "type: note\nsummary: x\n");
2273 // Body never carries the BOM forward into the canonical writer.
2274 assert_eq!(parsed.body, "body\n");
2275 assert!(!parsed.body.starts_with('\u{feff}'));
2276 }
2277
2278 #[test]
2279 fn regression_read_file_parses_bom_prefixed_file() {
2280 // End-to-end through the same `read_file` path `dbmd fm get/set`,
2281 // `format`, `link`, and `write` use. Before the fix this returned
2282 // Err(MissingFrontmatter) on a file the catalog had already indexed.
2283 let dir = tempdir().unwrap();
2284 let path = dir.path().join("note.md");
2285 std::fs::write(&path, "\u{feff}---\ntype: note\nsummary: x\n---\nbody\n").unwrap();
2286
2287 let (fm, body) = read_file(&path).expect("BOM-prefixed file must parse");
2288 assert_eq!(fm.type_.as_deref(), Some("note"));
2289 assert_eq!(fm.summary.as_deref(), Some("x"));
2290 assert_eq!(body, "body\n");
2291 }
2292
2293 #[test]
2294 fn to_yaml_preserves_unquoted_scalar_wiki_link_round_trip() {
2295 // Regression (PRIMARY): the SPEC-canonical scalar wiki-link is the
2296 // *unquoted* inline `company: [[records/companies/northstar]]`
2297 // (SPEC § Linking, the worked `contact` example). YAML parses it to the
2298 // nested `Seq[Seq[String]]` shape and `parse` stores that verbatim in
2299 // `extra`. Before the fix, `to_yaml` re-emitted it block-style as
2300 // company:
2301 // - - records/companies/northstar
2302 // — the `[[ ]]` brackets GONE — so a no-op re-emit (`dbmd format`, and
2303 // any `fm set` / `link` write) silently destroyed the link.
2304 let yaml = "type: contact\ncompany: [[records/companies/northstar]]";
2305 let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
2306 // Sanity: it really parsed as the nested sequence, not a string.
2307 assert!(fm.extra.get("company").and_then(|v| v.as_str()).is_none());
2308
2309 let out = fm.to_yaml();
2310 // The link must survive as a quoted inline scalar — brackets intact, and
2311 // never the bracket-less block sequence `- - records/...`.
2312 assert!(
2313 out.contains("[[records/companies/northstar]]"),
2314 "canonical writer dropped the wiki-link brackets; got:\n{out}"
2315 );
2316 assert!(
2317 !out.contains("- - "),
2318 "canonical writer emitted a nested block sequence (link corrupted); got:\n{out}"
2319 );
2320
2321 // And it round-trips: re-parsing the emitted YAML still surfaces exactly
2322 // one link with the right target (the edge graph/backlinks rely on).
2323 let reparsed = Frontmatter::parse(&out, Path::new("c.md")).unwrap();
2324 let fields = reparsed.link_fields();
2325 let links: Vec<(&str, &str, Option<&str>)> = fields
2326 .iter()
2327 .map(|(k, l)| (k.as_str(), l.target.as_str(), l.display.as_deref()))
2328 .collect();
2329 assert_eq!(
2330 links,
2331 vec![("company", "records/companies/northstar", None)]
2332 );
2333
2334 // A second re-emit is a fixed point — no progressive corruption across
2335 // repeated curator-loop writes.
2336 assert_eq!(
2337 reparsed.to_yaml(),
2338 out,
2339 "to_yaml is not idempotent on links"
2340 );
2341 }
2342
2343 #[test]
2344 fn to_yaml_preserves_unquoted_scalar_link_with_display() {
2345 // The `|display` segment must survive the unquoted-inline round-trip too.
2346 let yaml = "type: contact\ncompany: [[records/companies/northstar|Northstar]]";
2347 let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
2348 let out = fm.to_yaml();
2349 assert!(
2350 out.contains("[[records/companies/northstar|Northstar]]"),
2351 "display segment lost on round-trip; got:\n{out}"
2352 );
2353 let reparsed = Frontmatter::parse(&out, Path::new("c.md")).unwrap();
2354 let f = reparsed.link_fields();
2355 assert_eq!(f.len(), 1);
2356 assert_eq!(f[0].1.target, "records/companies/northstar");
2357 assert_eq!(f[0].1.display.as_deref(), Some("Northstar"));
2358 }
2359
2360 #[test]
2361 fn to_yaml_does_not_mangle_link_list_or_plain_nested_sequence() {
2362 // A genuine quoted block list of links round-trips as a clean string
2363 // list — never collapsed to a scalar — and a plain nested sequence that
2364 // is NOT a wiki-link is left exactly as written (no false conversion).
2365 let yaml = "type: meeting\nattendees:\n - \"[[records/contacts/elena]]\"\n - \"[[records/contacts/sarah]]\"\nmatrix:\n - - 1\n - 2";
2366 let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
2367 let out = fm.to_yaml();
2368
2369 // Both attendee links survive as quoted strings.
2370 assert!(out.contains("[[records/contacts/elena]]"), "got:\n{out}");
2371 assert!(out.contains("[[records/contacts/sarah]]"), "got:\n{out}");
2372
2373 let reparsed = Frontmatter::parse(&out, Path::new("m.md")).unwrap();
2374 let fields = reparsed.link_fields();
2375 let attendees: Vec<&str> = fields
2376 .iter()
2377 .filter(|(k, _)| k == "attendees")
2378 .map(|(_, l)| l.target.as_str())
2379 .collect();
2380 assert_eq!(
2381 attendees,
2382 vec!["records/contacts/elena", "records/contacts/sarah"]
2383 );
2384 // The non-link nested sequence is preserved verbatim, not touched.
2385 assert_eq!(reparsed.extra.get("matrix"), fm.extra.get("matrix"));
2386 }
2387
2388 // ── read_file / write_file round-trip ────────────────────────────────────
2389
2390 #[test]
2391 fn write_then_read_roundtrips_and_preserves_body_verbatim() {
2392 let dir = tempdir().unwrap();
2393 let path = dir.path().join("sources/emails/x.md");
2394 let body = "# Subject\n\nHello,\n\nSee [[records/contacts/sarah-chen]].\n";
2395 let mut fm = Frontmatter {
2396 type_: Some("email".into()),
2397 summary: Some("renewal note".into()),
2398 created: Some(DateTime::parse_from_rfc3339("2026-05-27T08:00:00-07:00").unwrap()),
2399 ..Default::default()
2400 };
2401 fm.extra
2402 .insert("from".into(), Value::String("elena@northstar.io".into()));
2403
2404 write_file(&path, &fm, body).unwrap();
2405
2406 let (read_fm, read_body) = read_file(&path).unwrap();
2407 assert_eq!(read_body, body, "body must be preserved byte-for-byte");
2408 assert_eq!(read_fm.type_.as_deref(), Some("email"));
2409 assert_eq!(read_fm.summary.as_deref(), Some("renewal note"));
2410 assert_eq!(
2411 read_fm.extra.get("from").and_then(|v| v.as_str()),
2412 Some("elena@northstar.io")
2413 );
2414 // The on-disk file starts with a fence and ends with the verbatim body.
2415 let raw = std::fs::read_to_string(&path).unwrap();
2416 assert!(raw.starts_with("---\n"));
2417 assert!(raw.ends_with(body));
2418 }
2419
2420 #[test]
2421 fn roundtrip_modify_summary_then_write_changes_only_summary() {
2422 let dir = tempdir().unwrap();
2423 let path = dir.path().join("records/contacts/sarah.md");
2424 let body = "Long-form operator notes about Sarah.\n";
2425 let fm = Frontmatter {
2426 type_: Some("contact".into()),
2427 summary: Some("old summary".into()),
2428 ..Default::default()
2429 };
2430 write_file(&path, &fm, body).unwrap();
2431
2432 // Read → modify summary → write back.
2433 let (mut fm2, body2) = read_file(&path).unwrap();
2434 fm2.summary = Some("new summary".into());
2435 write_file(&path, &fm2, &body2).unwrap();
2436
2437 let (fm3, body3) = read_file(&path).unwrap();
2438 assert_eq!(fm3.summary.as_deref(), Some("new summary"));
2439 assert_eq!(fm3.type_.as_deref(), Some("contact"));
2440 assert_eq!(body3, body, "body unchanged across the round-trip");
2441 }
2442
2443 #[test]
2444 fn roundtrip_preserves_handwritten_unquoted_scalar_wiki_link_on_disk() {
2445 // End-to-end analog of `dbmd format` on the verbatim SPEC worked example:
2446 // a hand-written file carrying the canonical UNQUOTED scalar link
2447 // `company: [[records/companies/northstar]]`, read from disk then written
2448 // back unchanged. Before the fix this no-op re-emit rewrote the on-disk
2449 // value to the bracket-less block sequence `company:\n- - records/...`,
2450 // and every reader (validate/graph/backlinks) then lost the edge.
2451 let dir = tempdir().unwrap();
2452 let path = dir.path().join("records/contacts/sarah-chen.md");
2453 let file = "---\ntype: contact\nid: sarah-chen\nsummary: Director of Ops\ncompany: [[records/companies/northstar]]\n---\n# Sarah Chen\n\nNotes.\n";
2454 std::fs::create_dir_all(path.parent().unwrap()).unwrap();
2455 std::fs::write(&path, file).unwrap();
2456
2457 // Read → write back unchanged (the canonical no-op re-emit).
2458 let (fm, body) = read_file(&path).unwrap();
2459 write_file(&path, &fm, &body).unwrap();
2460
2461 // On-disk bytes still carry the bracketed link, never `- - records/...`.
2462 let raw = std::fs::read_to_string(&path).unwrap();
2463 assert!(
2464 raw.contains("[[records/companies/northstar]]"),
2465 "on-disk wiki-link brackets were destroyed; got:\n{raw}"
2466 );
2467 assert!(
2468 !raw.contains("- - "),
2469 "on-disk value became a nested block sequence; got:\n{raw}"
2470 );
2471
2472 // And the edge is still readable after the round-trip.
2473 let (fm2, _) = read_file(&path).unwrap();
2474 let fields = fm2.link_fields();
2475 let links: Vec<(&str, &str)> = fields
2476 .iter()
2477 .map(|(k, l)| (k.as_str(), l.target.as_str()))
2478 .collect();
2479 assert_eq!(links, vec![("company", "records/companies/northstar")]);
2480 }
2481
2482 #[test]
2483 fn write_file_does_not_leave_temp_files_behind() {
2484 let dir = tempdir().unwrap();
2485 let path = dir.path().join("records/x.md");
2486 let fm = Frontmatter {
2487 type_: Some("note".into()),
2488 ..Default::default()
2489 };
2490 write_file(&path, &fm, "body\n").unwrap();
2491 // The directory should contain only the target file, no `.x.md.tmp.*`.
2492 let entries: Vec<String> = std::fs::read_dir(path.parent().unwrap())
2493 .unwrap()
2494 .map(|e| e.unwrap().file_name().to_string_lossy().into_owned())
2495 .collect();
2496 assert_eq!(entries, vec!["x.md".to_string()]);
2497 }
2498
2499 // ── is_content_file ──────────────────────────────────────────────────────
2500
2501 #[test]
2502 fn is_content_file_recognizes_layers_and_excludes_meta() {
2503 assert!(Frontmatter::is_content_file(Path::new(
2504 "sources/emails/2026-05-22.md"
2505 )));
2506 assert!(Frontmatter::is_content_file(Path::new(
2507 "records/contacts/sarah-chen.md"
2508 )));
2509 assert!(Frontmatter::is_content_file(Path::new(
2510 "wiki/people/sarah-chen.md"
2511 )));
2512 // Absolute paths under a layer are still content.
2513 assert!(Frontmatter::is_content_file(Path::new(
2514 "/home/db/records/companies/northstar.md"
2515 )));
2516 // index.md at any level is meta.
2517 assert!(!Frontmatter::is_content_file(Path::new(
2518 "records/contacts/index.md"
2519 )));
2520 assert!(!Frontmatter::is_content_file(Path::new("index.md")));
2521 // Root meta files.
2522 assert!(!Frontmatter::is_content_file(Path::new("DB.md")));
2523 assert!(!Frontmatter::is_content_file(Path::new("log.md")));
2524 }
2525
2526 // ── effective_id ─────────────────────────────────────────────────────────
2527
2528 #[test]
2529 fn effective_id_prefers_explicit_then_derives_from_path() {
2530 let with_id = Frontmatter {
2531 id: Some("explicit-id".into()),
2532 ..Default::default()
2533 };
2534 assert_eq!(
2535 with_id.effective_id(Path::new("wiki/people/sarah-chen.md")),
2536 "explicit-id"
2537 );
2538 let no_id = Frontmatter::default();
2539 assert_eq!(
2540 no_id.effective_id(Path::new("wiki/people/sarah-chen.md")),
2541 "sarah-chen"
2542 );
2543 }
2544
2545 // ── get / set ────────────────────────────────────────────────────────────
2546
2547 #[test]
2548 fn set_routes_universal_and_custom_keys() {
2549 let mut fm = Frontmatter::default();
2550 fm.set("type", "contact").unwrap();
2551 fm.set("summary", "hi").unwrap();
2552 fm.set("company", "[[records/companies/northstar]]")
2553 .unwrap();
2554 assert_eq!(fm.type_.as_deref(), Some("contact"));
2555 assert_eq!(fm.summary.as_deref(), Some("hi"));
2556 // Custom key landed in extra, not a typed slot.
2557 assert_eq!(
2558 fm.extra.get("company").and_then(|v| v.as_str()),
2559 Some("[[records/companies/northstar]]")
2560 );
2561 // get reads from both typed fields and extra.
2562 assert_eq!(
2563 fm.get("type").and_then(|v| v.as_str().map(String::from)),
2564 Some("contact".into())
2565 );
2566 assert_eq!(
2567 fm.get("company").and_then(|v| v.as_str().map(String::from)),
2568 Some("[[records/companies/northstar]]".into())
2569 );
2570 assert!(fm.get("nonexistent").is_none());
2571 }
2572
2573 #[test]
2574 fn set_timestamp_validates_rfc3339() {
2575 let mut fm = Frontmatter::default();
2576 fm.set("created", "2026-05-27T08:00:00-07:00").unwrap();
2577 assert!(fm.created.is_some());
2578 let err = fm.set("updated", "not-a-date").unwrap_err();
2579 assert!(matches!(err, ParseError::BadTimestamp { .. }));
2580 }
2581
2582 // ── extract_wiki_links ───────────────────────────────────────────────────
2583
2584 #[test]
2585 fn extract_wiki_links_flags_full_path_short_form_and_extension() {
2586 let body = "See [[records/contacts/sarah-chen]] and [[sarah-chen]].\nAlso [[wiki/people/sarah-chen.md|Sarah]].\n";
2587 let links = extract_wiki_links(body, Path::new("doc.md"));
2588 assert_eq!(links.len(), 3);
2589
2590 // Full path, no extension, no display.
2591 assert_eq!(links[0].target, "records/contacts/sarah-chen");
2592 assert!(links[0].is_full_path);
2593 assert!(!links[0].has_md_extension);
2594 assert_eq!(links[0].display, None);
2595 assert_eq!(links[0].location.1, 1, "first link on line 1");
2596
2597 // Short form: not a full path.
2598 assert_eq!(links[1].target, "sarah-chen");
2599 assert!(!links[1].is_full_path, "bare target is short-form");
2600
2601 // Full path WITH .md extension and a display override on line 2.
2602 assert_eq!(links[2].target, "wiki/people/sarah-chen.md");
2603 assert!(links[2].is_full_path);
2604 assert!(links[2].has_md_extension);
2605 assert_eq!(links[2].display.as_deref(), Some("Sarah"));
2606 assert_eq!(links[2].location.1, 2);
2607 }
2608
2609 #[test]
2610 fn extract_wiki_links_reports_1_based_column_counting_chars() {
2611 // A multi-byte prefix (é is 2 bytes) must not skew the char column.
2612 let body = "café [[records/x/y]]";
2613 let links = extract_wiki_links(body, Path::new("d.md"));
2614 assert_eq!(links.len(), 1);
2615 // "café " is 5 chars, so the `[[` starts at char column 6 (1-based).
2616 assert_eq!(links[0].location.2, 6);
2617 }
2618
2619 #[test]
2620 fn extract_wiki_links_ignores_a_lone_path_without_brackets() {
2621 let links = extract_wiki_links(
2622 "records/contacts/sarah-chen is not a link",
2623 Path::new("d.md"),
2624 );
2625 assert!(links.is_empty());
2626 }
2627
2628 // ── extract_markdown_links ───────────────────────────────────────────────
2629
2630 #[test]
2631 fn extract_markdown_links_captures_external_and_not_wiki_links() {
2632 let body =
2633 "See [the thread](https://x.com/a) and [[records/contacts/sarah-chen]] internally.\n";
2634 let md = extract_markdown_links(body, Path::new("d.md"));
2635 assert_eq!(
2636 md.len(),
2637 1,
2638 "wiki-link must not be captured as a markdown link"
2639 );
2640 assert_eq!(md[0].text, "the thread");
2641 assert_eq!(md[0].url, "https://x.com/a");
2642 assert_eq!(md[0].location.1, 1);
2643
2644 // And the wiki-link extractor must not pick up the markdown link.
2645 let wl = extract_wiki_links(body, Path::new("d.md"));
2646 assert_eq!(wl.len(), 1);
2647 assert_eq!(wl[0].target, "records/contacts/sarah-chen");
2648 }
2649
2650 // ── link_fields ──────────────────────────────────────────────────────────
2651
2652 #[test]
2653 fn link_fields_extracts_scalar_list_and_summary_links() {
2654 // The canonical list form quotes each item so YAML parses it as clean
2655 // strings; a scalar field may be quoted OR written in the canonical
2656 // unquoted inline form `company: [[x]]` (SPEC § Linking).
2657 let yaml = "type: meeting\nsummary: with [[records/contacts/elena]]\ncompany: \"[[records/companies/northstar]]\"\nattendees:\n - \"[[records/contacts/elena]]\"\n - \"[[records/contacts/sarah]]\"\nnotes: just plain text";
2658 let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
2659 // Sanity: company really did parse as a scalar string here.
2660 assert!(fm.extra.get("company").and_then(|v| v.as_str()).is_some());
2661 let fields = fm.link_fields();
2662
2663 // company (scalar) once, with the right target.
2664 let company: Vec<&str> = fields
2665 .iter()
2666 .filter(|(k, _)| k == "company")
2667 .map(|(_, l)| l.target.as_str())
2668 .collect();
2669 assert_eq!(company, vec!["records/companies/northstar"]);
2670 // attendees (block list) twice.
2671 let attendees: Vec<&str> = fields
2672 .iter()
2673 .filter(|(k, _)| k == "attendees")
2674 .map(|(_, l)| l.target.as_str())
2675 .collect();
2676 assert_eq!(
2677 attendees,
2678 vec!["records/contacts/elena", "records/contacts/sarah"]
2679 );
2680 // summary link surfaced.
2681 assert_eq!(fields.iter().filter(|(k, _)| k == "summary").count(), 1);
2682 // Plain-text field is not a link.
2683 assert_eq!(fields.iter().filter(|(k, _)| k == "notes").count(), 0);
2684 }
2685
2686 #[test]
2687 fn link_fields_surfaces_canonical_unquoted_scalar_link() {
2688 // Regression: the canonical scalar wiki-link form is the *unquoted*
2689 // inline `company: [[records/companies/northstar]]` (SPEC § Linking).
2690 // YAML parses `[[x]]` as a flow-list-in-a-list (`Seq[Seq[String]]`), so
2691 // a naive `as_str()`-only walk drops it. link_fields() must still
2692 // surface exactly one link with the correct target.
2693 let yaml = "type: meeting\ncompany: [[records/companies/northstar]]";
2694 let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
2695 // Sanity: it really did parse as the nested sequence form, NOT a string.
2696 assert!(fm.extra.get("company").and_then(|v| v.as_str()).is_none());
2697
2698 let fields = fm.link_fields();
2699 let links: Vec<(&str, &str, Option<&str>)> = fields
2700 .iter()
2701 .map(|(k, l)| (k.as_str(), l.target.as_str(), l.display.as_deref()))
2702 .collect();
2703 assert_eq!(
2704 links,
2705 vec![("company", "records/companies/northstar", None)]
2706 );
2707
2708 // The `|display` segment survives the unquoted inline form too.
2709 let fm2 = Frontmatter::parse(
2710 "type: meeting\ncompany: [[records/companies/northstar|Northstar]]",
2711 Path::new("m.md"),
2712 )
2713 .unwrap();
2714 let f2 = fm2.link_fields();
2715 assert_eq!(f2.len(), 1);
2716 assert_eq!(f2[0].0, "company");
2717 assert_eq!(f2[0].1.target, "records/companies/northstar");
2718 assert_eq!(f2[0].1.display.as_deref(), Some("Northstar"));
2719 }
2720
2721 #[test]
2722 fn link_fields_ignores_plain_one_item_flow_list() {
2723 // A plain one-item flow list `aliases: [foo]` parses to `Seq[String]`
2724 // — one nesting level shallower than an unquoted `[[foo]]` — and must
2725 // NOT be mistaken for a wiki-link.
2726 let yaml = "type: contact\naliases: [foo]";
2727 let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
2728 assert_eq!(fm.link_fields(), Vec::new());
2729 }
2730
2731 // ── detect_flow_form_link_lists ──────────────────────────────────────────
2732
2733 #[test]
2734 fn detect_flow_form_flags_list_misencodings_not_scalars() {
2735 // The flow-form list mis-encoding (triple-nested) IS flagged; a scalar
2736 // inline wiki-link (double-nested) is NOT.
2737 let bad = "attendees: [[[records/x]], [[records/y]]]\nscalar_inline: [[records/z]]";
2738 let flagged = detect_flow_form_link_lists(bad);
2739 assert_eq!(flagged, vec!["attendees".to_string()]);
2740
2741 // An UNquoted block list is also a mis-encoding (parses triple-nested).
2742 let unquoted_block = "attendees:\n - [[records/x]]\n - [[records/y]]";
2743 assert_eq!(
2744 detect_flow_form_link_lists(unquoted_block),
2745 vec!["attendees".to_string()]
2746 );
2747
2748 // The canonical QUOTED block form parses to clean strings — NOT flagged.
2749 let good = "attendees:\n - \"[[records/x]]\"\n - \"[[records/y]]\"";
2750 assert!(detect_flow_form_link_lists(good).is_empty());
2751
2752 // A plain scalar list of strings is not flagged.
2753 let plain = "tags: [a, b, c]";
2754 assert!(detect_flow_form_link_lists(plain).is_empty());
2755 }
2756
2757 // ── extract_sections ─────────────────────────────────────────────────────
2758
2759 #[test]
2760 fn extract_sections_levels_nesting_and_boundaries() {
2761 let body = "intro text\n## First\nalpha\n### Sub\nbeta\n## Second\ngamma\n";
2762 let secs = extract_sections(body);
2763 let headings: Vec<(&str, u8)> =
2764 secs.iter().map(|s| (s.heading.as_str(), s.level)).collect();
2765 assert_eq!(headings, vec![("First", 2), ("Sub", 3), ("Second", 2)]);
2766
2767 // "First" (H2) body extends through its H3 child, stopping at "Second".
2768 let first = &secs[0];
2769 assert!(first.body.contains("alpha"));
2770 assert!(first.body.contains("### Sub"));
2771 assert!(first.body.contains("beta"));
2772 assert!(!first.body.contains("Second"));
2773
2774 // "Sub" (H3) stops at the next equal-or-shallower heading ("Second").
2775 let sub = &secs[1];
2776 assert!(sub.body.contains("beta"));
2777 assert!(!sub.body.contains("gamma"));
2778
2779 // 1-based line numbers within the body.
2780 assert_eq!(first.line, 2);
2781 assert_eq!(secs[2].line, 6);
2782 }
2783
2784 #[test]
2785 fn extract_sections_ignores_headings_in_fenced_code() {
2786 let body = "## Real\n```\n## Fake heading in code\n```\nafter\n";
2787 let secs = extract_sections(body);
2788 assert_eq!(secs.len(), 1);
2789 assert_eq!(secs[0].heading, "Real");
2790 // The fenced "## Fake" is part of Real's body, not its own section.
2791 assert!(secs[0].body.contains("## Fake heading in code"));
2792 }
2793
2794 // ── parse_field_spec ─────────────────────────────────────────────────────
2795
2796 #[test]
2797 fn parse_field_spec_required_and_shape() {
2798 let f = parse_field_spec("- email (required, email)");
2799 assert_eq!(f.name, "email");
2800 assert!(f.required);
2801 assert_eq!(f.shape, Some(Shape::Email));
2802 assert!(f.unknown_modifiers.is_empty());
2803 }
2804
2805 #[test]
2806 fn parse_field_spec_link_prefix_strips_trailing_slash() {
2807 let f = parse_field_spec("- company (required, link to records/companies/)");
2808 assert!(f.required);
2809 assert_eq!(f.link_prefix, Some(PathBuf::from("records/companies")));
2810 assert_eq!(f.shape, None);
2811 }
2812
2813 #[test]
2814 fn parse_field_spec_default_preserves_case_and_value() {
2815 let f = parse_field_spec("- currency (default USD)");
2816 assert_eq!(f.name, "currency");
2817 assert_eq!(f.default, Some(Value::String("USD".into())));
2818 }
2819
2820 #[test]
2821 fn parse_field_spec_enum_captures_comma_list_as_last_modifier() {
2822 let f = parse_field_spec("- status (required, enum: open, closed, pending)");
2823 assert!(f.required);
2824 assert_eq!(
2825 f.enum_values,
2826 Some(vec![
2827 "open".to_string(),
2828 "closed".to_string(),
2829 "pending".to_string()
2830 ])
2831 );
2832 }
2833
2834 #[test]
2835 fn parse_field_spec_bare_enum_keyword_is_not_itself_a_value() {
2836 // `enum` with no colon: the values are the remaining tokens; the keyword
2837 // itself must NOT leak in as an allowed value.
2838 let f = parse_field_spec("- status (required, enum, open, closed)");
2839 assert!(f.required);
2840 assert_eq!(
2841 f.enum_values,
2842 Some(vec!["open".to_string(), "closed".to_string()])
2843 );
2844 }
2845
2846 #[test]
2847 fn parse_field_spec_unknown_modifier_is_captured_not_errored() {
2848 let f = parse_field_spec("- weird (required, frobnicate, string)");
2849 assert!(f.required);
2850 assert_eq!(f.shape, Some(Shape::String));
2851 assert_eq!(f.unknown_modifiers, vec!["frobnicate".to_string()]);
2852 }
2853
2854 #[test]
2855 fn parse_field_spec_no_parens_is_freeform_optional() {
2856 let f = parse_field_spec("- nickname");
2857 assert_eq!(f.name, "nickname");
2858 assert!(!f.required);
2859 assert_eq!(f.shape, None);
2860 assert!(f.link_prefix.is_none());
2861 assert!(f.enum_values.is_none());
2862 assert!(f.unknown_modifiers.is_empty());
2863 }
2864
2865 // ── parse_schema_bullet (directives) ─────────────────────────────────────
2866
2867 #[test]
2868 fn schema_bullet_unique_single_field() {
2869 match parse_schema_bullet("- unique: email") {
2870 SchemaBullet::Unique(fields) => assert_eq!(fields, vec!["email".to_string()]),
2871 other => panic!("expected Unique, got {other:?}"),
2872 }
2873 }
2874
2875 #[test]
2876 fn schema_bullet_unique_compound_trims_and_splits() {
2877 match parse_schema_bullet("- unique: date, amount , vendor") {
2878 SchemaBullet::Unique(fields) => assert_eq!(
2879 fields,
2880 vec![
2881 "date".to_string(),
2882 "amount".to_string(),
2883 "vendor".to_string()
2884 ]
2885 ),
2886 other => panic!("expected Unique, got {other:?}"),
2887 }
2888 }
2889
2890 #[test]
2891 fn schema_bullet_summary_template_keeps_braces_and_inner_colons() {
2892 match parse_schema_bullet("- summary_template: {role} at {company} (x: y)") {
2893 SchemaBullet::SummaryTemplate(t) => assert_eq!(t, "{role} at {company} (x: y)"),
2894 other => panic!("expected SummaryTemplate, got {other:?}"),
2895 }
2896 }
2897
2898 #[test]
2899 fn schema_bullet_field_with_enum_modifier_is_not_a_directive() {
2900 // A field whose modifiers contain a colon (`enum:`) parses as a field, not
2901 // a directive — its head has a `(` before any `:`.
2902 match parse_schema_bullet("- status (enum: open, closed)") {
2903 SchemaBullet::Field(f) => {
2904 assert_eq!(f.name, "status");
2905 assert_eq!(
2906 f.enum_values,
2907 Some(vec!["open".to_string(), "closed".to_string()])
2908 );
2909 }
2910 other => panic!("expected Field, got {other:?}"),
2911 }
2912 }
2913
2914 #[test]
2915 fn parse_db_md_schema_captures_unique_and_summary_template() {
2916 let db = "---\ntype: db-md\nscope: x\nowner: y\n---\n\n## Schemas\n\n### contact\n- email (required, email)\n- unique: email\n- summary_template: {role} at {company}\n";
2917 let config = parse_db_md(db, Path::new("DB.md")).unwrap();
2918 let s = config.schemas.get("contact").expect("contact schema");
2919 assert_eq!(s.fields.len(), 1, "directives are not parsed as fields");
2920 assert_eq!(s.unique_keys, vec![vec!["email".to_string()]]);
2921 assert_eq!(s.summary_template.as_deref(), Some("{role} at {company}"));
2922 }
2923
2924 #[test]
2925 fn schema_bullet_shard_directive_parses_values() {
2926 assert!(matches!(
2927 parse_schema_bullet("- shard: by-date"),
2928 SchemaBullet::Shard(Some(true))
2929 ));
2930 assert!(matches!(
2931 parse_schema_bullet("- shard: flat"),
2932 SchemaBullet::Shard(Some(false))
2933 ));
2934 // An unrecognized value is ignored (None), like an unknown modifier.
2935 assert!(matches!(
2936 parse_schema_bullet("- shard: weekly"),
2937 SchemaBullet::Shard(None)
2938 ));
2939 // A field whose name has a `(` before any `:` is still a field — the same
2940 // guard that keeps `- status (enum: a, b)` a field, not a directive.
2941 assert!(matches!(
2942 parse_schema_bullet("- shardiness (string)"),
2943 SchemaBullet::Field(_)
2944 ));
2945 }
2946
2947 #[test]
2948 fn parse_db_md_schema_captures_shard_directive() {
2949 let db = "---\ntype: db-md\nscope: x\nowner: y\n---\n\n## Schemas\n\n### shipment\n- carrier (string)\n- shard: by-date\n\n### contact\n- shard: flat\n";
2950 let config = parse_db_md(db, Path::new("DB.md")).unwrap();
2951 let shipment = config.schemas.get("shipment").expect("shipment schema");
2952 assert_eq!(shipment.shard, Some(true));
2953 assert_eq!(
2954 shipment.fields.len(),
2955 1,
2956 "`shard:` is a directive, not a field"
2957 );
2958 assert_eq!(config.schemas.get("contact").unwrap().shard, Some(false));
2959 }
2960
2961 // ── parse_db_md ──────────────────────────────────────────────────────────
2962
    /// A complete `DB.md` fixture. It contains frontmatter (`type`, `scope`,
    /// `owner`), a title with a one-line description, an `## Agent instructions`
    /// section, a `## Policies` section with `### Frozen pages` and
    /// `### Ignored types`, and a `## Schemas` section defining `contact` and
    /// `expense`.
    const CANONICAL_DB_MD: &str = "---\ntype: db-md\nscope: company\nowner: Sarah Chen\n---\n\n# Acme operations knowledge base\n\nCompany-scale institutional memory for Acme.\n\n## Agent instructions\n\nPrioritize creating `contact` records from new-sender emails. Use British English.\n\n## Policies\n\n### Frozen pages\n- `records/decisions/2026-q1-strategy.md` — finalized, do not modify.\n- `wiki/synthesis/2026-annual-plan.md` — signed-off plan.\n\n### Ignored types\n- `test`, `temp` — read but never synthesize.\n\n## Schemas\n\n### contact\n- name (required)\n- email (required, email)\n- company (required, link to records/companies/)\n- role (string)\n\n### expense\n- date (required, date)\n- amount (required)\n- currency (default USD)\n";
2964
2965 #[test]
2966 fn parse_db_md_extracts_all_canonical_sections() {
2967 let config = parse_db_md(CANONICAL_DB_MD, Path::new("DB.md")).unwrap();
2968
2969 // Agent instructions: free-form prose, heading line stripped.
2970 let ai = config
2971 .agent_instructions
2972 .expect("agent instructions present");
2973 assert!(ai.starts_with("Prioritize creating"));
2974 assert!(!ai.contains("## Agent instructions"));
2975
2976 // Frozen pages: paths extracted from backticked bullets, comments dropped.
2977 assert_eq!(
2978 config.frozen_pages,
2979 vec![
2980 PathBuf::from("records/decisions/2026-q1-strategy.md"),
2981 PathBuf::from("wiki/synthesis/2026-annual-plan.md"),
2982 ]
2983 );
2984
2985 // Ignored types: comma list, backticks/comment stripped.
2986 assert_eq!(
2987 config.ignored_types,
2988 vec!["test".to_string(), "temp".to_string()]
2989 );
2990
2991 // Schemas: two types, each with its fields in source order.
2992 assert_eq!(config.schemas.len(), 2);
2993 let contact = config.schemas.get("contact").expect("contact schema");
2994 let names: Vec<&str> = contact.fields.iter().map(|f| f.name.as_str()).collect();
2995 assert_eq!(names, vec!["name", "email", "company", "role"]);
2996 assert!(contact.fields[0].required); // name
2997 assert_eq!(contact.fields[1].shape, Some(Shape::Email)); // email
2998 assert_eq!(
2999 contact.fields[2].link_prefix,
3000 Some(PathBuf::from("records/companies"))
3001 ); // company
3002
3003 let expense = config.schemas.get("expense").expect("expense schema");
3004 let cur = expense
3005 .fields
3006 .iter()
3007 .find(|f| f.name == "currency")
3008 .unwrap();
3009 assert_eq!(cur.default, Some(Value::String("USD".into())));
3010 }
3011
3012 #[test]
3013 fn parse_db_md_handles_malformed_and_unknown_modifiers() {
3014 // corpus-b shape: a `## Schemas` section with a malformed bullet, an
3015 // unknown modifier, and bullets that appear with NO `### <type>`
3016 // heading (so they belong to no schema and are dropped).
3017 let text = "---\ntype: db-md\n---\n\n## Schemas\n- orphan (required)\n\n### ticket\n- priority (required, mystery, enum: low, high)\n- broken (\n";
3018 let config = parse_db_md(text, Path::new("DB.md")).unwrap();
3019
3020 // The orphan bullet under `## Schemas` with no `### type` heading is not
3021 // captured as a schema.
3022 assert_eq!(config.schemas.len(), 1);
3023 let ticket = config.schemas.get("ticket").expect("ticket schema");
3024 assert_eq!(ticket.fields.len(), 2);
3025
3026 let priority = &ticket.fields[0];
3027 assert!(priority.required);
3028 assert_eq!(priority.unknown_modifiers, vec!["mystery".to_string()]);
3029 assert_eq!(
3030 priority.enum_values,
3031 Some(vec!["low".to_string(), "high".to_string()])
3032 );
3033
3034 // A bullet with an unclosed paren still yields a usable name.
3035 let broken = &ticket.fields[1];
3036 assert_eq!(broken.name, "broken");
3037 }
3038
3039 #[test]
3040 fn parse_db_md_missing_frontmatter_errors() {
3041 let text = "# No frontmatter\n\n## Agent instructions\nhi\n";
3042 let err = parse_db_md(text, Path::new("DB.md")).unwrap_err();
3043 assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
3044 }
3045
3046 #[test]
3047 fn parse_db_md_absent_sections_default_empty() {
3048 let text = "---\ntype: db-md\n---\n\n# Title only\n";
3049 let config = parse_db_md(text, Path::new("DB.md")).unwrap();
3050 assert_eq!(config, Config::default());
3051 }
3052
3053 // ── fm set / --fm list-valued link fields (meeting.attendees & friends) ──
3054
/// Every write surface (`fm set`, `write --fm`) routes its value through
/// `Frontmatter::set`. When that value is a list of wiki-links (the SPEC's
/// `meeting.attendees` shape) it has to land as a YAML **block sequence** of
/// quoted links — something [`links_in_field_value`] reads back and
/// `dbmd validate` accepts — and never as the flow-form scalar string that
/// trips `WIKI_LINK_FLOW_FORM_LIST`. An agent may type the list unquoted
/// (`[[[a]], [[b]]]`) or quoted (`["[[a]]", "[[b]]"]`); both must normalize.
#[test]
fn set_list_of_wiki_links_becomes_block_sequence_both_spellings() {
    // The two spellings under test: unquoted first, quoted second.
    const SPELLINGS: [&str; 2] = [
        "[[[records/contacts/a]], [[records/contacts/b]]]",
        r#"["[[records/contacts/a]]", "[[records/contacts/b]]"]"#,
    ];

    for value in SPELLINGS {
        let mut fm = Frontmatter::default();
        fm.set("attendees", value).unwrap();

        // The stored value is a two-element sequence of clean quoted links.
        let stored = fm.extra.get("attendees").expect("attendees set");
        let items = match stored {
            Value::Sequence(items) => items,
            _ => panic!("attendees must be a Sequence, got {stored:?} for input {value}"),
        };
        assert_eq!(items.len(), 2, "input {value}");
        for (item, link) in items
            .iter()
            .zip(["[[records/contacts/a]]", "[[records/contacts/b]]"])
        {
            assert_eq!(*item, Value::String(link.into()));
        }

        // Reading the edges back yields exactly those two targets — a stray
        // bracket target would be the flow-form-string symptom.
        let links = links_in_field_value(stored)
            .into_iter()
            .map(|link| link.target)
            .collect::<Vec<_>>();
        assert_eq!(
            links,
            vec!["records/contacts/a", "records/contacts/b"],
            "input {value}"
        );

        // The canonical writer emits a block list rather than a scalar.
        let yaml = fm.to_yaml();
        let is_block_list = yaml.contains("attendees:\n");
        assert!(is_block_list, "expected block list in:\n{yaml}");
        let is_flow_scalar = yaml.contains("attendees: '[[");
        assert!(
            !is_flow_scalar,
            "must not be a flow-form scalar string in:\n{yaml}"
        );
    }
}
3104
/// Setting exactly *one* inline wiki-link keeps the value a scalar string
/// (so it renders inline as `field: [[x]]`). It must never be widened into a
/// one-item list; this protects the common `contact.company` /
/// `expense.vendor` shape.
#[test]
fn set_single_inline_wiki_link_stays_scalar() {
    const LINK: &str = "[[records/companies/tideform]]";

    let mut fm = Frontmatter::default();
    fm.set("company", LINK).unwrap();

    let expected = Value::String(LINK.into());
    let stored = fm.extra.get("company");
    assert_eq!(stored, Some(&expected));

    // The scalar is still recognized as exactly one link.
    let targets = links_in_field_value(stored.unwrap())
        .into_iter()
        .map(|link| link.target)
        .collect::<Vec<_>>();
    assert_eq!(targets, vec!["records/companies/tideform"]);
}
3123
/// Values that are not lists of clean wiki-links are stored verbatim as
/// scalar strings: list normalization only fires when every item is a clean
/// wiki-link.
#[test]
fn set_non_link_values_stay_scalar_strings() {
    let cases = [
        // Ordinary prose.
        ("location", "Video call (remote)"),
        // A flow list whose items are NOT wiki-links: it must not be
        // reinterpreted as a link sequence, and stays the string that was passed.
        ("note", "[draft, wip]"),
    ];

    let mut fm = Frontmatter::default();
    for (key, text) in cases {
        fm.set(key, text).unwrap();
        let expected = Value::String(text.into());
        assert_eq!(fm.extra.get(key), Some(&expected));
    }
}
3143
3144 // ── Regression: non-string YAML keys round-trip (no Rust Debug corruption) ─
3145
/// Numeric, boolean and float frontmatter keys are valid YAML. They must
/// survive a parse + re-emit with their original text instead of being
/// rewritten to a Rust `Debug` rendering such as `Number(2026)` or
/// `Bool(true)`. The key narrows to a string-typed key, but the operator's
/// data is not corrupted.
#[test]
fn regression_non_string_yaml_keys_keep_their_text_on_round_trip() {
    let yaml = "type: note\n2026: planning notes\ntrue: yes-key\n3.14: f\n";
    let fm = Frontmatter::parse(yaml, Path::new("x.md")).unwrap();

    // Each key is stored under its scalar text …
    for (key, complaint) in [
        ("2026", "numeric key text lost"),
        ("true", "bool key text lost"),
        ("3.14", "float key text lost"),
    ] {
        assert!(fm.extra.contains_key(key), "{complaint}");
    }
    // … and never under a Debug-formatted name.
    for debug_prefix in ["Number(", "Bool("] {
        assert!(!fm.extra.keys().any(|k| k.starts_with(debug_prefix)));
    }

    // Re-emitting must not write a Debug form to disk either.
    let out = fm.to_yaml();
    for debug_prefix in ["Number(", "Bool("] {
        assert!(
            !out.contains(debug_prefix),
            "Debug-form key emitted:\n{out}"
        );
    }
    // Both the key text and its value are still in the output.
    assert!(out.contains("2026"), "numeric key dropped:\n{out}");
    assert!(out.contains("planning notes"), "value dropped:\n{out}");
}
3169
3170 // ── Regression: universal-key sequence/mapping values are preserved (#2) ───
3171
/// A universal key holding a sequence or mapping (e.g.
/// `status: [active, draft]`) is not a valid scalar for that field. The old
/// behavior consumed and dropped the value, so `to_yaml` omitted the field
/// and `dbmd format` silently deleted it. The value must instead flow into
/// `extra` and be re-emitted.
#[test]
fn regression_universal_key_non_scalar_value_is_preserved_not_deleted() {
    let source = Path::new("x.md");
    let yaml = "type: note\nstatus:\n  - active\n  - draft\nsummary:\n  a: 1\n  b: 2\n";
    let fm = Frontmatter::parse(yaml, source).unwrap();

    // No valid scalar exists, so the typed accessors are empty …
    assert!(fm.status.is_none());
    assert!(fm.summary.is_none());
    // … while the raw data is retained in `extra`.
    for (key, complaint) in [
        ("status", "status value destroyed"),
        ("summary", "summary value destroyed"),
    ] {
        assert!(fm.extra.contains_key(key), "{complaint}");
    }

    // Re-emitting keeps both fields' data.
    let out = fm.to_yaml();
    for (needle, complaint) in [
        ("status", "status deleted on re-emit"),
        ("active", "status items deleted"),
        ("summary", "summary deleted on re-emit"),
    ] {
        assert!(out.contains(needle), "{complaint}:\n{out}");
    }

    // Parsing the emitted YAML again still finds both keys, so repeated
    // curator-loop writes do not lose the data.
    let reparsed = Frontmatter::parse(&out, source).unwrap();
    for key in ["status", "summary"] {
        assert!(reparsed.extra.contains_key(key));
    }
}
3203
3204 // ── Regression: non-scalar tags items don't erase the tags field (#5) ──────
3205
/// `tags: [[vip]]` — wiki-link brackets typed around a tag by mistake —
/// parses to a nested sequence. Previously `parse_tags` filtered out the
/// non-scalar item, leaving an empty tags vec that `to_yaml` then omitted, so
/// the tags line vanished. The raw value must now survive a re-emit.
#[test]
fn regression_non_scalar_tags_value_is_preserved_not_erased() {
    let source = Path::new("x.md");

    let fm = Frontmatter::parse("type: note\ntags: [[vip]]\n", source).unwrap();
    // There is no clean scalar list, so the typed vec stays empty while the
    // raw value is kept in `extra`.
    assert!(fm.tags.is_empty());
    assert!(fm.extra.contains_key("tags"), "tags value destroyed");

    let out = fm.to_yaml();
    assert!(out.contains("tags"), "tags deleted on re-emit:\n{out}");
    // The `vip` text is still on disk in some form.
    assert!(out.contains("vip"), "tag content erased:\n{out}");

    // Guard against regressing the happy path: a clean list still fills the
    // typed vec and leaves nothing behind in `extra`.
    let clean = Frontmatter::parse("type: note\ntags: [vip, renewal]\n", source).unwrap();
    let expected_tags = vec![String::from("vip"), String::from("renewal")];
    assert_eq!(clean.tags, expected_tags);
    assert!(!clean.extra.contains_key("tags"));
}
3231
3232 // ── Regression: plain nested string lists are NOT fabricated into links (#3) ─
3233
/// `groups: [[alpha], [beta]]` is the data `[["alpha"], ["beta"]]` — an
/// unknown nested string list that has to pass through unchanged. The old
/// `canonicalize_extra_value` fabricated `- '[[alpha]]'` / `- '[[beta]]'`
/// (short-form links the tool then flagged), changing the field's type.
#[test]
fn regression_plain_nested_string_list_is_not_turned_into_wiki_links() {
    let source = Path::new("x.md");
    let fm = Frontmatter::parse("type: note\ngroups: [[alpha], [beta]]\n", source).unwrap();
    let original_groups = fm.extra.get("groups").cloned();

    // The emitted YAML contains no invented wiki-link brackets.
    let out = fm.to_yaml();
    for fabricated in ["[[alpha]]", "[[beta]]"] {
        assert!(!out.contains(fabricated), "fabricated a wiki-link:\n{out}");
    }

    // After the canonical re-emit the value parses back to the same data …
    let reparsed = Frontmatter::parse(&out, source).unwrap();
    assert_eq!(
        reparsed.extra.get("groups"),
        original_groups.as_ref(),
        "nested string list mutated by canonicalize_extra_value"
    );
    // … and contributes no link fields.
    assert!(reparsed.link_fields().is_empty());
}
3259
3260 // ── Regression: fence-line trailing whitespace is tolerated (#4) ───────────
3261
/// A fence written as `--- ` (trailing whitespace, invisible in editors) is
/// accepted by index.rs/validate.rs, which both `trim_end()` the line. It
/// used to hard-fail every read/edit surface that goes through
/// `split_frontmatter`; all three must now agree.
#[test]
fn regression_split_frontmatter_tolerates_trailing_whitespace_on_fences() {
    let source = Path::new("f.md");
    // Opening fence has a trailing space, closing fence a trailing tab.
    let text = "--- \ntype: note\nsummary: x\n---\t\nbody\n";

    let parsed = split_frontmatter(text, source).unwrap();
    assert_eq!(parsed.frontmatter_yaml, "type: note\nsummary: x\n");
    assert_eq!(parsed.body, "body\n");

    // The extracted YAML also parses end to end, as `read_file` would do.
    let fm = Frontmatter::parse(&parsed.frontmatter_yaml, source).unwrap();
    assert_eq!(fm.type_.as_deref(), Some("note"));
}
3277
3278 // ── Regression: CommonMark trailing-'#' heading rule (#6) ──────────────────
3279
/// `heading_text` follows the CommonMark trailing-`#` rule for level-2
/// headings: a `#` that abuts the content is part of the text, while a
/// space-separated run of `#` is an ATX closing sequence and is removed.
#[test]
fn regression_heading_text_keeps_abutting_hash_drops_closing_sequence() {
    let cases = [
        // The `#` abuts content, so it is not a closing sequence.
        ("## C#", "C#"),
        ("## F#", "F#"),
        ("## issue-123#", "issue-123#"),
        // A genuine closing sequence (space before the `#` run) is dropped.
        ("## Title ##", "Title"),
        ("## Title #", "Title"),
        // Content consisting only of hashes collapses to empty.
        ("## ##", ""),
        // Without trailing hashes the text is unchanged.
        ("## Plain", "Plain"),
    ];

    for (line, expected) in cases {
        assert_eq!(heading_text(line, 2), expected);
    }
}
3294
/// The abutting-`#` rule must hold through the callers of heading parsing:
/// section extraction and `DB.md` schema registration.
#[test]
fn regression_extract_sections_keeps_csharp_heading_and_schema_type_binds() {
    // `dbmd sections` has to report `C#` rather than `C`.
    let sections = extract_sections("## C#\nbody\n");
    assert_eq!(sections.len(), 1);
    assert_eq!(sections[0].heading, "C#");

    // Likewise a `### c#` schema registers under `c#`, not `c`.
    let db_md = "---\ntype: db-md\n---\n\n## Schemas\n\n### c#\n- name (required)\n";
    let config = parse_db_md(db_md, Path::new("DB.md")).unwrap();
    let bound_to_full_name = config.schemas.contains_key("c#");
    assert!(bound_to_full_name, "schema bound to wrong key");
    assert!(!config.schemas.contains_key("c"));
}
3311
3312 // ── Regression: section line numbers offset by the frontmatter block (#7) ──
3313
/// `extract_sections_in_file` reports heading positions as file line
/// numbers (offset by the frontmatter block), while `extract_sections` keeps
/// reporting body-relative line numbers.
#[test]
fn regression_extract_sections_in_file_reports_source_line_numbers() {
    // Four frontmatter lines plus one body line put the heading on file
    // line 6; the body-relative answer would be 2.
    let with_frontmatter = "---\ntype: note\nsummary: x\n---\nbody line\n## Heading\nmore\n";
    let file_sections = extract_sections_in_file(with_frontmatter);
    assert_eq!(file_sections.len(), 1);
    assert_eq!(file_sections[0].heading, "Heading");
    assert_eq!(
        file_sections[0].line, 6,
        "section line not offset by frontmatter"
    );

    // The body-relative helper keeps its own frame (validate relies on it).
    let body_sections = extract_sections("body line\n## Heading\nmore\n");
    assert_eq!(body_sections[0].line, 2);

    // Without frontmatter the whole text is body and no offset applies.
    let no_frontmatter = extract_sections_in_file("## Top\nx\n## Next\n");
    assert_eq!(no_frontmatter[0].line, 1);
    assert_eq!(no_frontmatter[1].line, 3);
}
3333
3334 // ── Regression: colon-form schema field bullet parses modifiers (#8) ───────
3335
/// `- title: string, required` is the natural mis-spelling of
/// `- title (string, required)`. Previously the entire text became the field
/// name and all modifiers were silently lost; the colon form must now yield
/// the same name and modifiers as the paren form.
#[test]
fn regression_colon_form_field_bullet_parses_modifiers() {
    const COLON_FORM: &str = "- title: string, required";

    let direct = parse_field_spec(COLON_FORM);
    assert_eq!(direct.name, "title");
    assert!(direct.required, "required modifier lost on colon-form");
    assert_eq!(direct.shape, Some(Shape::String));

    // Via the schema-bullet classifier (the real path) it classifies as a Field.
    let bullet = parse_schema_bullet(COLON_FORM);
    let SchemaBullet::Field(field) = &bullet else {
        panic!("expected Field, got {other:?}", other = bullet);
    };
    assert_eq!(field.name, "title");
    assert!(field.required);
    assert_eq!(field.shape, Some(Shape::String));

    // A paren form with a colon inside its modifiers is still split on the
    // parentheses, not on that colon.
    let paren_form = parse_field_spec("- status (enum: open, closed)");
    assert_eq!(paren_form.name, "status");
    let expected_members = vec!["open".to_string(), "closed".to_string()];
    assert_eq!(paren_form.enum_values, Some(expected_members));
}
3364
3365 // ── Regression: comma inside a `default` value is preserved (#9) ───────────
3366
/// A `default` modifier whose value contains commas keeps the whole value
/// rather than being cut at the first comma.
#[test]
fn regression_default_value_preserves_internal_commas() {
    let title = parse_field_spec("- title (default Director, Operations)");
    let expected_title = Value::String("Director, Operations".into());
    assert_eq!(
        title.default,
        Some(expected_title),
        "comma-bearing default truncated"
    );

    let region = parse_field_spec("- region (default North America, EMEA fallback)");
    let expected_region = Value::String("North America, EMEA fallback".into());
    assert_eq!(region.default, Some(expected_region));

    // A single-token default keeps working.
    let currency = parse_field_spec("- currency (default USD)");
    let expected_currency = Value::String("USD".into());
    assert_eq!(currency.default, Some(expected_currency));
}
3386
3387 // ── Regression: a `default` after `enum` is parsed, not swallowed (#10) ────
3388
/// A `default …` modifier that follows an `enum` list is parsed as the
/// field's default and is not absorbed into the enum members.
#[test]
fn regression_default_after_enum_is_parsed_not_an_enum_member() {
    // `enum:` form with a trailing default.
    let colon_enum = parse_field_spec("- status (enum: open, closed, default open)");
    let members = vec!["open".to_string(), "closed".to_string()];
    assert_eq!(
        colon_enum.enum_values,
        Some(members),
        "`default open` leaked into the enum list"
    );
    let default_value = Value::String("open".into());
    assert_eq!(
        colon_enum.default,
        Some(default_value),
        "default after enum was dropped"
    );

    // Bare `enum` keyword form, again with a trailing default.
    let bare_enum = parse_field_spec("- status (enum, open, closed, default open)");
    let members = vec!["open".to_string(), "closed".to_string()];
    assert_eq!(bare_enum.enum_values, Some(members));
    let default_value = Value::String("open".into());
    assert_eq!(bare_enum.default, Some(default_value));
}
3411
3412 // ── Regression: frozen-page policy does not fail open (#11) ────────────────
3413
/// A frozen-page entry written with a leading `/` still protects the
/// corresponding relative path, with or without the `.md` extension.
#[test]
fn regression_frozen_match_handles_leading_slash() {
    let entry = PathBuf::from("/records/decisions/q1.md");
    let cfg = Config {
        frozen_pages: vec![entry],
        ..Config::default()
    };

    let with_extension = Path::new("records/decisions/q1.md");
    assert!(
        cfg.is_frozen(with_extension),
        "leading-slash entry failed open"
    );

    let without_extension = Path::new("records/decisions/q1");
    assert!(cfg.is_frozen(without_extension));
}
3426
/// Frozen-page entries may be glob patterns: `*` matches within one path
/// segment and `**` matches across segments.
#[test]
fn regression_frozen_match_supports_globs() {
    // Builds a config whose only frozen entry is `pattern`.
    let frozen_by = |pattern: &str| Config {
        frozen_pages: vec![PathBuf::from(pattern)],
        ..Config::default()
    };

    // Whole-segment `*`: protects files directly inside the folder …
    let cfg = frozen_by("records/decisions/*");
    assert!(
        cfg.is_frozen(Path::new("records/decisions/q1.md")),
        "glob entry failed to protect a concrete file"
    );
    assert!(cfg.is_frozen(Path::new("records/decisions/q2.md")));
    // … but does not cross a `/` segment.
    assert!(!cfg.is_frozen(Path::new("records/decisions/sub/q1.md")));

    // `**` crosses segments, yet stays under its own prefix.
    let deep = frozen_by("records/**");
    for covered in ["records/decisions/sub/q1.md", "records/x.md"] {
        assert!(deep.is_frozen(Path::new(covered)));
    }
    assert!(!deep.is_frozen(Path::new("wiki/x.md")));

    // An intra-segment glob (`q*`): only file names starting with `q` match.
    let suffix = frozen_by("records/decisions/q*");
    assert!(suffix.is_frozen(Path::new("records/decisions/q1.md")));
    assert!(!suffix.is_frozen(Path::new("records/decisions/draft.md")));
}
3456
/// A frozen-page bullet such as `records/decisions/q3.md - finalized`
/// (trailing comment introduced by a single ASCII hyphen, no backticks) must
/// have the comment stripped so that the entry is just the path.
#[test]
fn regression_frozen_entry_single_hyphen_comment_is_stripped() {
    let bullet = "- records/decisions/q3.md - finalized";

    // The extracted entry is the bare path.
    assert_eq!(extract_path_bullet(bullet), "records/decisions/q3.md");

    // End to end: a config built from that bullet freezes the file.
    let frozen_pages = vec![PathBuf::from(extract_path_bullet(bullet))];
    let cfg = Config {
        frozen_pages,
        ..Config::default()
    };
    let target = Path::new("records/decisions/q3.md");
    assert!(
        cfg.is_frozen(target),
        "single-hyphen-comment entry failed open"
    );
}
3475 }
3476}