//! `parser` — read and write db.md markdown files.
//!
//! Parses the YAML frontmatter block, the markdown body, wiki-links, standard
//! markdown links, `##` sections, and the structured sections of the `DB.md`
//! config file. Also the atomic writer that round-trips a file while
//! preserving the operator-edited body verbatim and emitting frontmatter in
//! canonical key order.
//!
//! Strict on required fields, lenient on unknowns: any frontmatter key the
//! spec doesn't recognize is preserved in [`Frontmatter::extra`] as ambient
//! context and round-tripped untouched.
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use chrono::{DateTime, FixedOffset};
use serde_yml::{Mapping, Value};
/// The three canonical layer folder names. A path is "content" / a wiki-link is
/// "full-path" only when it resolves under one of these.
const LAYER_DIRS: [&str; 3] = ["sources", "records", "wiki"];
/// Errors produced while parsing a markdown file or the `DB.md` config.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
/// The frontmatter block was not valid YAML. Maps to validate code
/// `FM_MALFORMED_YAML`.
#[error("malformed YAML frontmatter in {file}: {source}")]
MalformedYaml {
/// The file whose frontmatter failed to parse.
file: PathBuf,
/// The underlying YAML error.
source: serde_yml::Error,
},
/// The file has no `---`-delimited frontmatter block at its very start.
#[error("missing frontmatter block in {file}")]
MissingFrontmatter {
/// The offending file.
file: PathBuf,
},
/// A required field was absent. Maps to validate code `FM_MISSING_TYPE`
/// (for `type`) and the per-type required-field codes.
#[error("missing required field '{key}' in {file}")]
MissingField {
/// The file missing the field.
file: PathBuf,
/// The required key.
key: String,
},
/// A timestamp field was not ISO-8601 / RFC3339. Maps to `FM_BAD_TIMESTAMP`.
#[error("bad timestamp in field '{key}' of {file}: {value}")]
BadTimestamp {
/// The file.
file: PathBuf,
/// The frontmatter key.
key: String,
/// The unparseable value.
value: String,
},
/// An I/O error reading the file.
#[error(transparent)]
Io(#[from] std::io::Error),
}
/// The parsed YAML frontmatter of a db.md file.
///
/// The universal-contract fields are typed accessors; everything else lands in
/// [`extra`](Frontmatter::extra) as ambient context (unknown-field passthrough)
/// and is round-tripped verbatim. The atomic writer re-emits keys in canonical
/// order: `type`, `id`, `created`, `updated`, `summary` first, then
/// type-specific fields, then `status` / `tags`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Frontmatter {
/// `type` — required on content files; the primary query key.
pub type_: Option<String>,
/// `id` — optional; derived from the file path when absent.
pub id: Option<String>,
/// `created` — RFC3339; required and auto-set on content-file create.
pub created: Option<DateTime<FixedOffset>>,
/// `updated` — RFC3339; required and auto-maintained on content files.
pub updated: Option<DateTime<FixedOffset>>,
/// `summary` — the one-line catalog line; required on every content file.
pub summary: Option<String>,
/// `status` — optional lifecycle state.
pub status: Option<String>,
/// `tags` — optional flat list of short scalar labels.
pub tags: Vec<String>,
/// All other frontmatter keys (type-specific + custom), preserved verbatim
/// in insertion-stable sorted order. Wiki-link-valued fields keep their raw
/// YAML form here; [`Frontmatter::link_fields`] surfaces them as
/// [`WikiLink`]s.
pub extra: BTreeMap<String, Value>,
}
impl Frontmatter {
/// Parse a YAML frontmatter block (the text between the opening and closing
/// `---` fences, exclusive) into a [`Frontmatter`].
///
/// Lenient on unknown keys (they go to [`extra`](Frontmatter::extra));
/// returns [`ParseError::MalformedYaml`] only on YAML that doesn't parse.
pub fn parse(yaml: &str, file: &Path) -> Result<Self, ParseError> {
// An empty (or whitespace-only) frontmatter block is a valid, empty
// mapping — not a YAML error.
let value: Value = if yaml.trim().is_empty() {
Value::Mapping(Mapping::new())
} else {
serde_yml::from_str(yaml).map_err(|source| ParseError::MalformedYaml {
file: file.to_path_buf(),
source,
})?
};
// Top-level frontmatter must be a mapping. A scalar or sequence at the
// top level is malformed for our purposes; surface it as such.
let map = match value {
Value::Mapping(m) => m,
Value::Null => Mapping::new(),
other => {
// serde_yml::Error has no public constructor, so manufacture a
// representative one by deserializing the (sequence/scalar)
// value into a Mapping, which always fails with a type error.
let source = serde_yml::from_value::<Mapping>(other)
.expect_err("non-mapping frontmatter top level deserializes to Mapping");
return Err(ParseError::MalformedYaml {
file: file.to_path_buf(),
source,
});
}
};
let mut fm = Frontmatter::default();
for (k, v) in map {
let key = match k.as_str() {
Some(s) => s.to_string(),
// Non-string keys are unusual; stringify defensively and keep
// them in `extra` so nothing is silently dropped.
None => format!("{k:?}"),
};
match key.as_str() {
"type" => fm.type_ = v.as_str().map(str::to_string),
"id" => fm.id = v.as_str().map(str::to_string),
"created" => fm.created = parse_timestamp(&v, "created", file)?,
"updated" => fm.updated = parse_timestamp(&v, "updated", file)?,
"summary" => fm.summary = v.as_str().map(str::to_string),
"status" => fm.status = v.as_str().map(str::to_string),
"tags" => fm.tags = parse_tags(&v),
_ => {
fm.extra.insert(key, v);
}
}
}
Ok(fm)
}
/// Serialize the frontmatter back to a YAML block (no `---` fences) in
/// canonical key order. Round-trips [`extra`](Frontmatter::extra) verbatim.
pub fn to_yaml(&self) -> String {
// Build an order-preserving mapping in canonical key order:
// type, id, created, updated, summary (universal head)
// <type-specific extra, BTreeMap-sorted>
// status, tags (universal tail)
// serde_yml::Mapping preserves insertion order, so one serialize call
// emits the block in exactly this order with correct YAML quoting.
let mut map = Mapping::new();
if let Some(t) = &self.type_ {
map.insert(Value::String("type".into()), Value::String(t.clone()));
}
if let Some(id) = &self.id {
map.insert(Value::String("id".into()), Value::String(id.clone()));
}
if let Some(created) = &self.created {
map.insert(
Value::String("created".into()),
Value::String(created.to_rfc3339()),
);
}
if let Some(updated) = &self.updated {
map.insert(
Value::String("updated".into()),
Value::String(updated.to_rfc3339()),
);
}
if let Some(summary) = &self.summary {
map.insert(
Value::String("summary".into()),
Value::String(summary.clone()),
);
}
// Type-specific + custom fields, in BTreeMap (sorted) order. Each value
// is canonicalized so a wiki-link round-trips to the form the writer and
// `dbmd validate` agree on — critically, the SPEC-canonical *unquoted*
// scalar `field: [[x]]` (which YAML parses to a nested `Seq[Seq[String]]`)
// is re-emitted as a quoted scalar `'[[x]]'` instead of the bracket-less
// block sequence `- - x` that a verbatim re-emit would produce and that
// destroys the link. See [`canonicalize_extra_value`].
for (k, v) in &self.extra {
map.insert(Value::String(k.clone()), canonicalize_extra_value(v));
}
if let Some(status) = &self.status {
map.insert(
Value::String("status".into()),
Value::String(status.clone()),
);
}
if !self.tags.is_empty() {
map.insert(
Value::String("tags".into()),
Value::Sequence(self.tags.iter().cloned().map(Value::String).collect()),
);
}
if map.is_empty() {
return String::new();
}
serde_yml::to_string(&Value::Mapping(map)).unwrap_or_default()
}
/// True if the file is content (under `sources/`, `records/`, or `wiki/`)
/// and not an `index.md`. Used by validate to decide which files require a
/// `summary`. Meta files (`DB.md`, `index.md`, `log.md`) return false.
pub fn is_content_file(path: &Path) -> bool {
// index.md is a meta file at every level, never content.
if path.file_name().and_then(|n| n.to_str()) == Some("index.md") {
return false;
}
// Content iff some path component is one of the three layer dirs. This
// works for both store-relative (`sources/emails/x.md`) and absolute
// (`/home/db/sources/emails/x.md`) paths. DB.md / log.md sit at the
// root, under no layer, so they fall through to false.
path.components().any(|c| {
c.as_os_str()
.to_str()
.is_some_and(|s| LAYER_DIRS.contains(&s))
})
}
/// Resolve the file's effective `id`: the explicit `id` field if present,
/// otherwise derived from the store-relative path (filename without `.md`).
pub fn effective_id(&self, store_relative_path: &Path) -> String {
if let Some(id) = &self.id {
if !id.is_empty() {
return id.clone();
}
}
// Derived id = filename without the `.md` extension.
store_relative_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or_default()
.to_string()
}
/// Read a single frontmatter key as a raw YAML [`Value`], looking in the
/// typed fields first and then [`extra`](Frontmatter::extra).
pub fn get(&self, key: &str) -> Option<Value> {
match key {
"type" => self.type_.clone().map(Value::String),
"id" => self.id.clone().map(Value::String),
"created" => self.created.map(|d| Value::String(d.to_rfc3339())),
"updated" => self.updated.map(|d| Value::String(d.to_rfc3339())),
"summary" => self.summary.clone().map(Value::String),
"status" => self.status.clone().map(Value::String),
"tags" => {
if self.tags.is_empty() {
None
} else {
Some(Value::Sequence(
self.tags.iter().cloned().map(Value::String).collect(),
))
}
}
_ => self.extra.get(key).cloned(),
}
}
/// Set a single frontmatter key from a string value, routing universal-
/// contract keys to their typed fields and everything else to
/// [`extra`](Frontmatter::extra). Used by `dbmd fm set`.
pub fn set(&mut self, key: &str, value: &str) -> Result<(), ParseError> {
match key {
"type" => self.type_ = Some(value.to_string()),
"id" => self.id = Some(value.to_string()),
"created" => {
self.created = Some(parse_rfc3339(value, "created", Path::new("<fm set>"))?)
}
"updated" => {
self.updated = Some(parse_rfc3339(value, "updated", Path::new("<fm set>"))?)
}
"summary" => self.summary = Some(value.to_string()),
"status" => self.status = Some(value.to_string()),
"tags" => {
// Accept either a YAML flow list (`[a, b]`) or a single scalar
// tag. Anything that parses to a sequence becomes the tag list;
// otherwise the whole string is one tag.
self.tags = match serde_yml::from_str::<Value>(value) {
Ok(Value::Sequence(seq)) => parse_tags(&Value::Sequence(seq)),
_ => vec![value.to_string()],
};
}
_ => {
// A custom / type-specific field. The value is a scalar string by
// default, but the spec's list-valued link fields (e.g.
// `meeting.attendees`, SPEC § Linking) must serialize as a YAML
// block sequence of quoted wiki-links — never the flow-form string
// `"[[[a]], [[b]]]"`, which `dbmd validate` rejects as
// `WIKI_LINK_FLOW_FORM_LIST`. When the value parses as a YAML
// sequence whose every item is a clean single wiki-link, store the
// canonical sequence so `to_yaml` emits block form. Everything else
// — plain text, and a single inline `[[x]]` (which YAML reads as a
// nested `Seq[Seq[String]]`, not a list of link strings) — stays a
// verbatim scalar string, preserving the prior behavior.
let stored = parse_link_list_value(value)
.unwrap_or_else(|| Value::String(value.to_string()));
self.extra.insert(key.to_string(), stored);
}
}
Ok(())
}
/// Extract every frontmatter field whose value is a wiki-link (scalar
/// inline form or a block-sequence list), pairing each with its key. The
/// validate engine checks these against `(link)` schema annotations.
pub fn link_fields(&self) -> Vec<(String, WikiLink)> {
let mut out = Vec::new();
// `summary` may carry navigational wiki-links (spec encourages it).
if let Some(summary) = &self.summary {
for link in extract_wiki_links(summary, Path::new("")) {
out.push(("summary".to_string(), link));
}
}
// Every type-specific / custom field: a scalar wiki-link or a list of
// wiki-links, in either the quoted (`"[[x]]"`) or the canonical unquoted
// (`[[x]]`) form. See [`links_in_field_value`] for the YAML shapes.
for (key, value) in &self.extra {
for link in links_in_field_value(value) {
out.push((key.clone(), link));
}
}
out
}
}
/// A wiki-link reference inside the store: `[[target]]` or `[[target|display]]`.
///
/// `target` is always recorded as written; [`is_full_path`](WikiLink::is_full_path)
/// flags whether it's a full store-relative path (the doctrine) versus a
/// short-form (a validation error).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WikiLink {
/// The link target as written, without the `[[ ]]` and without `|display`.
pub target: String,
/// The optional `|display` text override.
pub display: Option<String>,
/// True when `target` is a full store-relative path (contains a `/` and
/// resolves under a known layer); false for short-form targets like
/// `sarah-chen` — which validate reports as `WIKI_LINK_SHORT_FORM`.
pub is_full_path: bool,
/// True when `target` carries a trailing `.md` extension — validate warns
/// `WIKI_LINK_HAS_EXTENSION`; the canonical writers emit the bare form.
pub has_md_extension: bool,
/// Where the link appears: `(file, line, col)`, 1-based line and column.
pub location: (PathBuf, u32, u32),
}
/// A standard markdown link `[text](url)` — an external reference, kept in a
/// stream separate from [`WikiLink`] so external targets are visible to the
/// toolkit without being conflated with in-store edges. Not graph-validated.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownLink {
/// The link text inside `[ ]`.
pub text: String,
/// The URL or path inside `( )`.
pub url: String,
/// Where the link appears: `(file, line, col)`, 1-based.
pub location: (PathBuf, u32, u32),
}
/// A `##`/`###` section of a markdown body: the heading text plus the byte
/// slice of the body it spans (heading line through the line before the next
/// heading of equal-or-shallower depth).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Section {
/// The heading text (without the leading `#`s).
pub heading: String,
/// Heading depth (number of leading `#`s).
pub level: u8,
/// The 1-based line where the heading appears.
pub line: u32,
/// The section body, from the heading line to the next sibling-or-shallower
/// heading (exclusive), as a slice of the original body.
pub body: String,
}
/// The parsed structured content of a store's `DB.md` config file.
///
/// All four parts are optional in the source; absent parts fall back to spec
/// defaults. Produced by [`parse_db_md`].
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Config {
/// Body of the `## Agent instructions` section — free-form prose passed to
/// the agent's system prompt.
pub agent_instructions: Option<String>,
/// `## Policies` → `### Frozen pages`: store-relative paths the toolkit
/// refuses to write (`POLICY_FROZEN_PAGE`).
pub frozen_pages: Vec<PathBuf>,
/// `## Policies` → `### Ignored types`: type names the curator never
/// synthesizes (still readable as ambient context).
pub ignored_types: Vec<String>,
/// `## Schemas` → one entry per `### <type>` sub-section.
pub schemas: BTreeMap<String, Schema>,
}
impl Config {
/// The `### Frozen pages` entry that matches a store-relative `target`, if
/// any. The **single** frozen-page matcher every write surface must funnel
/// through so the policy is enforced identically on `write` / `fm set` /
/// `fm init` / `link` / `rename` / `format`.
///
/// Comparison is normalized so a policy line and a write target match
/// regardless of incidental spelling differences:
/// - `/` path separators on every OS,
/// - a single leading `./` dropped,
/// - a trailing `.md` dropped on **both** sides — `parse_db_md` stores
/// frozen entries verbatim, so an operator who writes the natural
/// extensionless spelling (`records/decisions/q1`) must protect the file
/// (`records/decisions/q1.md`) exactly as the `.md` spelling does.
///
/// Returns the matched config entry verbatim (its original spelling) so the
/// caller can name it in the `POLICY_FROZEN_PAGE` refusal.
pub fn frozen_match(&self, target: &Path) -> Option<PathBuf> {
let want = normalize_frozen_path(target);
self.frozen_pages
.iter()
.find(|frozen| normalize_frozen_path(frozen) == want)
.cloned()
}
/// True if `target` (store-relative) is a frozen page. Convenience wrapper
/// over [`Config::frozen_match`] for callers that only need presence.
pub fn is_frozen(&self, target: &Path) -> bool {
self.frozen_match(target).is_some()
}
}
/// Normalize a path for frozen-page comparison: `/` separators, a single
/// leading `./` dropped, and a trailing `.md` dropped. Both the policy entry
/// and the write target pass through this before equality, so the match is
/// separator-, `./`-, and `.md`-insensitive.
fn normalize_frozen_path(p: &Path) -> String {
let unix: String = p
.components()
.filter_map(|c| c.as_os_str().to_str())
.collect::<Vec<_>>()
.join("/");
let no_dot = unix.strip_prefix("./").unwrap_or(&unix);
no_dot.strip_suffix(".md").unwrap_or(no_dot).to_string()
}
/// A custom (or canonical-override) type schema parsed from a `DB.md`
/// `### <type>` sub-section.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct Schema {
/// One [`FieldSpec`] per bulleted field line, in source order.
pub fields: Vec<FieldSpec>,
}
/// One field declaration inside a [`Schema`]: `- <name> (<modifiers>)`.
///
/// Modifiers are comma-separated inside the parens; this captures the
/// recognized ones as typed fields and stashes anything unrecognized in
/// [`unknown_modifiers`](FieldSpec::unknown_modifiers) (surfaced as `Info`).
#[derive(Debug, Clone, Default, PartialEq)]
pub struct FieldSpec {
/// The field name.
pub name: String,
/// `required` modifier present.
pub required: bool,
/// The shape modifier (`string`/`int`/`bool`/`date`/`email`/`currency`/
/// `url`), if any.
pub shape: Option<Shape>,
/// `link to <prefix>/` — the store-relative prefix a wiki-link target must
/// start with. The trailing slash is required in the source syntax.
pub link_prefix: Option<PathBuf>,
/// `default <value>` — the value written when the field is absent.
pub default: Option<Value>,
/// `enum: <v1>, <v2>, ...` — the allowed values (must be the last modifier
/// on the line because of its own commas).
pub enum_values: Option<Vec<String>>,
/// Any modifiers not in the recognized vocabulary, preserved verbatim;
/// validate surfaces these as `Info`, never errors.
pub unknown_modifiers: Vec<String>,
}
/// A recognized shape modifier for a schema field. Validate enforces the
/// corresponding value shape (`SCHEMA_SHAPE_MISMATCH` on violation).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Shape {
/// Any scalar string.
String,
/// Integer.
Int,
/// Boolean.
Bool,
/// RFC3339 / ISO-8601 date.
Date,
/// `<local>@<domain>` email address.
Email,
/// A currency amount.
Currency,
/// A URL.
Url,
}
/// The result of splitting a raw file into its frontmatter block and body.
///
/// `body` is the verbatim remainder after the closing `---` fence — the writer
/// preserves it byte-for-byte so operator edits are never reflowed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFile {
/// The raw frontmatter YAML (between the fences, exclusive of them).
pub frontmatter_yaml: String,
/// The verbatim body (everything after the closing `---`).
pub body: String,
}
/// Split a file's full text into its frontmatter block and body. The
/// frontmatter block must be the very first thing in the file, delimited by
/// `---` on its own line at start and end. Returns
/// [`ParseError::MissingFrontmatter`] if absent.
pub fn split_frontmatter(text: &str, file: &Path) -> Result<ParsedFile, ParseError> {
// The opening fence must be the very first line: `---` (optionally with a
// trailing CR), no leading whitespace, nothing before it.
let mut lines = text.split_inclusive('\n');
let first = lines.next().unwrap_or("");
if first.trim_end_matches(['\r', '\n']) != "---" {
return Err(ParseError::MissingFrontmatter {
file: file.to_path_buf(),
});
}
// Scan for the closing fence line. Track byte offsets so we can slice the
// YAML (between fences, exclusive) and the body (verbatim, after the
// closing fence's line terminator).
let opening_len = first.len();
let mut offset = opening_len;
for line in lines {
if line.trim_end_matches(['\r', '\n']) == "---" {
let yaml = &text[opening_len..offset];
let body_start = offset + line.len();
let body = &text[body_start..];
return Ok(ParsedFile {
frontmatter_yaml: yaml.to_string(),
body: body.to_string(),
});
}
offset += line.len();
}
// Opening fence present but no closing fence: malformed frontmatter block.
Err(ParseError::MissingFrontmatter {
file: file.to_path_buf(),
})
}
/// Read a file from disk and parse it into typed [`Frontmatter`] plus the
/// verbatim body string.
pub fn read_file(path: &Path) -> Result<(Frontmatter, String), ParseError> {
let text = std::fs::read_to_string(path)?;
let parsed = split_frontmatter(&text, path)?;
let fm = Frontmatter::parse(&parsed.frontmatter_yaml, path)?;
Ok((fm, parsed.body))
}
/// Atomically write a markdown file from frontmatter + body: emit the
/// frontmatter in canonical key order, then the body verbatim, via a
/// temp-file-rename so a reader never sees a half-written file. Preserves the
/// operator-edited body exactly as given.
pub fn write_file(path: &Path, frontmatter: &Frontmatter, body: &str) -> Result<(), ParseError> {
use std::io::Write;
let yaml = frontmatter.to_yaml();
// `to_yaml` already terminates each block with a newline. Compose the file
// as: opening fence, frontmatter YAML, closing fence, then body verbatim.
let mut contents = String::with_capacity(yaml.len() + body.len() + 8);
contents.push_str("---\n");
contents.push_str(&yaml);
contents.push_str("---\n");
contents.push_str(body);
// Atomic write: write to a sibling temp file in the same directory, then
// rename over the target. Same-dir rename is atomic on a single
// filesystem, so a concurrent reader never sees a half-written file.
let parent = path.parent().unwrap_or_else(|| Path::new("."));
std::fs::create_dir_all(parent)?;
let file_name = path
.file_name()
.and_then(|n| n.to_str())
.unwrap_or("dbmd-write");
let tmp = parent.join(format!(".{file_name}.tmp.{}", std::process::id()));
// Scope the handle so it is flushed and closed before the rename.
{
let mut f = std::fs::File::create(&tmp)?;
f.write_all(contents.as_bytes())?;
f.sync_all()?;
}
// On failure, clean up the temp file rather than leaking it.
if let Err(e) = std::fs::rename(&tmp, path) {
let _ = std::fs::remove_file(&tmp);
return Err(ParseError::Io(e));
}
Ok(())
}
/// Extract every wiki-link from a body (and inline frontmatter), returning the
/// structured [`WikiLink`] stream with short-form / `.md`-extension flags and
/// `(file, line, col)` locations set.
pub fn extract_wiki_links(body: &str, file: &Path) -> Vec<WikiLink> {
static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
let re = RE.get_or_init(|| {
// [[target]] or [[target|display]]; target/display exclude brackets and
// (for target) the `|` separator so nested forms don't over-match.
regex::Regex::new(r"\[\[([^\[\]|]+?)(?:\|([^\[\]]*))?\]\]").expect("valid wiki-link regex")
});
let mut out = Vec::new();
for (line_idx, line) in body.lines().enumerate() {
for caps in re.captures_iter(line) {
let whole = caps.get(0).expect("group 0 always present");
let target = caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
let display = caps.get(2).map(|m| m.as_str().to_string());
out.push(WikiLink {
is_full_path: target_is_full_path(&target),
has_md_extension: target_has_md_extension(&target),
target,
display,
location: (
file.to_path_buf(),
(line_idx as u32) + 1,
char_column(line, whole.start()),
),
});
}
}
out
}
/// Extract every standard markdown link `[text](url)` from a body into a
/// separate stream, kept distinct from wiki-links.
pub fn extract_markdown_links(body: &str, file: &Path) -> Vec<MarkdownLink> {
static RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
let re = RE.get_or_init(|| {
// [text](url). `text` excludes brackets so a wiki-link `[[x]]` (which
// has `]]`, not `](`) never matches; `url` excludes `)` and whitespace.
regex::Regex::new(r"\[([^\[\]]*)\]\(([^)\s]*)\)").expect("valid markdown-link regex")
});
let mut out = Vec::new();
for (line_idx, line) in body.lines().enumerate() {
for caps in re.captures_iter(line) {
let whole = caps.get(0).expect("group 0 always present");
out.push(MarkdownLink {
text: caps.get(1).map(|m| m.as_str()).unwrap_or("").to_string(),
url: caps.get(2).map(|m| m.as_str()).unwrap_or("").to_string(),
location: (
file.to_path_buf(),
(line_idx as u32) + 1,
char_column(line, whole.start()),
),
});
}
}
out
}
/// Detect the frontmatter wiki-link-list mis-encoding: a wiki-link *list*
/// written so YAML parses it as nested sequences instead of a clean list of
/// strings. Returns the offending keys so validate can emit
/// `WIKI_LINK_FLOW_FORM_LIST`.
///
/// The subtlety is that `[[x]]` is YAML for "a list containing `[x]`", so the
/// shapes nest:
///
/// - **Scalar inline** `company: [[records/x]]` → `Seq[ Seq[String] ]`
/// (double-nested). This is the spec's scalar wiki-link form — NOT flagged.
/// - **Flow list** `attendees: [[[a]], [[b]]]` → `Seq[ Seq[Seq[String]], … ]`
/// (triple-nested). The list mis-encoding — flagged.
/// - **Unquoted block list** (`- [[a]]` per line) → also triple-nested, so it
/// is flagged too; the canonical list form must quote each item
/// (`- "[[a]]"`), which parses to a clean `Seq[String, …]` and is NOT flagged.
///
/// So the discriminator is nesting depth: a *list* mis-encoding has at least one
/// item that is itself a sequence-of-sequences, whereas a scalar inline link's
/// single item is a sequence-of-scalars.
pub fn detect_flow_form_link_lists(frontmatter_yaml: &str) -> Vec<String> {
let value: Value = match serde_yml::from_str(frontmatter_yaml) {
Ok(v) => v,
// Malformed YAML is FM_MALFORMED_YAML's job, not ours; report nothing.
Err(_) => return Vec::new(),
};
let Value::Mapping(map) = value else {
return Vec::new();
};
let mut out = Vec::new();
for (k, v) in &map {
if let Value::Sequence(items) = v {
// Triple-nesting: some outer item is a sequence that itself holds a
// sequence. Scalar inline `[[x]]` is only double-nested, so it
// never matches.
let is_link_list = items.iter().any(|item| match item {
Value::Sequence(inner) => inner.iter().any(|x| matches!(x, Value::Sequence(_))),
_ => false,
});
if is_link_list {
if let Some(key) = k.as_str() {
out.push(key.to_string());
}
}
}
}
out
}
/// Extract the `##`/`###` sections of a markdown body into a flat list with
/// body slices.
pub fn extract_sections(body: &str) -> Vec<Section> {
// Keep each line's start so we can slice the body verbatim (exact newlines).
let lines: Vec<&str> = body.split_inclusive('\n').collect();
// First pass: classify heading levels (0 = not a heading), honoring fenced
// code blocks so a `## x` inside a ``` fence is not treated as a heading.
let mut levels: Vec<u8> = Vec::with_capacity(lines.len());
let mut fence: Option<(u8, usize)> = None;
for line in &lines {
let content = line.trim_end_matches(['\n', '\r']);
if let Some(f) = fence {
if is_closing_fence(content, f) {
fence = None;
}
levels.push(0);
continue;
}
if let Some(opened) = opening_fence(content) {
fence = Some(opened);
levels.push(0);
continue;
}
levels.push(heading_level(content));
}
// Second pass: emit `##`+ headings; each section body runs from its heading
// line to the next heading at an equal-or-shallower level (exclusive).
let mut sections = Vec::new();
for (i, &lvl) in levels.iter().enumerate() {
if lvl < 2 {
continue;
}
let heading_line = lines[i].trim_end_matches(['\n', '\r']);
let heading = heading_text(heading_line, lvl);
let mut end = lines.len();
for (j, &other) in levels.iter().enumerate().skip(i + 1) {
if other != 0 && other <= lvl {
end = j;
break;
}
}
sections.push(Section {
heading,
level: lvl,
line: (i + 1) as u32,
body: lines[i..end].concat(),
});
}
sections
}
/// Parse a store's `DB.md` file into a [`Config`]: the `## Agent instructions`
/// prose, `## Policies` (`### Frozen pages` + `### Ignored types`), and
/// `## Schemas` (`### <type>` field-bullet blocks). Unrecognized sections are
/// ignored; absent sections leave their [`Config`] fields at default.
pub fn parse_db_md(text: &str, file: &Path) -> Result<Config, ParseError> {
// The structured sections live in the body (after frontmatter). DB.md must
// still start with a valid `---` block (`type: db-md`); if it's missing we
// surface MissingFrontmatter like any other file.
let parsed = split_frontmatter(text, file)?;
let sections = extract_sections(&parsed.body);
let mut config = Config::default();
// Track which H2 region each H3 belongs to as we walk the flat list.
let mut current_h2: Option<String> = None;
for section in §ions {
match section.level {
2 => {
let name = section.heading.trim().to_ascii_lowercase();
current_h2 = Some(name.clone());
if name == "agent instructions" {
let prose = section_prose(§ion.body);
if !prose.is_empty() {
config.agent_instructions = Some(prose);
}
}
}
3 => {
let h2 = current_h2.as_deref().unwrap_or("");
let h3 = section.heading.trim().to_ascii_lowercase();
match (h2, h3.as_str()) {
("policies", "frozen pages") => {
config.frozen_pages = bullet_lines(§ion.body)
.into_iter()
.map(|b| PathBuf::from(extract_path_bullet(&b)))
.collect();
}
("policies", "ignored types") => {
config.ignored_types = bullet_lines(§ion.body)
.into_iter()
.flat_map(|b| extract_type_list_bullet(&b))
.collect();
}
("schemas", _) => {
// The H3 heading text (as written) is the type name.
let type_name = section.heading.trim().to_string();
let fields: Vec<FieldSpec> = bullet_lines(§ion.body)
.into_iter()
.map(|b| parse_field_spec(&b))
.collect();
config.schemas.insert(type_name, Schema { fields });
}
_ => {}
}
}
_ => {}
}
}
Ok(config)
}
/// Parse a single `## Schemas` field-bullet line — `- <name> (<modifiers>)` —
/// into a [`FieldSpec`], capturing recognized modifiers and stashing the rest
/// in [`FieldSpec::unknown_modifiers`].
pub fn parse_field_spec(bullet_line: &str) -> FieldSpec {
// Strip the leading bullet marker (`- ` / `* ` / `+ `) and surrounding ws.
let line = bullet_line.trim();
let line = line
.strip_prefix("- ")
.or_else(|| line.strip_prefix("* "))
.or_else(|| line.strip_prefix("+ "))
.or_else(|| line.strip_prefix('-'))
.unwrap_or(line)
.trim();
// Split `<name> (<modifiers>)`. A bullet without parens is a free-form
// optional field of any shape — name only, no modifiers.
let (name, modifiers) = match line.find('(') {
Some(open) => {
let name = line[..open].trim().to_string();
let after = &line[open + 1..];
let mods = match after.rfind(')') {
Some(close) => &after[..close],
None => after, // tolerate a missing close paren
};
(name, mods.trim())
}
None => (line.to_string(), ""),
};
let mut spec = FieldSpec {
name,
..FieldSpec::default()
};
if modifiers.is_empty() {
return spec;
}
// Modifiers are comma-separated. `enum:` is special: because its own value
// list contains commas, it must be last and swallows the remainder.
let raw: Vec<&str> = modifiers.split(',').collect();
let mut i = 0;
while i < raw.len() {
let token = raw[i].trim();
if token.is_empty() {
i += 1;
continue;
}
let lower = token.to_ascii_lowercase();
if lower == "required" {
spec.required = true;
} else if let Some(shape) = shape_from_str(&lower) {
spec.shape = Some(shape);
} else if let Some(rest) = lower.strip_prefix("link to ") {
// The trailing slash is required in the source; store the prefix
// without it so `Path::starts_with` comparisons are clean.
let prefix = token["link to ".len()..].trim().trim_end_matches('/');
let _ = rest; // lowercase form only used for the keyword match
spec.link_prefix = Some(PathBuf::from(prefix));
} else if let Some(_rest) = lower.strip_prefix("default ") {
// Value is everything after the keyword on this comma-token,
// preserving original case.
let value = token["default ".len()..].trim().to_string();
spec.default = Some(Value::String(value));
} else if lower.starts_with("enum:") || lower == "enum" {
// Rejoin this token and every remaining token to recover the full
// comma-separated value list.
let mut joined = raw[i..].join(",");
// Drop the leading `enum:` keyword (case-insensitive).
if let Some(colon) = joined.find(':') {
joined = joined[colon + 1..].to_string();
}
let values: Vec<String> = joined
.split(',')
.map(|v| v.trim().to_string())
.filter(|v| !v.is_empty())
.collect();
spec.enum_values = Some(values);
break; // enum consumed the rest of the line
} else {
// Unrecognized modifier — captured verbatim, surfaced as Info.
spec.unknown_modifiers.push(token.to_string());
}
i += 1;
}
spec
}
// ── Private helpers ─────────────────────────────────────────────────────────
/// Parse a frontmatter timestamp value into a `DateTime<FixedOffset>`. A `null`
/// is treated as absent; anything else must be an RFC3339 string.
fn parse_timestamp(
value: &Value,
key: &str,
file: &Path,
) -> Result<Option<DateTime<FixedOffset>>, ParseError> {
match value {
Value::Null => Ok(None),
Value::String(s) => parse_rfc3339(s, key, file).map(Some),
other => Err(ParseError::BadTimestamp {
file: file.to_path_buf(),
key: key.to_string(),
value: format!("{other:?}"),
}),
}
}
/// Parse an RFC3339 timestamp string, mapping failure to [`ParseError::BadTimestamp`].
fn parse_rfc3339(s: &str, key: &str, file: &Path) -> Result<DateTime<FixedOffset>, ParseError> {
DateTime::parse_from_rfc3339(s.trim()).map_err(|_| ParseError::BadTimestamp {
file: file.to_path_buf(),
key: key.to_string(),
value: s.to_string(),
})
}
/// Read a `tags` value into a flat `Vec<String>`. Accepts a sequence of scalars
/// (the canonical form) or a single scalar (coerced to a one-element list).
fn parse_tags(value: &Value) -> Vec<String> {
match value {
Value::Sequence(items) => items
.iter()
.filter_map(|v| match v {
Value::String(s) => Some(s.clone()),
Value::Number(n) => Some(n.to_string()),
Value::Bool(b) => Some(b.to_string()),
_ => None,
})
.collect(),
Value::String(s) => vec![s.clone()],
_ => Vec::new(),
}
}
/// Parse a single `[[target|display]]` string into a [`WikiLink`] with no
/// location, or `None` if the string is not a bare wiki-link. Used for
/// frontmatter-valued links where there is no body position to report.
fn parse_wiki_link_str(s: &str) -> Option<WikiLink> {
let s = s.trim();
let inner = s.strip_prefix("[[")?.strip_suffix("]]")?;
// Reject anything with further brackets (e.g. the nested flow-form item),
// which is not a clean single wiki-link.
if inner.contains('[') || inner.contains(']') {
return None;
}
let (target, display) = match inner.split_once('|') {
Some((t, d)) => (t.to_string(), Some(d.to_string())),
None => (inner.to_string(), None),
};
Some(WikiLink {
is_full_path: target_is_full_path(&target),
has_md_extension: target_has_md_extension(&target),
target,
display,
location: (PathBuf::new(), 0, 0),
})
}
/// Extract every wiki-link from a single frontmatter field value, accepting the
/// two canonical forms the spec defines (SPEC § Linking):
///
/// - a **scalar** wiki-link field, in either the quoted (`f: "[[x]]"`) or the
/// canonical unquoted inline (`f: [[x]]`) form, and
/// - a **list** field whose items are quoted wiki-link strings
/// (`- "[[x]]"`).
///
/// YAML eats the brackets of an unquoted `[[x]]`, leaving a flow-list-in-a-list,
/// so the parsed [`Value`] shapes are not what one would naively expect:
///
/// | source | parsed `Value` | here |
/// |--------------------------------|------------------------------------|------|
/// | `f: "[[x]]"` (quoted) | `String("[[x]]")` | link |
/// | `f: [[x]]` (unquoted) | `Seq[ Seq[String("x")] ]` | link |
/// | `f:`\n` - "[[x]]"`(quoted) | `Seq[ String("[[x]]"), … ]` | link |
/// | `f:`\n` - [[x]]` (unquoted) | `Seq[ Seq[Seq[String("x")]], … ]` | — |
///
/// The last row — an *unquoted list* — parses identically to the flow-form list
/// `f: [[a], [b]]` and is a mis-encoding the canonical writer never emits;
/// `dbmd validate` reports it as `WIKI_LINK_FLOW_FORM_LIST` (see
/// [`detect_flow_form_link_lists`]). It is deliberately NOT surfaced here, so an
/// edge enumerator only ever sees the valid canonical forms.
///
/// The unquoted scalar (`Seq[Seq[String]]`, one element) is told apart from a
/// plain one-item flow list (`f: [x]` → `Seq[String]`, one fewer nesting level)
/// by [`unquoted_inline_link`] requiring its argument to be a `Sequence`.
fn links_in_field_value(value: &Value) -> Vec<WikiLink> {
// Quoted scalar: `field: "[[x]]"`.
if let Value::String(s) = value {
return parse_wiki_link_str(s).into_iter().collect();
}
let Value::Sequence(items) = value else {
return Vec::new();
};
// Unquoted scalar inline form `field: [[x]]` → `Seq[ Seq[String(x)] ]`.
// (A quoted single-item list `["[[x]]"]` is `Seq[String]`, so its lone item
// is a `String`, not a `Sequence`, and falls through to the list path below.)
if items.len() == 1 {
if let Some(link) = unquoted_inline_link(&items[0]) {
return vec![link];
}
}
// Otherwise a list of quoted wiki-link strings; non-string items (the
// unquoted-list mis-encoding) are left for validate to flag.
items
.iter()
.filter_map(|item| parse_wiki_link_str(item.as_str()?))
.collect()
}
/// Canonicalize one `extra` frontmatter value for emission by [`Frontmatter::to_yaml`].
///
/// The read path ([`Frontmatter::parse`]) stores every unknown key's raw parsed
/// [`Value`] verbatim, so a SPEC-canonical *unquoted* inline scalar wiki-link
/// (`company: [[records/companies/northstar]]`) lands in `extra` as the nested
/// shape YAML produces for it — `Seq[ Seq[String("records/companies/northstar")] ]`.
/// Re-emitting that verbatim yields the block sequence
///
/// ```text
/// company:
/// - - records/companies/northstar
/// ```
///
/// which has lost the `[[ ]]` brackets entirely: the link is destroyed, and every
/// reader (validate, graph, backlinks) stops seeing the edge. This normalizes such
/// a value back into the canonical emitted form before it is written:
///
/// - a **scalar** wiki-link (quoted `String("[[x]]")` or unquoted `Seq[Seq[String]]`,
/// one element) → a quoted scalar `Value::String("[[x]]")`, which serde_yml emits
/// inline as `'[[x]]'` — the form the finding confirms survives a round-trip and
/// that [`links_in_field_value`] reads back as the same scalar link;
/// - a **list** of wiki-links (in any spelling [`links_in_field_value`] accepts) →
/// a block `Value::Sequence` of quoted-link strings (`- "[[x]]"`), matching the
/// `set` write-in path and the canonical list form;
/// - everything else → returned verbatim (the common no-op for non-link values).
///
/// `|display` is preserved in both link branches. This is the single point that
/// keeps all three curator-loop writers (`format`, `fm set`, `link`) from
/// corrupting a pre-existing canonical link, since they all funnel through
/// `to_yaml`.
fn canonicalize_extra_value(value: &Value) -> Value {
match value {
// Scalar wiki-link, quoted form: `field: "[[x]]"` → `String("[[x]]")`.
// Re-emit as a quoted scalar so it stays a string (never the brackets-as-
// YAML nested sequence). Non-link strings are returned untouched.
Value::String(s) => match parse_wiki_link_str(s) {
Some(link) => Value::String(wiki_link_literal(&link)),
None => value.clone(),
},
Value::Sequence(items) => {
// Scalar wiki-link, unquoted inline form: `field: [[x]]` parses to a
// one-element `Seq[ Seq[String(x)] ]`. Collapse back to the quoted
// scalar string so the link is preserved rather than block-emitted.
if items.len() == 1 {
if let Some(link) = unquoted_inline_link(&items[0]) {
return Value::String(wiki_link_literal(&link));
}
}
// List of wiki-links: re-emit as a block sequence of quoted-link
// strings, the canonical list form `to_yaml` renders block-style and
// `links_in_field_value` accepts. Only canonicalize when *every* item
// is a clean single wiki-link; a list with any non-link item is left
// verbatim so unrelated sequences (and the unquoted-list mis-encoding
// validate flags) are untouched.
let mut links = Vec::with_capacity(items.len());
for item in items {
match link_from_flow_list_item(item) {
Some(link) => links.push(link),
None => return value.clone(),
}
}
if links.is_empty() {
return value.clone();
}
Value::Sequence(
links
.iter()
.map(|l| Value::String(wiki_link_literal(l)))
.collect(),
)
}
// Mappings, scalars other than strings, nulls: nothing to canonicalize.
_ => value.clone(),
}
}
/// Render a [`WikiLink`] back to its `[[target]]` / `[[target|display]]` literal,
/// the inner form the canonical writer emits and `links_in_field_value` accepts.
fn wiki_link_literal(link: &WikiLink) -> String {
match &link.display {
Some(d) => format!("[[{}|{}]]", link.target, d),
None => format!("[[{}]]", link.target),
}
}
/// Recognize the inner token of an unquoted scalar `[[x]]`: after YAML strips the
/// outer brackets, the inner `[x]` is a single-element sequence `Seq[String(x)]`.
/// Reconstructs `[[x]]` (preserving any `|display`) and parses it, or returns
/// `None` when `v` is not that shape. Requiring a `Sequence` here is what keeps a
/// plain one-item flow list (`field: [x]` → `Seq[String]`, not `Seq[Seq[String]]`)
/// from being mistaken for a wiki-link.
fn unquoted_inline_link(v: &Value) -> Option<WikiLink> {
let Value::Sequence(items) = v else {
return None;
};
if items.len() != 1 {
return None;
}
let s = items[0].as_str()?;
// A clean unquoted wiki-link has no further brackets inside it.
if s.contains('[') || s.contains(']') {
return None;
}
parse_wiki_link_str(&format!("[[{s}]]"))
}
/// Decide whether a `dbmd fm set` / `--fm` value string is a **list of
/// wiki-links** that should be stored as a YAML block sequence, returning the
/// canonical `Value::Sequence` of quoted-link strings when so.
///
/// The value path of every write surface stringifies its argument; without this
/// a required list-of-links field (`meeting.attendees`) was unwritable in valid
/// form — passing `[[[a]], [[b]]]` stored a single scalar string that mis-parses
/// and trips `WIKI_LINK_FLOW_FORM_LIST` / `WIKI_LINK_BROKEN`. This recognizes the
/// two list spellings an agent naturally types and normalizes both to the block
/// form the canonical writer emits and `dbmd validate` accepts:
///
/// - flow list of quoted links — `["[[a]]", "[[b]]"]`
/// - flow list of unquoted links — `[[[a]], [[b]]]` (YAML: `Seq[Seq[String], …]`)
///
/// Returns `None` (⇒ caller stores a verbatim scalar string) for everything that
/// is not unambiguously a list of clean wiki-links — plain text, a single inline
/// `[[x]]` (YAML reads it as a one-item `Seq[Seq[String]]`, kept scalar so it
/// renders inline), an empty list, or a list with any non-link item. A single
/// link must stay scalar; only genuine multi-item-or-explicit lists become
/// sequences, matching `links_in_field_value`'s acceptance rule so writer and
/// validator never disagree.
fn parse_link_list_value(value: &str) -> Option<Value> {
let trimmed = value.trim();
// Only a YAML *flow sequence* literal is a list candidate; anything not
// wrapped in `[ … ]` is a scalar (a bare `[[x]]` is wrapped, and handled by
// the single-inline-link guard below).
if !(trimmed.starts_with('[') && trimmed.ends_with(']')) {
return None;
}
let Ok(Value::Sequence(items)) = serde_yml::from_str::<Value>(trimmed) else {
return None;
};
// A single inline `[[x]]` parses to `Seq[ Seq[String(x)] ]` (one item, itself
// a sequence) — that is the unquoted *scalar* form, not a list. Keep it scalar
// so it round-trips to the inline `field: [[x]]` rather than a one-item block
// list. `links_in_field_value` reads it back as a scalar link either way.
if items.len() == 1 && unquoted_inline_link(&items[0]).is_some() {
return None;
}
// Every item must resolve to exactly one clean wiki-link, in any of the flow
// spellings an agent types (see [`link_from_flow_list_item`]).
let mut links = Vec::with_capacity(items.len());
for item in &items {
links.push(link_from_flow_list_item(item)?);
}
if links.is_empty() {
return None;
}
// Normalize to a block sequence of quoted-link strings — the form `to_yaml`
// renders block-style and `links_in_field_value` accepts. `|display` is
// preserved.
let normalized = links
.iter()
.map(|l| Value::String(wiki_link_literal(l)))
.collect();
Some(Value::Sequence(normalized))
}
/// Recognize one clean wiki-link from a single **item** of a YAML flow sequence,
/// across the spellings an agent types for a list. After top-level flow parsing,
/// a list item arrives in one of:
///
/// - quoted — `"[[x]]"` ⇒ `String("[[x]]")`
/// - unquoted in a flow list — `[[x]]` inside `[…]` ⇒ `Seq[ Seq[String(x)] ]`
/// (one level deeper than a bare unquoted scalar, because the surrounding list
/// adds a wrapper); unwrap the single-element wrapper, then read the inline
/// `Seq[String(x)]` with [`unquoted_inline_link`].
///
/// Returns `None` for any item that is not exactly one clean wiki-link, so the
/// caller falls back to a scalar string and never fabricates a partial list.
fn link_from_flow_list_item(item: &Value) -> Option<WikiLink> {
match item {
Value::String(s) => parse_wiki_link_str(s),
Value::Sequence(inner) => {
// Unquoted list item `[[x]]` → `Seq[ Seq[String(x)] ]`: peel the lone
// wrapper to expose the inline-link shape.
if inner.len() == 1 {
if let Some(link) = unquoted_inline_link(&inner[0]) {
return Some(link);
}
}
// Defensive: also accept the inline-link shape directly.
unquoted_inline_link(item)
}
_ => None,
}
}
/// A target is a full store-relative path when its first path segment is one of
/// the three canonical layer dirs and at least one `/` separator follows. A
/// trailing `.md` does not affect this classification.
fn target_is_full_path(target: &str) -> bool {
let target = target.trim();
match target.split_once('/') {
Some((head, _rest)) => LAYER_DIRS.contains(&head),
None => false,
}
}
/// True when the target carries a trailing `.md` extension (validate warns
/// `WIKI_LINK_HAS_EXTENSION`).
fn target_has_md_extension(target: &str) -> bool {
target.trim().ends_with(".md")
}
/// 1-based character (Unicode scalar) column of `byte_offset` within `line`.
fn char_column(line: &str, byte_offset: usize) -> u32 {
(line[..byte_offset].chars().count() as u32) + 1
}
/// Map a lowercase shape keyword to its [`Shape`].
fn shape_from_str(s: &str) -> Option<Shape> {
match s {
"string" => Some(Shape::String),
"int" => Some(Shape::Int),
"bool" => Some(Shape::Bool),
"date" => Some(Shape::Date),
"email" => Some(Shape::Email),
"currency" => Some(Shape::Currency),
"url" => Some(Shape::Url),
_ => None,
}
}
/// The ATX heading level of a line (number of leading `#`), or 0 if not a
/// heading. Up to three leading spaces (CommonMark), requires a space/tab (or
/// end-of-line) after the `#` run, caps the run at six.
fn heading_level(line: &str) -> u8 {
let indent = line.len() - line.trim_start_matches(' ').len();
if indent > 3 {
return 0;
}
let rest = &line[indent..];
let hashes = rest.len() - rest.trim_start_matches('#').len();
if hashes == 0 || hashes > 6 {
return 0;
}
let after = &rest[hashes..];
if after.is_empty() || after.starts_with(' ') || after.starts_with('\t') {
hashes as u8
} else {
0
}
}
/// The heading text after the `#` run, trimmed, with any trailing ATX closing
/// `#` sequence removed (`## Title ##` → `Title`).
fn heading_text(line: &str, level: u8) -> String {
let indent = line.len() - line.trim_start_matches(' ').len();
let after_hashes = &line[indent + level as usize..];
let trimmed = after_hashes.trim();
let no_trailing = trimmed.trim_end_matches('#');
if no_trailing.len() == trimmed.len() {
trimmed.to_string()
} else {
no_trailing.trim_end().to_string()
}
}
/// If `line` opens a fenced code block, return `(fence byte, run length)`.
fn opening_fence(line: &str) -> Option<(u8, usize)> {
let indent = line.len() - line.trim_start_matches(' ').len();
if indent > 3 {
return None;
}
let rest = &line[indent..];
let byte = rest.bytes().next()?;
if byte != b'`' && byte != b'~' {
return None;
}
let run = rest.len() - rest.trim_start_matches(byte as char).len();
if run < 3 {
return None;
}
// A backtick fence's info string may not itself contain a backtick.
if byte == b'`' && rest[run..].contains('`') {
return None;
}
Some((byte, run))
}
/// True if `line` closes the currently open fence: same char, run at least as
/// long, nothing but trailing whitespace after.
fn is_closing_fence(line: &str, fence: (u8, usize)) -> bool {
let (byte, open_len) = fence;
let indent = line.len() - line.trim_start_matches(' ').len();
if indent > 3 {
return false;
}
let rest = &line[indent..];
let run = rest.len() - rest.trim_start_matches(byte as char).len();
if run < open_len {
return false;
}
rest[run..].trim().is_empty()
}
/// The prose body of a section: everything after the heading line, trimmed.
fn section_prose(section_body: &str) -> String {
match section_body.split_once('\n') {
Some((_heading, rest)) => rest.trim().to_string(),
None => String::new(),
}
}
/// The bullet lines (`-`/`*`/`+`) of a section body, excluding the heading
/// line, each returned with its leading whitespace trimmed.
fn bullet_lines(section_body: &str) -> Vec<String> {
section_body
.lines()
.skip(1) // the heading line
.map(str::trim)
.filter(|l| l.starts_with("- ") || l.starts_with("* ") || l.starts_with("+ "))
.map(|l| l.to_string())
.collect()
}
/// Cut a bullet's content at the first ` — ` / ` -- ` comment separator,
/// returning only the meaningful prefix.
fn strip_bullet_comment(content: &str) -> &str {
let mut cut = content.len();
for sep in [" — ", " -- ", " – "] {
if let Some(idx) = content.find(sep) {
cut = cut.min(idx);
}
}
content[..cut].trim()
}
/// Strip the leading bullet marker, returning the trimmed content after it.
fn bullet_content(bullet: &str) -> &str {
let t = bullet.trim();
t.strip_prefix("- ")
.or_else(|| t.strip_prefix("* "))
.or_else(|| t.strip_prefix("+ "))
.unwrap_or(t)
.trim()
}
/// Extract a store-relative path from a Frozen-pages bullet. The path may be
/// wrapped in backticks and followed by an em-dash comment.
fn extract_path_bullet(bullet: &str) -> String {
let content = bullet_content(bullet);
// Prefer a backtick-delimited span if present.
if let Some(start) = content.find('`') {
if let Some(end_rel) = content[start + 1..].find('`') {
return content[start + 1..start + 1 + end_rel].trim().to_string();
}
}
// Otherwise take the text up to a comment separator, stripping quotes.
strip_bullet_comment(content)
.trim_matches('"')
.trim_matches('\'')
.trim()
.to_string()
}
/// Extract a comma-separated type list from an Ignored-types bullet, stripping
/// backticks/quotes and any trailing em-dash comment.
fn extract_type_list_bullet(bullet: &str) -> Vec<String> {
let content = strip_bullet_comment(bullet_content(bullet));
content
.split(',')
.map(|t| {
t.trim()
.trim_matches('`')
.trim_matches('"')
.trim_matches('\'')
.trim()
.to_string()
})
.filter(|t| !t.is_empty())
.collect()
}
#[cfg(test)]
mod tests {
use super::*;
use std::path::Path;
use tempfile::tempdir;
// ── Config::frozen_match (the single write-surface policy matcher) ───────
#[test]
fn frozen_match_is_md_insensitive_both_directions() {
// A policy entry stored WITHOUT `.md` (the natural extensionless
// spelling `parse_db_md` keeps verbatim) must still match a `.md`
// write target — the regression every write surface had.
let cfg = Config {
frozen_pages: vec![PathBuf::from("records/decisions/q1")],
..Config::default()
};
assert_eq!(
cfg.frozen_match(Path::new("records/decisions/q1.md")),
Some(PathBuf::from("records/decisions/q1")),
"extensionless policy entry must freeze the .md file"
);
assert!(cfg.is_frozen(Path::new("records/decisions/q1.md")));
// The symmetric case: a policy entry WITH `.md` matches a bare target.
let cfg = Config {
frozen_pages: vec![PathBuf::from("records/decisions/q1.md")],
..Config::default()
};
assert_eq!(
cfg.frozen_match(Path::new("records/decisions/q1")),
Some(PathBuf::from("records/decisions/q1.md")),
);
// And the same-spelling cases still match.
assert!(cfg.is_frozen(Path::new("records/decisions/q1.md")));
}
#[test]
fn frozen_match_drops_leading_dot_slash() {
let cfg = Config {
frozen_pages: vec![PathBuf::from("records/decisions/q1.md")],
..Config::default()
};
assert!(cfg.is_frozen(Path::new("./records/decisions/q1.md")));
assert!(cfg.is_frozen(Path::new("./records/decisions/q1")));
}
#[test]
fn frozen_match_returns_none_for_unlisted_and_prefix_paths() {
let cfg = Config {
frozen_pages: vec![PathBuf::from("records/decisions/q1")],
..Config::default()
};
assert!(cfg
.frozen_match(Path::new("records/decisions/q2.md"))
.is_none());
// A prefix is not a match: `q1` must not freeze `q1-draft`.
assert!(cfg
.frozen_match(Path::new("records/decisions/q1-draft.md"))
.is_none());
assert!(!cfg.is_frozen(Path::new("records/decisions/q11.md")));
}
// ── split_frontmatter ───────────────────────────────────────────────────
#[test]
fn split_frontmatter_separates_yaml_and_verbatim_body() {
let text = "---\ntype: contact\nsummary: x\n---\n# Heading\n\nBody line.\n";
let p = split_frontmatter(text, Path::new("f.md")).unwrap();
assert_eq!(p.frontmatter_yaml, "type: contact\nsummary: x\n");
// Body is everything after the closing fence's newline, byte-for-byte.
assert_eq!(p.body, "# Heading\n\nBody line.\n");
}
#[test]
fn split_frontmatter_preserves_body_without_trailing_newline() {
let text = "---\ntype: x\n---\nno trailing newline";
let p = split_frontmatter(text, Path::new("f.md")).unwrap();
assert_eq!(p.body, "no trailing newline");
}
#[test]
fn split_frontmatter_empty_body_when_nothing_after_fence() {
let text = "---\ntype: x\n---\n";
let p = split_frontmatter(text, Path::new("f.md")).unwrap();
assert_eq!(p.body, "");
}
#[test]
fn split_frontmatter_missing_opening_fence_errors() {
let text = "# No frontmatter here\ntype: x\n";
let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
}
#[test]
fn split_frontmatter_leading_content_before_fence_rejected() {
// The opening fence must be the very first line; a blank line first is
// not allowed.
let text = "\n---\ntype: x\n---\nbody";
let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
}
#[test]
fn split_frontmatter_unterminated_block_errors() {
let text = "---\ntype: x\nsummary: y\n";
let err = split_frontmatter(text, Path::new("f.md")).unwrap_err();
assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
}
// ── Frontmatter::parse ───────────────────────────────────────────────────
#[test]
fn parse_populates_typed_fields_and_routes_unknowns_to_extra() {
let yaml = "type: contact\nid: sarah-chen\nsummary: Director of Ops\nstatus: active\ntags: [vip, renewal]\nemail: sarah@northstar.io\nrole: Director";
let fm = Frontmatter::parse(yaml, Path::new("f.md")).unwrap();
assert_eq!(fm.type_.as_deref(), Some("contact"));
assert_eq!(fm.id.as_deref(), Some("sarah-chen"));
assert_eq!(fm.summary.as_deref(), Some("Director of Ops"));
assert_eq!(fm.status.as_deref(), Some("active"));
assert_eq!(fm.tags, vec!["vip".to_string(), "renewal".to_string()]);
// Type-specific fields are NOT promoted to typed slots.
assert!(fm.type_.is_some() && !fm.extra.contains_key("type"));
assert!(!fm.extra.contains_key("tags"));
assert_eq!(
fm.extra.get("email").and_then(|v| v.as_str()),
Some("sarah@northstar.io")
);
assert_eq!(
fm.extra.get("role").and_then(|v| v.as_str()),
Some("Director")
);
}
#[test]
fn parse_reads_rfc3339_timestamps() {
let yaml =
"type: email\ncreated: 2026-05-27T08:00:00-07:00\nupdated: 2026-05-28T09:30:00-07:00";
let fm = Frontmatter::parse(yaml, Path::new("f.md")).unwrap();
let created = fm.created.expect("created parsed");
// -07:00 offset is 7 * 3600 seconds west.
assert_eq!(created.offset().utc_minus_local(), 7 * 3600);
assert_eq!(created.to_rfc3339(), "2026-05-27T08:00:00-07:00");
assert!(fm.updated.is_some());
}
#[test]
fn parse_rejects_non_rfc3339_timestamp() {
// A date-only value is not a full RFC3339 timestamp; created/updated
// require the full form.
let yaml = "type: email\ncreated: 2026-05-27";
let err = Frontmatter::parse(yaml, Path::new("bad.md")).unwrap_err();
match err {
ParseError::BadTimestamp { key, value, .. } => {
assert_eq!(key, "created");
assert_eq!(value, "2026-05-27");
}
other => panic!("expected BadTimestamp, got {other:?}"),
}
}
#[test]
fn parse_malformed_yaml_errors() {
// Unclosed flow mapping is invalid YAML.
let yaml = "type: contact\n bad: : :\n- nope";
let err = Frontmatter::parse(yaml, Path::new("bad.md")).unwrap_err();
assert!(matches!(err, ParseError::MalformedYaml { .. }));
}
#[test]
fn parse_empty_block_is_empty_frontmatter() {
let fm = Frontmatter::parse("", Path::new("f.md")).unwrap();
assert_eq!(fm, Frontmatter::default());
}
#[test]
fn parse_scalar_top_level_is_malformed() {
// A bare scalar at the top level is not a frontmatter mapping.
let err = Frontmatter::parse("just a string", Path::new("f.md")).unwrap_err();
assert!(matches!(err, ParseError::MalformedYaml { .. }));
}
// ── to_yaml canonical order ──────────────────────────────────────────────
#[test]
fn to_yaml_emits_canonical_key_order() {
let mut fm = Frontmatter {
type_: Some("contact".into()),
id: Some("sarah-chen".into()),
summary: Some("Director of Ops".into()),
status: Some("active".into()),
tags: vec!["vip".into()],
created: Some(DateTime::parse_from_rfc3339("2026-05-27T08:00:00-07:00").unwrap()),
updated: Some(DateTime::parse_from_rfc3339("2026-05-28T09:30:00-07:00").unwrap()),
..Default::default()
};
// Two type-specific fields, inserted in NON-alphabetical order to prove
// the writer sorts them (BTreeMap) between the universal head and tail.
fm.extra
.insert("role".into(), Value::String("Director".into()));
fm.extra.insert(
"company".into(),
Value::String("[[records/companies/northstar]]".into()),
);
let yaml = fm.to_yaml();
let keys: Vec<&str> = yaml
.lines()
.filter(|l| !l.starts_with(['-', ' ']) && l.contains(':'))
.map(|l| l.split(':').next().unwrap())
.collect();
assert_eq!(
keys,
vec![
"type", "id", "created", "updated", "summary", // universal head
"company", "role", // type-specific, sorted
"status", // universal tail
"tags",
],
"canonical order violated; got:\n{yaml}"
);
// Timestamps round-trip as RFC3339 strings (YAML may quote them).
assert!(
yaml.contains("2026-05-27T08:00:00-07:00"),
"created timestamp missing; got:\n{yaml}"
);
// The value re-parses to the same instant regardless of quoting.
let reparsed = Frontmatter::parse(&yaml, Path::new("rt.md")).unwrap();
assert_eq!(reparsed.created, fm.created);
assert_eq!(reparsed.updated, fm.updated);
}
#[test]
fn to_yaml_omits_absent_optional_fields() {
let fm = Frontmatter {
type_: Some("note".into()),
..Default::default()
};
let yaml = fm.to_yaml();
assert!(yaml.contains("type: note"));
assert!(!yaml.contains("status"));
assert!(!yaml.contains("tags"));
assert!(!yaml.contains("summary"));
}
#[test]
fn to_yaml_preserves_unquoted_scalar_wiki_link_round_trip() {
// Regression (PRIMARY): the SPEC-canonical scalar wiki-link is the
// *unquoted* inline `company: [[records/companies/northstar]]`
// (SPEC § Linking, the worked `contact` example). YAML parses it to the
// nested `Seq[Seq[String]]` shape and `parse` stores that verbatim in
// `extra`. Before the fix, `to_yaml` re-emitted it block-style as
// company:
// - - records/companies/northstar
// — the `[[ ]]` brackets GONE — so a no-op re-emit (`dbmd format`, and
// any `fm set` / `link` write) silently destroyed the link.
let yaml = "type: contact\ncompany: [[records/companies/northstar]]";
let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
// Sanity: it really parsed as the nested sequence, not a string.
assert!(fm.extra.get("company").and_then(|v| v.as_str()).is_none());
let out = fm.to_yaml();
// The link must survive as a quoted inline scalar — brackets intact, and
// never the bracket-less block sequence `- - records/...`.
assert!(
out.contains("[[records/companies/northstar]]"),
"canonical writer dropped the wiki-link brackets; got:\n{out}"
);
assert!(
!out.contains("- - "),
"canonical writer emitted a nested block sequence (link corrupted); got:\n{out}"
);
// And it round-trips: re-parsing the emitted YAML still surfaces exactly
// one link with the right target (the edge graph/backlinks rely on).
let reparsed = Frontmatter::parse(&out, Path::new("c.md")).unwrap();
let fields = reparsed.link_fields();
let links: Vec<(&str, &str, Option<&str>)> = fields
.iter()
.map(|(k, l)| (k.as_str(), l.target.as_str(), l.display.as_deref()))
.collect();
assert_eq!(
links,
vec![("company", "records/companies/northstar", None)]
);
// A second re-emit is a fixed point — no progressive corruption across
// repeated curator-loop writes.
assert_eq!(
reparsed.to_yaml(),
out,
"to_yaml is not idempotent on links"
);
}
#[test]
fn to_yaml_preserves_unquoted_scalar_link_with_display() {
// The `|display` segment must survive the unquoted-inline round-trip too.
let yaml = "type: contact\ncompany: [[records/companies/northstar|Northstar]]";
let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
let out = fm.to_yaml();
assert!(
out.contains("[[records/companies/northstar|Northstar]]"),
"display segment lost on round-trip; got:\n{out}"
);
let reparsed = Frontmatter::parse(&out, Path::new("c.md")).unwrap();
let f = reparsed.link_fields();
assert_eq!(f.len(), 1);
assert_eq!(f[0].1.target, "records/companies/northstar");
assert_eq!(f[0].1.display.as_deref(), Some("Northstar"));
}
#[test]
fn to_yaml_does_not_mangle_link_list_or_plain_nested_sequence() {
// A genuine quoted block list of links round-trips as a clean string
// list — never collapsed to a scalar — and a plain nested sequence that
// is NOT a wiki-link is left exactly as written (no false conversion).
let yaml = "type: meeting\nattendees:\n - \"[[records/contacts/elena]]\"\n - \"[[records/contacts/sarah]]\"\nmatrix:\n - - 1\n - 2";
let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
let out = fm.to_yaml();
// Both attendee links survive as quoted strings.
assert!(out.contains("[[records/contacts/elena]]"), "got:\n{out}");
assert!(out.contains("[[records/contacts/sarah]]"), "got:\n{out}");
let reparsed = Frontmatter::parse(&out, Path::new("m.md")).unwrap();
let fields = reparsed.link_fields();
let attendees: Vec<&str> = fields
.iter()
.filter(|(k, _)| k == "attendees")
.map(|(_, l)| l.target.as_str())
.collect();
assert_eq!(
attendees,
vec!["records/contacts/elena", "records/contacts/sarah"]
);
// The non-link nested sequence is preserved verbatim, not touched.
assert_eq!(reparsed.extra.get("matrix"), fm.extra.get("matrix"));
}
// ── read_file / write_file round-trip ────────────────────────────────────
#[test]
fn write_then_read_roundtrips_and_preserves_body_verbatim() {
let dir = tempdir().unwrap();
let path = dir.path().join("sources/emails/x.md");
let body = "# Subject\n\nHello,\n\nSee [[records/contacts/sarah-chen]].\n";
let mut fm = Frontmatter {
type_: Some("email".into()),
summary: Some("renewal note".into()),
created: Some(DateTime::parse_from_rfc3339("2026-05-27T08:00:00-07:00").unwrap()),
..Default::default()
};
fm.extra
.insert("from".into(), Value::String("elena@northstar.io".into()));
write_file(&path, &fm, body).unwrap();
let (read_fm, read_body) = read_file(&path).unwrap();
assert_eq!(read_body, body, "body must be preserved byte-for-byte");
assert_eq!(read_fm.type_.as_deref(), Some("email"));
assert_eq!(read_fm.summary.as_deref(), Some("renewal note"));
assert_eq!(
read_fm.extra.get("from").and_then(|v| v.as_str()),
Some("elena@northstar.io")
);
// The on-disk file starts with a fence and ends with the verbatim body.
let raw = std::fs::read_to_string(&path).unwrap();
assert!(raw.starts_with("---\n"));
assert!(raw.ends_with(body));
}
#[test]
fn roundtrip_modify_summary_then_write_changes_only_summary() {
let dir = tempdir().unwrap();
let path = dir.path().join("records/contacts/sarah.md");
let body = "Long-form operator notes about Sarah.\n";
let fm = Frontmatter {
type_: Some("contact".into()),
summary: Some("old summary".into()),
..Default::default()
};
write_file(&path, &fm, body).unwrap();
// Read → modify summary → write back.
let (mut fm2, body2) = read_file(&path).unwrap();
fm2.summary = Some("new summary".into());
write_file(&path, &fm2, &body2).unwrap();
let (fm3, body3) = read_file(&path).unwrap();
assert_eq!(fm3.summary.as_deref(), Some("new summary"));
assert_eq!(fm3.type_.as_deref(), Some("contact"));
assert_eq!(body3, body, "body unchanged across the round-trip");
}
#[test]
fn roundtrip_preserves_handwritten_unquoted_scalar_wiki_link_on_disk() {
// End-to-end analog of `dbmd format` on the verbatim SPEC worked example:
// a hand-written file carrying the canonical UNQUOTED scalar link
// `company: [[records/companies/northstar]]`, read from disk then written
// back unchanged. Before the fix this no-op re-emit rewrote the on-disk
// value to the bracket-less block sequence `company:\n- - records/...`,
// and every reader (validate/graph/backlinks) then lost the edge.
let dir = tempdir().unwrap();
let path = dir.path().join("records/contacts/sarah-chen.md");
let file = "---\ntype: contact\nid: sarah-chen\nsummary: Director of Ops\ncompany: [[records/companies/northstar]]\n---\n# Sarah Chen\n\nNotes.\n";
std::fs::create_dir_all(path.parent().unwrap()).unwrap();
std::fs::write(&path, file).unwrap();
// Read → write back unchanged (the canonical no-op re-emit).
let (fm, body) = read_file(&path).unwrap();
write_file(&path, &fm, &body).unwrap();
// On-disk bytes still carry the bracketed link, never `- - records/...`.
let raw = std::fs::read_to_string(&path).unwrap();
assert!(
raw.contains("[[records/companies/northstar]]"),
"on-disk wiki-link brackets were destroyed; got:\n{raw}"
);
assert!(
!raw.contains("- - "),
"on-disk value became a nested block sequence; got:\n{raw}"
);
// And the edge is still readable after the round-trip.
let (fm2, _) = read_file(&path).unwrap();
let fields = fm2.link_fields();
let links: Vec<(&str, &str)> = fields
.iter()
.map(|(k, l)| (k.as_str(), l.target.as_str()))
.collect();
assert_eq!(links, vec![("company", "records/companies/northstar")]);
}
#[test]
fn write_file_does_not_leave_temp_files_behind() {
let dir = tempdir().unwrap();
let path = dir.path().join("records/x.md");
let fm = Frontmatter {
type_: Some("note".into()),
..Default::default()
};
write_file(&path, &fm, "body\n").unwrap();
// The directory should contain only the target file, no `.x.md.tmp.*`.
let entries: Vec<String> = std::fs::read_dir(path.parent().unwrap())
.unwrap()
.map(|e| e.unwrap().file_name().to_string_lossy().into_owned())
.collect();
assert_eq!(entries, vec!["x.md".to_string()]);
}
// ── is_content_file ──────────────────────────────────────────────────────
#[test]
fn is_content_file_recognizes_layers_and_excludes_meta() {
assert!(Frontmatter::is_content_file(Path::new(
"sources/emails/2026-05-22.md"
)));
assert!(Frontmatter::is_content_file(Path::new(
"records/contacts/sarah-chen.md"
)));
assert!(Frontmatter::is_content_file(Path::new(
"wiki/people/sarah-chen.md"
)));
// Absolute paths under a layer are still content.
assert!(Frontmatter::is_content_file(Path::new(
"/home/db/records/companies/northstar.md"
)));
// index.md at any level is meta.
assert!(!Frontmatter::is_content_file(Path::new(
"records/contacts/index.md"
)));
assert!(!Frontmatter::is_content_file(Path::new("index.md")));
// Root meta files.
assert!(!Frontmatter::is_content_file(Path::new("DB.md")));
assert!(!Frontmatter::is_content_file(Path::new("log.md")));
}
// ── effective_id ─────────────────────────────────────────────────────────
#[test]
fn effective_id_prefers_explicit_then_derives_from_path() {
let with_id = Frontmatter {
id: Some("explicit-id".into()),
..Default::default()
};
assert_eq!(
with_id.effective_id(Path::new("wiki/people/sarah-chen.md")),
"explicit-id"
);
let no_id = Frontmatter::default();
assert_eq!(
no_id.effective_id(Path::new("wiki/people/sarah-chen.md")),
"sarah-chen"
);
}
// ── get / set ────────────────────────────────────────────────────────────
#[test]
fn set_routes_universal_and_custom_keys() {
let mut fm = Frontmatter::default();
fm.set("type", "contact").unwrap();
fm.set("summary", "hi").unwrap();
fm.set("company", "[[records/companies/northstar]]")
.unwrap();
assert_eq!(fm.type_.as_deref(), Some("contact"));
assert_eq!(fm.summary.as_deref(), Some("hi"));
// Custom key landed in extra, not a typed slot.
assert_eq!(
fm.extra.get("company").and_then(|v| v.as_str()),
Some("[[records/companies/northstar]]")
);
// get reads from both typed fields and extra.
assert_eq!(
fm.get("type").and_then(|v| v.as_str().map(String::from)),
Some("contact".into())
);
assert_eq!(
fm.get("company").and_then(|v| v.as_str().map(String::from)),
Some("[[records/companies/northstar]]".into())
);
assert!(fm.get("nonexistent").is_none());
}
#[test]
fn set_timestamp_validates_rfc3339() {
let mut fm = Frontmatter::default();
fm.set("created", "2026-05-27T08:00:00-07:00").unwrap();
assert!(fm.created.is_some());
let err = fm.set("updated", "not-a-date").unwrap_err();
assert!(matches!(err, ParseError::BadTimestamp { .. }));
}
// ── extract_wiki_links ───────────────────────────────────────────────────
#[test]
fn extract_wiki_links_flags_full_path_short_form_and_extension() {
let body = "See [[records/contacts/sarah-chen]] and [[sarah-chen]].\nAlso [[wiki/people/sarah-chen.md|Sarah]].\n";
let links = extract_wiki_links(body, Path::new("doc.md"));
assert_eq!(links.len(), 3);
// Full path, no extension, no display.
assert_eq!(links[0].target, "records/contacts/sarah-chen");
assert!(links[0].is_full_path);
assert!(!links[0].has_md_extension);
assert_eq!(links[0].display, None);
assert_eq!(links[0].location.1, 1, "first link on line 1");
// Short form: not a full path.
assert_eq!(links[1].target, "sarah-chen");
assert!(!links[1].is_full_path, "bare target is short-form");
// Full path WITH .md extension and a display override on line 2.
assert_eq!(links[2].target, "wiki/people/sarah-chen.md");
assert!(links[2].is_full_path);
assert!(links[2].has_md_extension);
assert_eq!(links[2].display.as_deref(), Some("Sarah"));
assert_eq!(links[2].location.1, 2);
}
#[test]
fn extract_wiki_links_reports_1_based_column_counting_chars() {
// A multi-byte prefix (é is 2 bytes) must not skew the char column.
let body = "café [[records/x/y]]";
let links = extract_wiki_links(body, Path::new("d.md"));
assert_eq!(links.len(), 1);
// "café " is 5 chars, so the `[[` starts at char column 6 (1-based).
assert_eq!(links[0].location.2, 6);
}
#[test]
fn extract_wiki_links_ignores_a_lone_path_without_brackets() {
let links = extract_wiki_links(
"records/contacts/sarah-chen is not a link",
Path::new("d.md"),
);
assert!(links.is_empty());
}
// ── extract_markdown_links ───────────────────────────────────────────────
#[test]
fn extract_markdown_links_captures_external_and_not_wiki_links() {
let body =
"See [the thread](https://x.com/a) and [[records/contacts/sarah-chen]] internally.\n";
let md = extract_markdown_links(body, Path::new("d.md"));
assert_eq!(
md.len(),
1,
"wiki-link must not be captured as a markdown link"
);
assert_eq!(md[0].text, "the thread");
assert_eq!(md[0].url, "https://x.com/a");
assert_eq!(md[0].location.1, 1);
// And the wiki-link extractor must not pick up the markdown link.
let wl = extract_wiki_links(body, Path::new("d.md"));
assert_eq!(wl.len(), 1);
assert_eq!(wl[0].target, "records/contacts/sarah-chen");
}
// ── link_fields ──────────────────────────────────────────────────────────
#[test]
fn link_fields_extracts_scalar_list_and_summary_links() {
// The canonical list form quotes each item so YAML parses it as clean
// strings; a scalar field may be quoted OR written in the canonical
// unquoted inline form `company: [[x]]` (SPEC § Linking).
let yaml = "type: meeting\nsummary: with [[records/contacts/elena]]\ncompany: \"[[records/companies/northstar]]\"\nattendees:\n - \"[[records/contacts/elena]]\"\n - \"[[records/contacts/sarah]]\"\nnotes: just plain text";
let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
// Sanity: company really did parse as a scalar string here.
assert!(fm.extra.get("company").and_then(|v| v.as_str()).is_some());
let fields = fm.link_fields();
// company (scalar) once, with the right target.
let company: Vec<&str> = fields
.iter()
.filter(|(k, _)| k == "company")
.map(|(_, l)| l.target.as_str())
.collect();
assert_eq!(company, vec!["records/companies/northstar"]);
// attendees (block list) twice.
let attendees: Vec<&str> = fields
.iter()
.filter(|(k, _)| k == "attendees")
.map(|(_, l)| l.target.as_str())
.collect();
assert_eq!(
attendees,
vec!["records/contacts/elena", "records/contacts/sarah"]
);
// summary link surfaced.
assert_eq!(fields.iter().filter(|(k, _)| k == "summary").count(), 1);
// Plain-text field is not a link.
assert_eq!(fields.iter().filter(|(k, _)| k == "notes").count(), 0);
}
#[test]
fn link_fields_surfaces_canonical_unquoted_scalar_link() {
// Regression: the canonical scalar wiki-link form is the *unquoted*
// inline `company: [[records/companies/northstar]]` (SPEC § Linking).
// YAML parses `[[x]]` as a flow-list-in-a-list (`Seq[Seq[String]]`), so
// a naive `as_str()`-only walk drops it. link_fields() must still
// surface exactly one link with the correct target.
let yaml = "type: meeting\ncompany: [[records/companies/northstar]]";
let fm = Frontmatter::parse(yaml, Path::new("m.md")).unwrap();
// Sanity: it really did parse as the nested sequence form, NOT a string.
assert!(fm.extra.get("company").and_then(|v| v.as_str()).is_none());
let fields = fm.link_fields();
let links: Vec<(&str, &str, Option<&str>)> = fields
.iter()
.map(|(k, l)| (k.as_str(), l.target.as_str(), l.display.as_deref()))
.collect();
assert_eq!(
links,
vec![("company", "records/companies/northstar", None)]
);
// The `|display` segment survives the unquoted inline form too.
let fm2 = Frontmatter::parse(
"type: meeting\ncompany: [[records/companies/northstar|Northstar]]",
Path::new("m.md"),
)
.unwrap();
let f2 = fm2.link_fields();
assert_eq!(f2.len(), 1);
assert_eq!(f2[0].0, "company");
assert_eq!(f2[0].1.target, "records/companies/northstar");
assert_eq!(f2[0].1.display.as_deref(), Some("Northstar"));
}
#[test]
fn link_fields_ignores_plain_one_item_flow_list() {
// A plain one-item flow list `aliases: [foo]` parses to `Seq[String]`
// — one nesting level shallower than an unquoted `[[foo]]` — and must
// NOT be mistaken for a wiki-link.
let yaml = "type: contact\naliases: [foo]";
let fm = Frontmatter::parse(yaml, Path::new("c.md")).unwrap();
assert_eq!(fm.link_fields(), Vec::new());
}
// ── detect_flow_form_link_lists ──────────────────────────────────────────
#[test]
fn detect_flow_form_flags_list_misencodings_not_scalars() {
// The flow-form list mis-encoding (triple-nested) IS flagged; a scalar
// inline wiki-link (double-nested) is NOT.
let bad = "attendees: [[[records/x]], [[records/y]]]\nscalar_inline: [[records/z]]";
let flagged = detect_flow_form_link_lists(bad);
assert_eq!(flagged, vec!["attendees".to_string()]);
// An UNquoted block list is also a mis-encoding (parses triple-nested).
let unquoted_block = "attendees:\n - [[records/x]]\n - [[records/y]]";
assert_eq!(
detect_flow_form_link_lists(unquoted_block),
vec!["attendees".to_string()]
);
// The canonical QUOTED block form parses to clean strings — NOT flagged.
let good = "attendees:\n - \"[[records/x]]\"\n - \"[[records/y]]\"";
assert!(detect_flow_form_link_lists(good).is_empty());
// A plain scalar list of strings is not flagged.
let plain = "tags: [a, b, c]";
assert!(detect_flow_form_link_lists(plain).is_empty());
}
// ── extract_sections ─────────────────────────────────────────────────────
#[test]
fn extract_sections_levels_nesting_and_boundaries() {
let body = "intro text\n## First\nalpha\n### Sub\nbeta\n## Second\ngamma\n";
let secs = extract_sections(body);
let headings: Vec<(&str, u8)> =
secs.iter().map(|s| (s.heading.as_str(), s.level)).collect();
assert_eq!(headings, vec![("First", 2), ("Sub", 3), ("Second", 2)]);
// "First" (H2) body extends through its H3 child, stopping at "Second".
let first = &secs[0];
assert!(first.body.contains("alpha"));
assert!(first.body.contains("### Sub"));
assert!(first.body.contains("beta"));
assert!(!first.body.contains("Second"));
// "Sub" (H3) stops at the next equal-or-shallower heading ("Second").
let sub = &secs[1];
assert!(sub.body.contains("beta"));
assert!(!sub.body.contains("gamma"));
// 1-based line numbers within the body.
assert_eq!(first.line, 2);
assert_eq!(secs[2].line, 6);
}
#[test]
fn extract_sections_ignores_headings_in_fenced_code() {
let body = "## Real\n```\n## Fake heading in code\n```\nafter\n";
let secs = extract_sections(body);
assert_eq!(secs.len(), 1);
assert_eq!(secs[0].heading, "Real");
// The fenced "## Fake" is part of Real's body, not its own section.
assert!(secs[0].body.contains("## Fake heading in code"));
}
// ── parse_field_spec ─────────────────────────────────────────────────────
#[test]
fn parse_field_spec_required_and_shape() {
let f = parse_field_spec("- email (required, email)");
assert_eq!(f.name, "email");
assert!(f.required);
assert_eq!(f.shape, Some(Shape::Email));
assert!(f.unknown_modifiers.is_empty());
}
#[test]
fn parse_field_spec_link_prefix_strips_trailing_slash() {
let f = parse_field_spec("- company (required, link to records/companies/)");
assert!(f.required);
assert_eq!(f.link_prefix, Some(PathBuf::from("records/companies")));
assert_eq!(f.shape, None);
}
#[test]
fn parse_field_spec_default_preserves_case_and_value() {
let f = parse_field_spec("- currency (default USD)");
assert_eq!(f.name, "currency");
assert_eq!(f.default, Some(Value::String("USD".into())));
}
#[test]
fn parse_field_spec_enum_captures_comma_list_as_last_modifier() {
let f = parse_field_spec("- status (required, enum: open, closed, pending)");
assert!(f.required);
assert_eq!(
f.enum_values,
Some(vec![
"open".to_string(),
"closed".to_string(),
"pending".to_string()
])
);
}
#[test]
fn parse_field_spec_unknown_modifier_is_captured_not_errored() {
let f = parse_field_spec("- weird (required, frobnicate, string)");
assert!(f.required);
assert_eq!(f.shape, Some(Shape::String));
assert_eq!(f.unknown_modifiers, vec!["frobnicate".to_string()]);
}
#[test]
fn parse_field_spec_no_parens_is_freeform_optional() {
let f = parse_field_spec("- nickname");
assert_eq!(f.name, "nickname");
assert!(!f.required);
assert_eq!(f.shape, None);
assert!(f.link_prefix.is_none());
assert!(f.enum_values.is_none());
assert!(f.unknown_modifiers.is_empty());
}
// ── parse_db_md ──────────────────────────────────────────────────────────
const CANONICAL_DB_MD: &str = "---\ntype: db-md\nscope: company\nowner: Sarah Chen\n---\n\n# Acme operations knowledge base\n\nCompany-scale institutional memory for Acme.\n\n## Agent instructions\n\nPrioritize creating `contact` records from new-sender emails. Use British English.\n\n## Policies\n\n### Frozen pages\n- `records/decisions/2026-q1-strategy.md` — finalized, do not modify.\n- `wiki/synthesis/2026-annual-plan.md` — signed-off plan.\n\n### Ignored types\n- `test`, `temp` — read but never synthesize.\n\n## Schemas\n\n### contact\n- name (required)\n- email (required, email)\n- company (required, link to records/companies/)\n- role (string)\n\n### expense\n- date (required, date)\n- amount (required)\n- currency (default USD)\n";
#[test]
fn parse_db_md_extracts_all_canonical_sections() {
let config = parse_db_md(CANONICAL_DB_MD, Path::new("DB.md")).unwrap();
// Agent instructions: free-form prose, heading line stripped.
let ai = config
.agent_instructions
.expect("agent instructions present");
assert!(ai.starts_with("Prioritize creating"));
assert!(!ai.contains("## Agent instructions"));
// Frozen pages: paths extracted from backticked bullets, comments dropped.
assert_eq!(
config.frozen_pages,
vec![
PathBuf::from("records/decisions/2026-q1-strategy.md"),
PathBuf::from("wiki/synthesis/2026-annual-plan.md"),
]
);
// Ignored types: comma list, backticks/comment stripped.
assert_eq!(
config.ignored_types,
vec!["test".to_string(), "temp".to_string()]
);
// Schemas: two types, each with its fields in source order.
assert_eq!(config.schemas.len(), 2);
let contact = config.schemas.get("contact").expect("contact schema");
let names: Vec<&str> = contact.fields.iter().map(|f| f.name.as_str()).collect();
assert_eq!(names, vec!["name", "email", "company", "role"]);
assert!(contact.fields[0].required); // name
assert_eq!(contact.fields[1].shape, Some(Shape::Email)); // email
assert_eq!(
contact.fields[2].link_prefix,
Some(PathBuf::from("records/companies"))
); // company
let expense = config.schemas.get("expense").expect("expense schema");
let cur = expense
.fields
.iter()
.find(|f| f.name == "currency")
.unwrap();
assert_eq!(cur.default, Some(Value::String("USD".into())));
}
#[test]
fn parse_db_md_handles_malformed_and_unknown_modifiers() {
// corpus-b shape: a `## Schemas` section with a malformed bullet, an
// unknown modifier, and bullets that appear with NO `### <type>`
// heading (so they belong to no schema and are dropped).
let text = "---\ntype: db-md\n---\n\n## Schemas\n- orphan (required)\n\n### ticket\n- priority (required, mystery, enum: low, high)\n- broken (\n";
let config = parse_db_md(text, Path::new("DB.md")).unwrap();
// The orphan bullet under `## Schemas` with no `### type` heading is not
// captured as a schema.
assert_eq!(config.schemas.len(), 1);
let ticket = config.schemas.get("ticket").expect("ticket schema");
assert_eq!(ticket.fields.len(), 2);
let priority = &ticket.fields[0];
assert!(priority.required);
assert_eq!(priority.unknown_modifiers, vec!["mystery".to_string()]);
assert_eq!(
priority.enum_values,
Some(vec!["low".to_string(), "high".to_string()])
);
// A bullet with an unclosed paren still yields a usable name.
let broken = &ticket.fields[1];
assert_eq!(broken.name, "broken");
}
#[test]
fn parse_db_md_missing_frontmatter_errors() {
let text = "# No frontmatter\n\n## Agent instructions\nhi\n";
let err = parse_db_md(text, Path::new("DB.md")).unwrap_err();
assert!(matches!(err, ParseError::MissingFrontmatter { .. }));
}
#[test]
fn parse_db_md_absent_sections_default_empty() {
let text = "---\ntype: db-md\n---\n\n# Title only\n";
let config = parse_db_md(text, Path::new("DB.md")).unwrap();
assert_eq!(config, Config::default());
}
// ── fm set / --fm list-valued link fields (meeting.attendees & friends) ──
/// `Frontmatter::set` is the value path every write surface (`fm set`,
/// `write --fm`) funnels through. A list-of-wiki-links value (the SPEC's
/// `meeting.attendees` shape) must serialize as a YAML **block sequence** of
/// quoted links — readable back by [`links_in_field_value`] and accepted by
/// `dbmd validate` — never the flow-form scalar string that trips
/// `WIKI_LINK_FLOW_FORM_LIST`. Both the unquoted (`[[[a]], [[b]]]`) and
/// quoted (`["[[a]]", "[[b]]"]`) spellings an agent types must normalize.
#[test]
fn set_list_of_wiki_links_becomes_block_sequence_both_spellings() {
for value in [
"[[[records/contacts/a]], [[records/contacts/b]]]",
r#"["[[records/contacts/a]]", "[[records/contacts/b]]"]"#,
] {
let mut fm = Frontmatter::default();
fm.set("attendees", value).unwrap();
// Stored as a 2-element sequence of clean quoted links.
let stored = fm.extra.get("attendees").expect("attendees set");
let Value::Sequence(items) = stored else {
panic!("attendees must be a Sequence, got {stored:?} for input {value}");
};
assert_eq!(items.len(), 2, "input {value}");
assert_eq!(items[0], Value::String("[[records/contacts/a]]".into()));
assert_eq!(items[1], Value::String("[[records/contacts/b]]".into()));
// The edge enumerator reads exactly the two links back (no stray
// bracket targets, the flow-form-string symptom).
let links: Vec<_> = links_in_field_value(stored)
.into_iter()
.map(|l| l.target)
.collect();
assert_eq!(
links,
vec!["records/contacts/a", "records/contacts/b"],
"input {value}"
);
// And the canonical writer renders it block-style, not as a scalar.
let yaml = fm.to_yaml();
assert!(
yaml.contains("attendees:\n"),
"expected block list in:\n{yaml}"
);
assert!(
!yaml.contains("attendees: '[["),
"must not be a flow-form scalar string in:\n{yaml}"
);
}
}
/// A *single* inline wiki-link stays a scalar string (renders inline
/// `field: [[x]]`), and a single link must never be widened to a one-item
/// list — preserving the common `contact.company` / `expense.vendor` shape.
#[test]
fn set_single_inline_wiki_link_stays_scalar() {
let mut fm = Frontmatter::default();
fm.set("company", "[[records/companies/tideform]]").unwrap();
assert_eq!(
fm.extra.get("company"),
Some(&Value::String("[[records/companies/tideform]]".into())),
);
// Still recognized as one link.
let links: Vec<_> = links_in_field_value(fm.extra.get("company").unwrap())
.into_iter()
.map(|l| l.target)
.collect();
assert_eq!(links, vec!["records/companies/tideform"]);
}
/// Plain text and a non-link flow list are left as verbatim scalar strings —
/// the list normalization only triggers when every item is a clean wiki-link.
#[test]
fn set_non_link_values_stay_scalar_strings() {
let mut fm = Frontmatter::default();
fm.set("location", "Video call (remote)").unwrap();
assert_eq!(
fm.extra.get("location"),
Some(&Value::String("Video call (remote)".into())),
);
// A flow list whose items are NOT wiki-links must not be reinterpreted as
// a link sequence; it stays the scalar string the agent passed.
fm.set("note", "[draft, wip]").unwrap();
assert_eq!(
fm.extra.get("note"),
Some(&Value::String("[draft, wip]".into()))
);
}
}