Skip to main content

corpora_engine/
toml_format.rs

1//! Real TOML front-matter + markdown body parsing (plan Phase 2).
2//!
3//! A corpus document is a `+++`-fenced TOML header followed by a markdown body:
4//!
5//! ```text
6//! +++
7//! id = "D2"
8//! kind = "decision"
9//! ...
10//! +++
11//!
12//! markdown body…
13//! ```
14//!
15//! The header maps to a [`Record`] (kind → [`Facet`], with field-level errors collected
16//! and joined into one [`ParseError`]); the body is scanned with `pulldown-cmark` for
17//! bare-id mentions (skipping code), `§` section refs, links, and finding anchors.
18
19use std::collections::BTreeSet;
20
21use corpora_core::{
22    Authority, Body, Date, DecisionFacet, DocPath, Door, Edges, Facet, Finding, FindingId, Fork,
23    Id, Impl, Kind, Lifecycle, ParseError, Record, Rev, Status,
24};
25use pulldown_cmark::{Event, Parser as MdParser, Tag, TagEnd};
26
27use crate::parser::FrontMatterFormat;
28
/// Front-matter format for documents whose header is a `+++`-fenced TOML block; the
/// parsing itself lives in this type's [`FrontMatterFormat`] implementation.
pub struct TomlFormat;
30
31impl FrontMatterFormat for TomlFormat {
32    fn parse(&self, path: &DocPath, text: &str) -> Result<Record, ParseError> {
33        let (fm_src, body_src) = split_front_matter(text)
34            .ok_or_else(|| ParseError("missing `+++` TOML front matter".into()))?;
35
36        let table: toml::Table = fm_src
37            .parse()
38            .map_err(|e| ParseError(format!("invalid TOML front matter: {e}")))?;
39
40        let mut errs: Vec<String> = Vec::new();
41
42        // `kind` is load-bearing (it picks the facet); without it we can't build a record.
43        let kind = match table.get("kind").and_then(|v| v.as_str()).and_then(parse_kind) {
44            Some(k) => k,
45            None => return Err(ParseError("missing or invalid `kind`".into())),
46        };
47
48        let lifecycle =
49            require_enum(&table, "lifecycle", parse_lifecycle, &mut errs).unwrap_or(Lifecycle::Draft);
50        let authority =
51            require_enum(&table, "authority", parse_authority, &mut errs).unwrap_or(Authority::Normative);
52        let last_reviewed = match get_date(&table, "last_reviewed", &mut errs) {
53            Some(d) => Date(d),
54            None => {
55                if !table.contains_key("last_reviewed") {
56                    errs.push("missing `last_reviewed`".into());
57                }
58                Date(String::new())
59            }
60        };
61
62        let facet = build_facet(kind, &table, &mut errs);
63
64        // schema-v0 §4: `id` is common to every record. Without it a record is invisible to
65        // graph indexing, supersession, and E3 — so a missing id is an error, not allowed.
66        let id = get_str(&table, "id", &mut errs).map(Id);
67        if id.is_none() && !table.contains_key("id") {
68            errs.push("missing `id`".into());
69        }
70        let aka: Vec<Id> = get_ids(&table, "aka", &mut errs);
71        let edges = Edges {
72            depends_on: get_ids(&table, "depends_on", &mut errs),
73            supersedes: get_ids(&table, "supersedes", &mut errs),
74            related: get_ids(&table, "related", &mut errs),
75            supports: get_ids(&table, "supports", &mut errs),
76            driven_by: get_array_str(&table, "driven_by", &mut errs)
77                .into_iter()
78                .map(FindingId)
79                .collect(),
80        };
81
82        // Anything already cited via a structured edge (or the record's own id/aka) is not a
83        // "bare" mention — exclude it so E3 doesn't double-flag.
84        let mut exclude: BTreeSet<String> = BTreeSet::new();
85        if let Some(i) = &id {
86            exclude.insert(i.0.clone());
87        }
88        exclude.extend(aka.iter().map(|i| i.0.clone()));
89        for e in edges
90            .depends_on
91            .iter()
92            .chain(&edges.supersedes)
93            .chain(&edges.related)
94            .chain(&edges.supports)
95        {
96            exclude.insert(e.0.clone());
97        }
98        let body = extract_body(body_src, &exclude);
99
100        if !errs.is_empty() {
101            return Err(ParseError(errs.join("; ")));
102        }
103
104        Ok(Record {
105            id,
106            path: path.clone(),
107            kind,
108            lifecycle,
109            authority,
110            last_reviewed,
111            aka,
112            edges,
113            facet,
114            body,
115        })
116    }
117}
118
119fn build_facet(kind: Kind, t: &toml::Table, errs: &mut Vec<String>) -> Facet {
120    match kind {
121        Kind::Decision => {
122            let status = require_enum(t, "status", parse_status, errs).unwrap_or(Status::Proposed);
123            let date = match get_date(t, "date", errs) {
124                Some(d) => Date(d),
125                None => {
126                    if !t.contains_key("date") {
127                        errs.push("decision missing `date`".into());
128                    }
129                    Date(String::new())
130                }
131            };
132            let implementation = opt_enum(t, "implementation", parse_impl, errs);
133            let fork = if status == Status::Open {
134                match (
135                    get_str(t, "lean", errs),
136                    get_str(t, "decide_when", errs),
137                    opt_enum(t, "door", parse_door, errs),
138                ) {
139                    (Some(lean), Some(decide_when), Some(door)) => Some(Fork {
140                        lean,
141                        decide_when,
142                        door,
143                    }),
144                    _ => {
145                        errs.push("open decision requires `lean`, `decide_when`, `door`".into());
146                        None
147                    }
148                }
149            } else {
150                None
151            };
152            Facet::Decision(DecisionFacet {
153                status,
154                date,
155                implementation,
156                fork,
157                realized_by: get_ids(t, "realized_by", errs),
158            })
159        }
160        Kind::Axiom => {
161            // An axiom is an assumed external fact: it carries no code (schema-v0 §4).
162            for forbidden in ["implementation", "code_revision"] {
163                if t.contains_key(forbidden) {
164                    errs.push(format!("axiom must not carry `{forbidden}`"));
165                }
166            }
167            Facet::Axiom
168        }
169        Kind::Invariant | Kind::Architecture => {
170            let implementation = opt_enum(t, "implementation", parse_impl, errs);
171            let code_revision = get_str(t, "code_revision", errs).map(Rev);
172            if implementation.is_some() && code_revision.is_none() && !t.contains_key("code_revision") {
173                errs.push("`implementation` present requires `code_revision`".into());
174            }
175            Facet::Canon {
176                implementation,
177                code_revision,
178            }
179        }
180        Kind::Current => Facet::Current {
181            implementation: require_enum(t, "implementation", parse_impl, errs).unwrap_or(Impl::Absent),
182            code_revision: require_rev(t, "code_revision", errs),
183            source_revision: get_str(t, "source_revision", errs).map(Rev),
184        },
185        Kind::Roadmap | Kind::Milestone => Facet::Plan {
186            implementation: require_enum(t, "implementation", parse_impl, errs).unwrap_or(Impl::Absent),
187            code_revision: require_rev(t, "code_revision", errs),
188        },
189        Kind::Evidence => {
190            let imp = opt_enum(t, "implementation", parse_impl, errs);
191            let rev = get_str(t, "code_revision", errs).map(Rev);
192            let measured = match (imp, rev) {
193                (Some(i), Some(r)) => Some((i, r)),
194                (None, None) => None,
195                _ => {
196                    errs.push("evidence `implementation` and `code_revision` must be paired".into());
197                    None
198                }
199            };
200            Facet::Evidence {
201                measured,
202                source_revision: get_str(t, "source_revision", errs).map(Rev),
203            }
204        }
205        Kind::ReviewLog | Kind::Evolution | Kind::Handoff | Kind::Explainer | Kind::Index => {
206            Facet::Narrative
207        }
208    }
209}
210
211// ---- front-matter helpers -------------------------------------------------
212//
213// Every accessor is type-strict: a key that is *present but the wrong type* is an error,
214// not silently treated as absent. Otherwise `supersedes = "D1"` (a string, not an array)
215// would quietly drop a graph edge, and a wrong-typed enum would read as missing.
216
217fn get_str(t: &toml::Table, key: &str, errs: &mut Vec<String>) -> Option<String> {
218    match t.get(key) {
219        None => None,
220        Some(toml::Value::String(s)) => Some(s.clone()),
221        Some(_) => {
222            errs.push(format!("`{key}` must be a string"));
223            None
224        }
225    }
226}
227
228fn get_array_str(t: &toml::Table, key: &str, errs: &mut Vec<String>) -> Vec<String> {
229    match t.get(key) {
230        None => Vec::new(),
231        Some(toml::Value::Array(a)) => {
232            let mut out = Vec::with_capacity(a.len());
233            for (i, x) in a.iter().enumerate() {
234                match x {
235                    toml::Value::String(s) => out.push(s.clone()),
236                    _ => errs.push(format!("`{key}[{i}]` must be a string")),
237                }
238            }
239            out
240        }
241        Some(_) => {
242            errs.push(format!("`{key}` must be an array of strings"));
243            Vec::new()
244        }
245    }
246}
247
248fn get_ids(t: &toml::Table, key: &str, errs: &mut Vec<String>) -> Vec<Id> {
249    get_array_str(t, key, errs).into_iter().map(Id).collect()
250}
251
252/// Accept a plain `YYYY-MM-DD` date — either a bare TOML date or a string in that exact
253/// form. A full TOML date-time (with time/offset) or any other string is rejected.
254fn get_date(t: &toml::Table, key: &str, errs: &mut Vec<String>) -> Option<String> {
255    match t.get(key) {
256        None => None,
257        Some(toml::Value::Datetime(d)) => match (&d.date, &d.time, &d.offset) {
258            (Some(date), None, None) => {
259                Some(format!("{:04}-{:02}-{:02}", date.year, date.month, date.day))
260            }
261            _ => {
262                errs.push(format!("`{key}` must be a plain date (YYYY-MM-DD), not a date-time"));
263                None
264            }
265        },
266        Some(toml::Value::String(s)) if is_iso_date(s) => Some(s.clone()),
267        Some(toml::Value::String(s)) => {
268            errs.push(format!("`{key}` must be a date in YYYY-MM-DD form, got {s:?}"));
269            None
270        }
271        Some(_) => {
272            errs.push(format!("`{key}` must be a date"));
273            None
274        }
275    }
276}
277
278/// `YYYY-MM-DD` shape *and* a real calendar date — `2026-99-99` and `2026-02-30` are
279/// rejected, not just mis-shaped strings.
280fn is_iso_date(s: &str) -> bool {
281    let b = s.as_bytes();
282    let shaped = b.len() == 10
283        && b[4] == b'-'
284        && b[7] == b'-'
285        && (0..4).chain(5..7).chain(8..10).all(|i| b[i].is_ascii_digit());
286    if !shaped {
287        return false;
288    }
289    let num = |r: std::ops::Range<usize>| s[r].parse::<u32>().unwrap_or(0);
290    let (year, month, day) = (num(0..4), num(5..7), num(8..10));
291    (1..=12).contains(&month) && (1..=days_in_month(year, month)).contains(&day)
292}
293
/// Number of days in `month` (1-12) of `year`, using the Gregorian leap-year rule.
/// Returns 0 for a month outside 1-12.
fn days_in_month(year: u32, month: u32) -> u32 {
    let leap = year % 4 == 0 && (year % 100 != 0 || year % 400 == 0);
    match month {
        2 if leap => 29,
        2 => 28,
        4 | 6 | 9 | 11 => 30,
        1..=12 => 31,
        _ => 0,
    }
}
303
304/// A required string: present-but-wrong-type and absent are both errors.
305fn require_str(t: &toml::Table, key: &str, errs: &mut Vec<String>) -> Option<String> {
306    let v = get_str(t, key, errs);
307    if v.is_none() && !t.contains_key(key) {
308        errs.push(format!("missing `{key}`"));
309    }
310    v
311}
312
313fn require_rev(t: &toml::Table, key: &str, errs: &mut Vec<String>) -> Rev {
314    Rev(require_str(t, key, errs).unwrap_or_default())
315}
316
317fn require_enum<T>(
318    t: &toml::Table,
319    key: &str,
320    f: impl Fn(&str) -> Option<T>,
321    errs: &mut Vec<String>,
322) -> Option<T> {
323    match t.get(key) {
324        None => {
325            errs.push(format!("missing `{key}`"));
326            None
327        }
328        Some(toml::Value::String(s)) => match f(s) {
329            Some(v) => Some(v),
330            None => {
331                errs.push(format!("invalid `{key}` = {s:?}"));
332                None
333            }
334        },
335        Some(_) => {
336            errs.push(format!("`{key}` must be a string"));
337            None
338        }
339    }
340}
341
342fn opt_enum<T>(
343    t: &toml::Table,
344    key: &str,
345    f: impl Fn(&str) -> Option<T>,
346    errs: &mut Vec<String>,
347) -> Option<T> {
348    match t.get(key) {
349        None => None,
350        Some(toml::Value::String(s)) => match f(s) {
351            Some(v) => Some(v),
352            None => {
353                errs.push(format!("invalid `{key}` = {s:?}"));
354                None
355            }
356        },
357        Some(_) => {
358            errs.push(format!("`{key}` must be a string"));
359            None
360        }
361    }
362}
363
364fn parse_kind(s: &str) -> Option<Kind> {
365    Some(match s {
366        "decision" => Kind::Decision,
367        "axiom" => Kind::Axiom,
368        "invariant" => Kind::Invariant,
369        "architecture" => Kind::Architecture,
370        "current" => Kind::Current,
371        "roadmap" => Kind::Roadmap,
372        "milestone" => Kind::Milestone,
373        "evidence" => Kind::Evidence,
374        "review-log" => Kind::ReviewLog,
375        "evolution" => Kind::Evolution,
376        "handoff" => Kind::Handoff,
377        "explainer" => Kind::Explainer,
378        "index" => Kind::Index,
379        _ => return None,
380    })
381}
382
383fn parse_lifecycle(s: &str) -> Option<Lifecycle> {
384    Some(match s {
385        "draft" => Lifecycle::Draft,
386        "current" => Lifecycle::Current,
387        "superseded" => Lifecycle::Superseded,
388        "historical" => Lifecycle::Historical,
389        _ => return None,
390    })
391}
392
393fn parse_authority(s: &str) -> Option<Authority> {
394    Some(match s {
395        "normative" => Authority::Normative,
396        "axiomatic" => Authority::Axiomatic,
397        "descriptive" => Authority::Descriptive,
398        "prospective" => Authority::Prospective,
399        "evidence" => Authority::Evidence,
400        "historical" => Authority::Historical,
401        "operational" => Authority::Operational,
402        "explanatory" => Authority::Explanatory,
403        "navigational" => Authority::Navigational,
404        _ => return None,
405    })
406}
407
408fn parse_status(s: &str) -> Option<Status> {
409    Some(match s {
410        "open" => Status::Open,
411        "proposed" => Status::Proposed,
412        "accepted" => Status::Accepted,
413        "superseded" => Status::Superseded,
414        "deprecated" => Status::Deprecated,
415        "rejected" => Status::Rejected,
416        _ => return None,
417    })
418}
419
420fn parse_impl(s: &str) -> Option<Impl> {
421    Some(match s {
422        "absent" => Impl::Absent,
423        "scaffold" => Impl::Scaffold,
424        "partial" => Impl::Partial,
425        "implemented" => Impl::Implemented,
426        "verified" => Impl::Verified,
427        _ => return None,
428    })
429}
430
431fn parse_door(s: &str) -> Option<Door> {
432    Some(match s {
433        "reversible" => Door::Reversible,
434        "one-way" => Door::OneWay,
435        _ => return None,
436    })
437}
438
/// Split `+++`-fenced TOML front matter from the markdown body.
///
/// A leading BOM and leading blank lines are ignored. The opening fence must be `+++`
/// immediately followed by a line break; the closing fence is the first later line that
/// consists of exactly `+++` (an optional `\r` before the line break is tolerated).
/// Returns `(front_matter, body)`, or `None` if the document isn't fenced this way.
fn split_front_matter(text: &str) -> Option<(&str, &str)> {
    let text = text.strip_prefix('\u{feff}').unwrap_or(text); // drop a BOM
    let text = text.trim_start_matches(['\r', '\n']);
    let after_fence = text.strip_prefix("+++")?;
    let rest = match after_fence.strip_prefix("\r\n") {
        Some(stripped) => stripped,
        None => after_fence.strip_prefix('\n')?,
    };

    // Walk line by line, tracking each line's byte offset so both halves can be sliced
    // out of `rest` without copying.
    let mut offset = 0;
    for line in rest.split_inclusive('\n') {
        let content = line.strip_suffix('\n').unwrap_or(line);
        let content = content.strip_suffix('\r').unwrap_or(content);
        if content == "+++" {
            return Some((&rest[..offset], &rest[offset + line.len()..]));
        }
        offset += line.len();
    }
    None
}
464
465// ---- body extraction ------------------------------------------------------
466
467fn extract_body(src: &str, exclude: &BTreeSet<String>) -> Body {
468    let mut bare = Vec::new();
469    let mut seen_bare: BTreeSet<String> = BTreeSet::new();
470    let mut findings = Vec::new();
471    let mut seen_find: BTreeSet<String> = BTreeSet::new();
472    let mut section_refs = Vec::new();
473    let mut links = Vec::new();
474    let mut code_depth = 0usize;
475
476    for ev in MdParser::new(src) {
477        match ev {
478            Event::Start(Tag::CodeBlock(_)) => code_depth += 1,
479            Event::End(TagEnd::CodeBlock) => code_depth = code_depth.saturating_sub(1),
480            Event::Text(t) if code_depth == 0 => {
481                scan_ids(&t, exclude, &mut bare, &mut seen_bare);
482                scan_sections(&t, &mut section_refs);
483            }
484            Event::Start(Tag::Link { dest_url, .. }) => links.push(dest_url.to_string()),
485            Event::Html(h) | Event::InlineHtml(h) => scan_anchors(&h, &mut findings, &mut seen_find),
486            _ => {} // inline `Code`, code-block text, soft breaks, etc. are ignored
487        }
488    }
489
490    dedup_preserve(&mut section_refs);
491    dedup_preserve(&mut links);
492
493    // Ids named by *local* link destinations are typed (Relation::Link) citations, not bare
494    // mentions — pull them out so E3 treats a link to a superseded atom as an error. External
495    // URLs are skipped, so `https://example.com/D1` can't masquerade as a citation to D1.
496    let mut link_refs = Vec::new();
497    let mut seen_link: BTreeSet<String> = BTreeSet::new();
498    for dest in links.iter().filter(|d| is_local_link(d)) {
499        for tok in tokenize(dest) {
500            if is_id_shape(tok) && !exclude.contains(tok) && seen_link.insert(tok.to_string()) {
501                link_refs.push(Id(tok.to_string()));
502            }
503        }
504    }
505    bare.retain(|i| !seen_link.contains(&i.0));
506
507    Body {
508        bare_mentions: bare,
509        findings,
510        section_refs,
511        links,
512        link_refs,
513    }
514}
515
516fn scan_ids(text: &str, exclude: &BTreeSet<String>, out: &mut Vec<Id>, seen: &mut BTreeSet<String>) {
517    for tok in tokenize(text) {
518        if is_id_shape(tok) && !exclude.contains(tok) && seen.insert(tok.to_string()) {
519            out.push(Id(tok.to_string()));
520        }
521    }
522}
523
/// Append to `out` the section number following each `§` in `text` (e.g. `§9.6` → `9.6`).
///
/// A section number is a run of ASCII digits and dots in which a dot only counts when a
/// digit follows it, so a trailing prose dot ("§9.6.") is not captured. A `§` with no
/// number after it contributes nothing. Duplicates are kept.
fn scan_sections(text: &str, out: &mut Vec<String>) {
    let mut rest = text;
    while let Some(pos) = rest.find('§') {
        let tail = &rest[pos + '§'.len_utf8()..];
        let bytes = tail.as_bytes();

        // Only ASCII bytes are ever consumed, so `len` always lands on a char boundary.
        let mut len = 0;
        while len < bytes.len() {
            let part_of_ref = match bytes[len] {
                b'0'..=b'9' => true,
                b'.' => bytes.get(len + 1).is_some_and(u8::is_ascii_digit),
                _ => false,
            };
            if !part_of_ref {
                break;
            }
            len += 1;
        }

        if len > 0 {
            out.push(tail[..len].to_string());
        }
        rest = &tail[len..];
    }
}
553
/// Return `html` with every `<!-- … -->` span removed, so anchors commented out in the
/// source aren't scanned. A comment that is never closed swallows the rest of the input.
fn strip_html_comments(html: &str) -> String {
    let mut kept = String::with_capacity(html.len());
    let mut rest = html;
    loop {
        let Some((before, after_open)) = rest.split_once("<!--") else {
            kept.push_str(rest);
            return kept;
        };
        kept.push_str(before);
        match after_open.split_once("-->") {
            Some((_comment, tail)) => rest = tail,
            None => return kept,
        }
    }
}
569
570/// Findings come only from `<a id="…">` anchors — not `<div id=…>`, `data-id=`, comments.
571fn scan_anchors(html: &str, out: &mut Vec<Finding>, seen: &mut BTreeSet<String>) {
572    let html = strip_html_comments(html);
573    let lower = html.to_ascii_lowercase();
574    let lb = lower.as_bytes();
575    let mut i = 0;
576    while let Some(rel) = lower[i..].find("<a") {
577        let start = i + rel;
578        let after = start + 2;
579        // Genuine <a tag: "<a" followed by a delimiter, not e.g. "<article".
580        let is_anchor =
581            after >= lb.len() || matches!(lb[after], b' ' | b'\t' | b'\n' | b'\r' | b'>' | b'/');
582        i = after;
583        if !is_anchor {
584            continue;
585        }
586        let end = lower[start..].find('>').map(|e| start + e).unwrap_or(lb.len());
587        if let Some(val) = anchor_id(&html[start..end]) {
588            if is_finding_shape(val) && seen.insert(val.to_string()) {
589                out.push(Finding {
590                    id: FindingId(val.to_string()),
591                    status: String::new(), // inline status parsing is Phase 3/4
592                });
593            }
594        }
595        i = end;
596    }
597}
598
/// Read the `id` attribute's value out of a single opening-tag slice.
///
/// `id` must be a whole attribute name (so `data-id` / `grid` don't match) and is
/// matched case-insensitively. Any ASCII whitespace — including line breaks — may
/// surround the `=`, as HTML permits. The value must be quoted with `"` or `'`; it is
/// returned with its original casing. Returns `None` when no such attribute is found.
fn anchor_id(tag: &str) -> Option<&str> {
    // ASCII lowercasing preserves byte offsets, so indices found in `lower` slice `tag`.
    let lower = tag.to_ascii_lowercase();
    let lb = lower.as_bytes();
    let skip_ws = |mut k: usize| {
        while k < lb.len() && lb[k].is_ascii_whitespace() {
            k += 1;
        }
        k
    };

    let mut from = 0;
    while let Some(rel) = lower[from..].find("id") {
        let at = from + rel;
        from = at + 2;

        // Reject `id` as the tail of a longer name (`data-id`, `grid`, `my_id`).
        let standalone = at == 0
            || !(lb[at - 1].is_ascii_alphanumeric() || lb[at - 1] == b'-' || lb[at - 1] == b'_');
        if !standalone {
            continue;
        }

        let eq = skip_ws(at + 2);
        if lb.get(eq) != Some(&b'=') {
            continue;
        }

        let open = skip_ws(eq + 1);
        let quote = match lb.get(open).copied() {
            Some(q @ (b'"' | b'\'')) => q as char,
            _ => continue, // unquoted values are not supported
        };
        let vstart = open + 1;
        if let Some(len) = lower[vstart..].find(quote) {
            return Some(&tag[vstart..vstart + len]);
        }
        // Unterminated quote: keep looking for a later, well-formed `id`.
    }
    None
}
631
/// Break `text` into its maximal runs of ASCII alphanumerics and `-`; every other
/// character (including any non-ASCII one) is a separator. Tokens borrow from `text`.
fn tokenize(text: &str) -> Vec<&str> {
    text.split(|c: char| !(c.is_ascii_alphanumeric() || c == '-'))
        .filter(|piece| !piece.is_empty())
        .collect()
}
652
/// Record-id shape: one or more ASCII uppercase letters, an optional single `-`, then one
/// or more ASCII digits, and nothing else (e.g. `D2`, `F10`, `CD-3`).
fn is_id_shape(s: &str) -> bool {
    let letters = s.bytes().take_while(u8::is_ascii_uppercase).count();
    if letters == 0 {
        return false;
    }
    // Only ASCII bytes were counted, so this slice starts on a char boundary.
    let tail = &s[letters..];
    let digits = tail.strip_prefix('-').unwrap_or(tail);
    !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
}
672
/// Finding-anchor shape: starts with an ASCII lowercase letter, consists only of ASCII
/// lowercase letters, digits and `-`, and ends in a digit (e.g. `r9`, `t6`, `am-3`).
fn is_finding_shape(s: &str) -> bool {
    let bytes = s.as_bytes();
    let (Some(first), Some(last)) = (bytes.first(), bytes.last()) else {
        return false;
    };
    first.is_ascii_lowercase()
        && last.is_ascii_digit()
        && bytes
            .iter()
            .all(|&b| matches!(b, b'a'..=b'z' | b'0'..=b'9' | b'-'))
}
681
/// True when a link destination stays within the corpus (a relative path or `#anchor`).
///
/// A destination is treated as external when it is protocol-relative (`//…`), contains
/// `://`, or starts with a URI scheme — `[a-zA-Z][a-zA-Z0-9+.-]*` before its first colon
/// (`mailto:…`, `tel:…`). Surrounding whitespace is ignored.
fn is_local_link(dest: &str) -> bool {
    let d = dest.trim();
    if d.starts_with("//") || d.contains("://") {
        return false;
    }
    match d.split_once(':') {
        None => true,
        Some((scheme, _)) => {
            let mut chars = scheme.chars();
            let scheme_like = chars.next().is_some_and(|c| c.is_ascii_alphabetic())
                && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
            !scheme_like
        }
    }
}
703
/// Remove duplicate strings from `v` in place, keeping the first occurrence of each and
/// the original relative order.
fn dedup_preserve(v: &mut Vec<String>) {
    let mut seen = BTreeSet::new();
    for item in std::mem::take(v) {
        if seen.insert(item.clone()) {
            v.push(item);
        }
    }
}
708
#[cfg(test)]
mod tests {
    use super::*;

    /// Shared fixture: an accepted decision whose body exercises a section ref, bare id
    /// mentions, an id inside a code block, and an id that is already a structured edge.
    const DECISION: &str = r#"+++
id = "D2"
kind = "decision"
lifecycle = "current"
authority = "normative"
status = "accepted"
date = 2026-06-18
last_reviewed = 2026-06-20
depends_on = ["A1"]
supersedes = ["D1"]
+++

This supersedes the old approach; see §9.6 and the open fork F5.

```text
let y = D9;   // inside a code block — must be ignored
```

It also discusses D7 in prose, and A1 (already a structured edge).
"#;

    /// Parse `text` as a document at the fixed path `x.md`.
    fn parse(text: &str) -> Result<Record, ParseError> {
        TomlFormat.parse(&DocPath("x.md".into()), text)
    }

    /// Header fields of the `DECISION` fixture land in the matching `Record` fields.
    #[test]
    fn parses_decision_header() {
        let r = parse(DECISION).expect("should parse");
        assert_eq!(r.id, Some(Id("D2".into())));
        assert_eq!(r.kind, Kind::Decision);
        assert_eq!(r.lifecycle, Lifecycle::Current);
        assert_eq!(r.last_reviewed, Date("2026-06-20".into()));
        assert_eq!(r.edges.depends_on, vec![Id("A1".into())]);
        assert_eq!(r.edges.supersedes, vec![Id("D1".into())]);
        match &r.facet {
            Facet::Decision(d) => assert_eq!(d.status, Status::Accepted),
            other => panic!("expected decision facet, got {other:?}"),
        }
    }

    /// Bare mentions include prose ids only: ids in code blocks, the record's own id,
    /// and ids already cited by a structured edge are left out.
    #[test]
    fn body_extraction_excludes_code_and_structured_edges() {
        let r = parse(DECISION).unwrap();
        let mentions: Vec<&str> = r.body.bare_mentions.iter().map(|i| i.0.as_str()).collect();
        assert!(mentions.contains(&"F5"), "{mentions:?}");
        assert!(mentions.contains(&"D7"), "{mentions:?}");
        assert!(!mentions.contains(&"D9"), "code-block id leaked: {mentions:?}");
        assert!(!mentions.contains(&"A1"), "structured edge counted as bare: {mentions:?}");
        assert!(!mentions.contains(&"D2"), "own id counted as bare: {mentions:?}");
        assert!(!mentions.contains(&"D1"), "supersedes target counted as bare: {mentions:?}");
        assert_eq!(r.body.section_refs, vec!["9.6".to_string()]);
    }

    /// A document without a `+++` fence is rejected.
    #[test]
    fn missing_front_matter_errors() {
        assert!(parse("no front matter here\n").is_err());
    }

    /// An open decision must carry `lean`, `decide_when` and `door`; with all three
    /// present the facet exposes them as a fork.
    #[test]
    fn open_decision_requires_fork_fields() {
        let missing = r#"+++
id = "F5"
kind = "decision"
lifecycle = "draft"
authority = "normative"
status = "open"
date = 2026-06-18
last_reviewed = 2026-06-20
+++
body
"#;
        let err = parse(missing).unwrap_err();
        assert!(err.0.contains("open decision requires"), "{}", err.0);

        let ok = r#"+++
id = "F5"
kind = "decision"
lifecycle = "draft"
authority = "normative"
status = "open"
date = 2026-06-18
last_reviewed = 2026-06-20
lean = "jinn-native symbolic"
decide_when = "first obligation linear-arith can't discharge"
door = "reversible"
+++
body
"#;
        let r = parse(ok).unwrap();
        match &r.facet {
            Facet::Decision(d) => {
                let fork = d.fork.as_ref().expect("open decision should carry a fork");
                assert_eq!(fork.door, Door::Reversible);
                assert_eq!(fork.lean, "jinn-native symbolic");
            }
            other => panic!("expected decision, got {other:?}"),
        }
    }

    /// Only anchors whose id has the finding shape (ends in a digit) become findings.
    #[test]
    fn finding_anchors_extracted() {
        let doc = r#"+++
id = "RL1"
kind = "review-log"
lifecycle = "current"
authority = "historical"
last_reviewed = 2026-06-20
+++

<a id="r9"></a> A finding about something.
<a id="not-a-finding"></a> ignored (no trailing digit).
"#;
        let r = parse(doc).unwrap();
        let ids: Vec<&str> = r.body.findings.iter().map(|f| f.id.0.as_str()).collect();
        assert_eq!(ids, vec!["r9"]);
    }

    /// A header without `id` is an error.
    #[test]
    fn missing_id_errors() {
        let doc = r#"+++
kind = "decision"
lifecycle = "current"
authority = "normative"
status = "accepted"
date = 2026-06-18
last_reviewed = 2026-06-20
+++
body
"#;
        assert!(parse(doc).unwrap_err().0.contains("missing `id`"));
    }

    /// Ids inside external URLs are not link refs; ids in local link targets are.
    #[test]
    fn external_link_is_not_a_local_citation() {
        let doc = r#"+++
id = "A1"
kind = "architecture"
lifecycle = "current"
authority = "normative"
last_reviewed = 2026-06-20
+++

See [an unrelated page](https://example.com/D1) and [the local one](./other.md#D2).
"#;
        let r = parse(doc).unwrap();
        let refs: Vec<&str> = r.body.link_refs.iter().map(|i| i.0.as_str()).collect();
        assert_eq!(refs, vec!["D2"], "external D1 must not become a citation");
    }

    /// A decision header with one field overridden, for type-strictness tests.
    fn decision_with(field: &str) -> String {
        format!(
            "+++\n\
             id = \"D2\"\n\
             kind = \"decision\"\n\
             lifecycle = \"current\"\n\
             authority = \"normative\"\n\
             status = \"accepted\"\n\
             date = 2026-06-18\n\
             last_reviewed = 2026-06-20\n\
             {field}\n\
             +++\nbody\n"
        )
    }

    /// A plain string where an array of ids is expected is reported, not dropped.
    #[test]
    fn bare_string_for_array_field_errors() {
        // `supersedes = "D1"` must not silently collapse to an empty edge list.
        let err = parse(&decision_with(r#"supersedes = "D1""#)).unwrap_err();
        assert!(err.0.contains("`supersedes` must be an array"), "{}", err.0);
    }

    /// A non-string element inside an id array is reported with its index.
    #[test]
    fn mixed_array_element_errors() {
        let err = parse(&decision_with(r#"depends_on = ["A1", 2]"#)).unwrap_err();
        assert!(err.0.contains("depends_on[1]"), "{}", err.0);
    }

    /// An enum field holding a non-string value gets a type error.
    #[test]
    fn wrong_typed_enum_errors() {
        // A non-string status is an error, not "absent".
        let doc = parse(&decision_with("").replace(r#"status = "accepted""#, "status = 5"));
        assert!(doc.unwrap_err().0.contains("`status` must be a string"));
    }

    /// `date` accepts a bare TOML date or a quoted `YYYY-MM-DD`, and nothing else.
    #[test]
    fn date_must_be_iso_yyyy_mm_dd() {
        // A full TOML date-time is rejected.
        let ts = decision_with("").replace("date = 2026-06-18", "date = 2026-06-18T10:00:00Z");
        assert!(parse(&ts).unwrap_err().0.contains("date"), "timestamp should be rejected");
        // A non-ISO date string is rejected.
        let bad = decision_with("").replace("date = 2026-06-18", r#"date = "June 18, 2026""#);
        assert!(parse(&bad).unwrap_err().0.contains("date"), "free-form date should be rejected");
        // A quoted ISO date string is accepted.
        let ok = decision_with("").replace("date = 2026-06-18", r#"date = "2026-06-18""#);
        assert!(parse(&ok).is_ok());
    }

    /// A sentence-ending dot after a section ref is not part of the ref.
    #[test]
    fn section_ref_drops_trailing_dot() {
        let doc = r#"+++
id = "EX1"
kind = "explainer"
lifecycle = "current"
authority = "explanatory"
last_reviewed = 2026-06-20
+++

As discussed in §9.6. and also §2, the design holds.
"#;
        let r = parse(doc).unwrap();
        assert_eq!(r.body.section_refs, vec!["9.6".to_string(), "2".to_string()]);
    }

    /// Only the `id` attribute of an `<a>` tag yields a finding — not other tags, and
    /// not attributes whose name merely ends in `id`.
    #[test]
    fn findings_only_from_anchor_tags() {
        let doc = r#"+++
id = "RL2"
kind = "review-log"
lifecycle = "current"
authority = "historical"
last_reviewed = 2026-06-20
+++

<a id="r9"></a> a real finding.
<div id="d9"></div> a div, not an anchor.
<a data-id="x9"></a> data-id, not the id attribute.
<article id="t3"></article> not an <a> tag.
"#;
        let r = parse(doc).unwrap();
        let ids: Vec<&str> = r.body.findings.iter().map(|f| f.id.0.as_str()).collect();
        assert_eq!(ids, vec!["r9"]);
    }

    /// Quoted dates are checked against the calendar, including the leap-day rule.
    #[test]
    fn quoted_date_calendar_validated() {
        for bad in ["2026-99-99", "2026-13-01", "2026-00-10", "2026-02-30", "2026-04-31"] {
            let doc = decision_with("").replace("date = 2026-06-18", &format!(r#"date = "{bad}""#));
            assert!(parse(&doc).is_err(), "{bad} should be rejected");
        }
        let leap = decision_with("").replace("date = 2026-06-18", r#"date = "2024-02-29""#);
        assert!(parse(&leap).is_ok(), "valid leap day should parse");
    }

    /// An anchor inside a single-line HTML comment is not reported as a finding.
    #[test]
    fn commented_out_anchors_are_not_findings() {
        let doc = r#"+++
id = "RL3"
kind = "review-log"
lifecycle = "current"
authority = "historical"
last_reviewed = 2026-06-20
+++

<!-- <a id="r9"></a> this finding is commented out -->
<a id="t3"></a> a real one.
"#;
        let r = parse(doc).unwrap();
        let ids: Vec<&str> = r.body.findings.iter().map(|f| f.id.0.as_str()).collect();
        assert_eq!(ids, vec!["t3"]);
    }
}