Skip to main content

mur_common/skill/
parser.rs

//! Dual-format parser. Canonical YAML is the source of truth; markdown
//! frontmatter is the human-authoring surface that round-trips via
//! [`parse_markdown`] / [`serialize_markdown`].
4
5use super::manifest::SkillManifest;
6use std::fmt;
7
8#[derive(Debug)]
9pub enum ParseError {
10    Yaml(serde_yaml_ng::Error),
11    MissingFrontmatter,
12    MalformedFrontmatter(String),
13    LegacyMarkdown(String),
14}
15
16impl fmt::Display for ParseError {
17    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
18        match self {
19            ParseError::Yaml(e) => write!(f, "yaml parse: {e}"),
20            ParseError::MissingFrontmatter => write!(f, "missing `---` frontmatter delimiters"),
21            ParseError::MalformedFrontmatter(s) => write!(f, "malformed frontmatter: {s}"),
22            ParseError::LegacyMarkdown(s) => write!(f, "legacy markdown: {s}"),
23        }
24    }
25}
26
27impl std::error::Error for ParseError {}
28
29impl From<serde_yaml_ng::Error> for ParseError {
30    fn from(e: serde_yaml_ng::Error) -> Self {
31        ParseError::Yaml(e)
32    }
33}
34
35/// Parse canonical `skill.yaml`.
36pub fn parse_canonical(yaml: &str) -> Result<SkillManifest, ParseError> {
37    let m: SkillManifest = serde_yaml_ng::from_str(yaml)?;
38    Ok(m)
39}
40
41/// Serialise a `SkillManifest` to canonical YAML. Deterministic field order
42/// matches the struct definition.
43pub fn serialize_canonical(m: &SkillManifest) -> Result<String, ParseError> {
44    Ok(serde_yaml_ng::to_string(m)?)
45}
46
47/// Parse markdown-frontmatter skill source. Frontmatter (between two `---`
48/// fences) is YAML; the body becomes `content.abstract` plus — if it has a
49/// `## Steps` heading — a synthesised `content.procedure`, or otherwise a
50/// `content.context`. This is the human-authoring surface; canonical YAML
51/// remains source of truth on disk.
52pub fn parse_markdown(input: &str) -> Result<SkillManifest, ParseError> {
53    let (frontmatter, body) = split_frontmatter(input)?;
54    let mut value: serde_yaml_ng::Value = serde_yaml_ng::from_str(frontmatter)?;
55    inject_content_from_body(&mut value, body)?;
56    let m: SkillManifest = serde_yaml_ng::from_value(value)?;
57    Ok(m)
58}
59
60fn split_frontmatter(input: &str) -> Result<(&str, &str), ParseError> {
61    let trimmed = input.trim_start_matches('\u{feff}');
62    let trimmed = trimmed
63        .strip_prefix("---")
64        .ok_or(ParseError::MissingFrontmatter)?;
65    let trimmed = trimmed.strip_prefix('\n').unwrap_or(trimmed);
66    let end = trimmed
67        .find("\n---")
68        .ok_or_else(|| ParseError::MalformedFrontmatter("missing closing `---`".into()))?;
69    let frontmatter = &trimmed[..end];
70    let after = &trimmed[end + 4..];
71    let body = after.strip_prefix('\n').unwrap_or(after);
72    Ok((frontmatter, body))
73}
74
75fn inject_content_from_body(
76    value: &mut serde_yaml_ng::Value,
77    body: &str,
78) -> Result<(), ParseError> {
79    use serde_yaml_ng::Value;
80
81    if let Some(map) = value.as_mapping_mut() {
82        if map.contains_key(Value::String("content".into())) {
83            return Ok(()); // frontmatter already supplied content
84        }
85        let mut content = serde_yaml_ng::Mapping::new();
86
87        if body.contains("## Steps") {
88            // Workflow mode: abstract is the prose before `## Steps`, with the
89            // optional leading `# Title` H1 stripped.
90            let abstract_text = strip_leading_h1(body)
91                .split("## Steps")
92                .next()
93                .unwrap_or("")
94                .trim()
95                .to_string();
96            content.insert(
97                Value::String("abstract".into()),
98                Value::String(abstract_text),
99            );
100            let proc = build_procedure_from_steps(body);
101            content.insert(Value::String("procedure".into()), proc);
102        } else {
103            // Context mode: drop the optional `# Title` H1, take the first
104            // non-empty paragraph as the abstract, and the remainder (trimmed)
105            // as the context body. This is the inverse of `serialize_markdown`.
106            let (abstract_text, context_text) = split_abstract_and_context(body);
107            content.insert(
108                Value::String("abstract".into()),
109                Value::String(abstract_text),
110            );
111            content.insert(Value::String("context".into()), Value::String(context_text));
112        }
113        map.insert(Value::String("content".into()), Value::Mapping(content));
114    } else {
115        return Err(ParseError::MalformedFrontmatter(
116            "frontmatter is not a mapping".into(),
117        ));
118    }
119    Ok(())
120}
121
/// Drop a single leading `# Title` H1 line, plus any blank lines after it,
/// from a markdown body and return the remainder.
///
/// Leading `\n` / `\r` characters are always trimmed, even when there is no
/// H1. A body that consists only of the title line yields `""`.
fn strip_leading_h1(body: &str) -> &str {
    let trimmed = body.trim_start_matches(['\n', '\r']);
    match trimmed.strip_prefix("# ") {
        None => trimmed,
        // Skip to the end of the title line, then past the blank line(s).
        Some(after_marker) => after_marker
            .split_once('\n')
            .map_or("", |(_title, below)| below.trim_start_matches(['\n', '\r'])),
    }
}

/// Split a context-mode markdown body into `(abstract, context)`.
///
/// The optional leading `# Title` H1 is dropped; the abstract is the first
/// paragraph (text up to the first empty line) and the context is everything
/// after that empty line. Both parts are trimmed. When there is no empty line
/// the whole remainder is the abstract and the context is empty.
///
/// An empty line is recognised with either LF or CRLF endings, so files saved
/// with Windows line endings split the same way as LF files. This is the
/// inverse of the `serialize_markdown` context layout.
fn split_abstract_and_context(body: &str) -> (String, String) {
    let rest = strip_leading_h1(body);
    let mut offset = 0;
    for line in rest.split_inclusive('\n') {
        if matches!(line, "\n" | "\r\n") {
            let abstract_text = rest[..offset].trim().to_string();
            let context_text = rest[offset + line.len()..].trim().to_string();
            return (abstract_text, context_text);
        }
        offset += line.len();
    }
    // No blank line: the whole remainder is the abstract, no context.
    (rest.trim().to_string(), String::new())
}
157
158fn build_procedure_from_steps(body: &str) -> serde_yaml_ng::Value {
159    use serde_yaml_ng::{Mapping, Value};
160    let mut steps = Vec::new();
161    let mut in_steps = false;
162    for line in body.lines() {
163        if line.trim_start().starts_with("## Steps") {
164            in_steps = true;
165            continue;
166        }
167        if in_steps && line.starts_with("## ") {
168            break;
169        }
170        if in_steps {
171            let trimmed = line.trim();
172            if let Some(rest) = trimmed.strip_prefix("- ").or_else(|| {
173                trimmed.find(". ").and_then(|i| {
174                    let (n, r) = trimmed.split_at(i);
175                    n.chars().all(|c| c.is_ascii_digit()).then(|| &r[2..])
176                })
177            }) {
178                let mut step = Mapping::new();
179                step.insert(
180                    Value::String("description".into()),
181                    Value::String(rest.to_string()),
182                );
183                steps.push(Value::Mapping(step));
184            }
185        }
186    }
187    let mut procedure = Mapping::new();
188    procedure.insert(Value::String("steps".into()), Value::Sequence(steps));
189    Value::Mapping(procedure)
190}
191
192/// Render a `SkillManifest` back to markdown frontmatter form. The body is
193/// derived from the populated content mode: `context` → context body,
194/// `procedure` → "## Steps" list, `command` → fenced block.
195pub fn serialize_markdown(m: &SkillManifest) -> Result<String, ParseError> {
196    let frontmatter = serialize_canonical_frontmatter(m)?;
197    let mut out = String::new();
198    out.push_str("---\n");
199    out.push_str(&frontmatter);
200    out.push_str("---\n\n");
201    // Single H1 title, then the abstract paragraph, then the body. The abstract
202    // is emitted verbatim (trimmed) followed by a blank line so the parser can
203    // recover it as the first paragraph. Do NOT emit a duplicate H1.
204    out.push_str(&format!("# {}\n\n", m.name));
205    out.push_str(m.content.r#abstract.trim());
206    out.push('\n');
207    if let Some(ctx) = &m.content.context {
208        out.push('\n');
209        out.push_str(ctx.trim_end());
210        out.push('\n');
211    } else if let Some(proc) = &m.content.procedure {
212        out.push_str("\n## Steps\n");
213        for (i, s) in proc.steps.iter().enumerate() {
214            out.push_str(&format!("{}. {}\n", i + 1, s.description));
215        }
216    } else if let Some(cmd) = &m.content.command {
217        out.push_str("\n## Command\n\n```\n");
218        out.push_str(cmd);
219        out.push_str("\n```\n");
220    }
221    Ok(out)
222}
223
224/// Frontmatter is the manifest serialised *without* the `content` field —
225/// the content moves into the markdown body.
226fn serialize_canonical_frontmatter(m: &SkillManifest) -> Result<String, ParseError> {
227    let mut value = serde_yaml_ng::to_value(m)?;
228    if let Some(map) = value.as_mapping_mut() {
229        map.remove(serde_yaml_ng::Value::String("content".into()));
230    }
231    Ok(serde_yaml_ng::to_string(&value)?)
232}
233
234/// Parse a legacy skill file — pre-M0 markdown with minimal frontmatter
235/// (just `name` + `description`). Fills in defaults so the file can be
236/// loaded by the new pipeline without rewriting it.
237pub fn parse_legacy_markdown(input: &str) -> Result<SkillManifest, ParseError> {
238    let (frontmatter, body) = split_frontmatter(input)?;
239    let mut value: serde_yaml_ng::Value = serde_yaml_ng::from_str(frontmatter)?;
240    let map = value
241        .as_mapping_mut()
242        .ok_or_else(|| ParseError::LegacyMarkdown("frontmatter is not a mapping".into()))?;
243    use serde_yaml_ng::Value;
244    let key = |k: &str| Value::String(k.into());
245    map.entry(key("version"))
246        .or_insert(Value::String("0.0.0".into()));
247    map.entry(key("publisher"))
248        .or_insert(Value::String("human:mur".into()));
249    map.entry(key("category"))
250        .or_insert(Value::String("context".into()));
251    inject_content_from_body(&mut value, body)?;
252    let m: SkillManifest = serde_yaml_ng::from_value(value)?;
253    Ok(m)
254}
255
256/// Convenience: parse canonical YAML, serialise back to markdown.
257/// Used by `ensure_mur_skill` so built-in yaml skills produce
258/// AI-tool-consumable markdown at `SKILL.md`.
259pub fn yaml_to_markdown(yaml: &str) -> Result<String, ParseError> {
260    let m = parse_canonical(yaml)?;
261    serialize_markdown(&m)
262}
263
264/// Round-trip integrity guard: serialise the manifest to markdown, parse it
265/// back, and confirm the content survived. Used by `mur skill validate` to make
266/// silent abstract/context corruption visible. Only the `context` content mode
267/// is checked verbatim — procedure/command/note bodies are reconstructed
268/// structurally and compared by mode. Returns `Err(message)` describing the
269/// drift if the round-trip would alter content.
270pub fn roundtrip_check(m: &SkillManifest) -> Result<(), String> {
271    let md = serialize_markdown(m).map_err(|e| format!("serialize_markdown: {e}"))?;
272    let reparsed = parse_markdown(&md).map_err(|e| format!("parse_markdown: {e}"))?;
273
274    if reparsed.content.r#abstract.trim() != m.content.r#abstract.trim() {
275        return Err(format!(
276            "abstract differs after round-trip\n  original:  {:?}\n  roundtrip: {:?}",
277            m.content.r#abstract, reparsed.content.r#abstract
278        ));
279    }
280
281    // Context is stored verbatim; compare trimmed to ignore trailing-newline noise.
282    let orig_ctx = m.content.context.as_deref().map(str::trim_end);
283    let rt_ctx = reparsed.content.context.as_deref().map(str::trim_end);
284    if orig_ctx != rt_ctx {
285        return Err(format!(
286            "context differs after round-trip\n  original:  {orig_ctx:?}\n  roundtrip: {rt_ctx:?}"
287        ));
288    }
289
290    Ok(())
291}
292
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal canonical context-mode skill shared by several tests.
    const SAMPLE: &str = r#"
name: demo-skill
version: 0.1.0
publisher: human:test
description: Demo
category: context
content:
  abstract: hello
  context: |
    body
"#;

    // ---- canonical YAML -------------------------------------------------

    #[test]
    fn parses_canonical_yaml() {
        let manifest = parse_canonical(SAMPLE).unwrap();
        assert_eq!(manifest.name, "demo-skill");
        assert_eq!(manifest.content.context.as_deref(), Some("body\n"));
    }

    #[test]
    fn serialize_then_reparse_is_identity() {
        let before = parse_canonical(SAMPLE).unwrap();
        let after = parse_canonical(&serialize_canonical(&before).unwrap()).unwrap();
        assert_eq!(before.name, after.name);
        assert_eq!(before.content.context, after.content.context);
    }

    #[test]
    fn rejects_non_yaml_input() {
        assert!(parse_canonical("this is not yaml ::: {{").is_err());
    }

    // ---- markdown parsing -----------------------------------------------

    #[test]
    fn parses_markdown_frontmatter_to_context_mode() {
        let source = r#"---
name: simple-md
version: 1.0.0
publisher: human:test
description: A markdown skill
category: context
---

# simple-md

Some context content here.
"#;
        let manifest = parse_markdown(source).unwrap();
        assert_eq!(manifest.name, "simple-md");
        assert!(manifest.content.context.is_some());
        assert!(manifest.content.procedure.is_none());
    }

    #[test]
    fn parses_markdown_with_steps_to_workflow_mode() {
        let source = r#"---
name: with-steps
version: 1.0.0
publisher: human:test
description: A workflow
category: workflow
---

# with-steps

Does a thing.

## Steps
1. Navigate somewhere
2. Click the button
- Final extraction step
"#;
        let manifest = parse_markdown(source).unwrap();
        let procedure = manifest.content.procedure.expect("procedure populated");
        assert_eq!(procedure.steps.len(), 3);
        assert_eq!(procedure.steps[0].description, "Navigate somewhere");
    }

    #[test]
    fn markdown_without_frontmatter_fails() {
        let outcome = parse_markdown("# just a heading\n");
        assert!(matches!(outcome, Err(ParseError::MissingFrontmatter)));
    }

    // ---- canonical → markdown → canonical --------------------------------

    #[test]
    fn canonical_to_markdown_roundtrips_context() {
        let original = parse_canonical(SAMPLE).unwrap();
        let rendered = serialize_markdown(&original).unwrap();
        let reparsed = parse_markdown(&rendered).unwrap();
        assert_eq!(original.name, reparsed.name);
        assert_eq!(
            original.content.context.is_some(),
            reparsed.content.context.is_some()
        );
    }

    #[test]
    fn canonical_to_markdown_roundtrips_workflow() {
        let yaml = r#"
name: w
version: 1.0.0
publisher: human:test
description: d
category: workflow
content:
  abstract: a
  procedure:
    steps:
      - description: First
      - description: Second
"#;
        let original = parse_canonical(yaml).unwrap();
        let rendered = serialize_markdown(&original).unwrap();
        let reparsed = parse_markdown(&rendered).unwrap();
        let procedure = reparsed.content.procedure.unwrap();
        assert_eq!(procedure.steps.len(), 2);
        assert_eq!(procedure.steps[0].description, "First");
    }

    #[test]
    fn legacy_minimal_frontmatter_loads() {
        let source =
            "---\nname: mur-context\ndescription: Background context\n---\n\n# MUR\n\nSome body.\n";
        let manifest = parse_legacy_markdown(source).unwrap();
        assert_eq!(manifest.name, "mur-context");
        assert_eq!(manifest.publisher, "human:mur");
        assert_eq!(manifest.version, "0.0.0");
        assert!(manifest.content.context.is_some());
    }

    #[test]
    fn yaml_to_markdown_yields_consumable_md() {
        let rendered = yaml_to_markdown(SAMPLE).unwrap();
        assert!(
            rendered.starts_with("---"),
            "should start with frontmatter fence"
        );
        assert!(rendered.contains("# demo-skill"), "should contain heading");
        assert!(rendered.contains("hello"), "should contain abstract");
        assert!(rendered.contains("body"), "should contain context body");
    }

    /// A deliberate multi-sentence abstract plus a multi-paragraph context (with
    /// a code fence and a `## Heading`) must survive yaml→md→yaml without loss.
    #[test]
    fn yaml_md_yaml_roundtrip_preserves_abstract_and_context() {
        let yaml = r#"
name: roundtrip-demo
version: 2.3.1
publisher: human:alan
description: A skill exercising lossless round-trip.
category: context
tags: [alpha, beta]
triggers:
  - type: keyword
    pattern: roundtrip
content:
  abstract: |-
    This skill does one specific thing. It is described in two full sentences so
    the truncation bug would be obvious.
  context: |-
    First context paragraph with real prose that should survive verbatim.

    ## A Heading

    Some explanation under the heading.

    ```rust
    fn demo() {
        println!("code fence must survive");
    }
    ```

    Final closing paragraph.
"#;
        let original = parse_canonical(yaml).unwrap();
        let rendered = serialize_markdown(&original).unwrap();
        let reparsed = parse_markdown(&rendered).unwrap();

        assert_eq!(
            reparsed.content.r#abstract, original.content.r#abstract,
            "abstract must round-trip exactly"
        );
        assert_eq!(
            reparsed.content.context, original.content.context,
            "context must round-trip exactly"
        );
        // Other manifest fields survive via frontmatter.
        assert_eq!(reparsed.version, original.version);
        assert_eq!(reparsed.publisher, original.publisher);
        assert_eq!(reparsed.tags, original.tags);
        assert_eq!(reparsed.triggers.len(), original.triggers.len());
        assert_eq!(reparsed.triggers[0].pattern, original.triggers[0].pattern);
    }

    /// A hand-authored markdown (title + abstract paragraph + body) parses, then
    /// serializing and parsing again yields an identical manifest (idempotent).
    #[test]
    fn md_yaml_md_roundtrip_stable() {
        let source = r#"---
name: handauthored
version: 1.2.0
publisher: human:alan
description: Hand-authored markdown skill.
category: context
---

# handauthored

This is the abstract. It spans two sentences on purpose.

This is the first body paragraph.

## Details

More body content here, including a list:

- one
- two
"#;
        let first = parse_markdown(source).unwrap();
        let rendered = serialize_markdown(&first).unwrap();
        let second = parse_markdown(&rendered).unwrap();

        assert_eq!(
            first.content.r#abstract, second.content.r#abstract,
            "abstract must be stable across md→yaml→md"
        );
        assert_eq!(
            first.content.context, second.content.context,
            "context must be stable across md→yaml→md"
        );
        assert_eq!(first.name, second.name);
        assert_eq!(first.version, second.version);
        // The abstract must NOT be the bare H1 title (the old truncation bug).
        assert_ne!(first.content.r#abstract.trim(), "# handauthored");
        assert!(
            first.content.r#abstract.contains("This is the abstract."),
            "abstract should be the first paragraph, got: {:?}",
            first.content.r#abstract
        );
    }

    // ---- roundtrip_check ---------------------------------------------------

    #[test]
    fn roundtrip_check_passes_for_faithful_context_skill() {
        let manifest = parse_canonical(SAMPLE).unwrap();
        assert!(roundtrip_check(&manifest).is_ok());
    }

    #[test]
    fn roundtrip_check_passes_for_multiparagraph_abstract_and_context() {
        let yaml = r#"
name: rc-demo
version: 1.0.0
publisher: human:test
description: d
category: context
content:
  abstract: |-
    Sentence one of the abstract. Sentence two of the abstract.
  context: |-
    Para one.

    Para two with a fence:

    ```
    code
    ```
"#;
        let manifest = parse_canonical(yaml).unwrap();
        let outcome = roundtrip_check(&manifest);
        assert!(
            outcome.is_ok(),
            "expected faithful round-trip, got: {:?}",
            outcome
        );
    }
}