Skip to main content

supersigil_parser/
lib.rs

1//! Parsing pipeline for supersigil spec documents.
2//!
3//! Documents use standard Markdown with `supersigil-xml` fenced code blocks
4//! for component markup.
5
6mod frontmatter;
7mod markdown_fences;
8mod preprocess;
9/// Shared utility functions used across parser stages.
10pub mod util;
11mod xml_extract;
12mod xml_parser;
13
14pub use frontmatter::{FrontMatterResult, deserialize_front_matter, extract_front_matter};
15pub use markdown_fences::{MarkdownFences, XmlFence, extract_markdown_fences};
16pub use preprocess::{normalize, preprocess};
17pub use xml_extract::extract_components_from_xml;
18pub use xml_parser::{XmlNode, parse_supersigil_xml};
19
20use std::path::Path;
21
22use supersigil_core::{ComponentDefs, ExtractedComponent, ParseError, ParseResult, SpecDocument};
23
/// Outcome of a recovering parse: the best-effort [`ParseResult`] together
/// with the errors that keep the document from being considered fully valid.
///
/// Produced by [`parse_content_recovering`]. [`parse_content`] turns a
/// non-empty `fatal_errors` list into an `Err`.
#[derive(Debug)]
pub struct RecoveredParse {
    /// The parse result. A `Document` may be present even when
    /// `fatal_errors` is non-empty, allowing best-effort local features to
    /// use the partial component tree.
    pub result: ParseResult,
    /// Fatal errors produced after enough structure was recovered to build a
    /// partial `SpecDocument` (XML fence parse errors and lint-time
    /// validation errors). Empty when the document is fully valid or is not
    /// a supersigil document.
    pub fatal_errors: Vec<ParseError>,
}
36
37// ---------------------------------------------------------------------------
38// Lint-time validation (format-agnostic)
39// ---------------------------------------------------------------------------
40
41/// Validate extracted components against the known component definitions.
42///
43/// Checks missing required attributes -> `MissingRequiredAttribute` error.
44///
45/// Only known components reach this point (unknown `PascalCase` elements are
46/// filtered out during extraction), so every component here has a definition.
47/// Recurses into children.
48pub fn validate_components(
49    components: &[ExtractedComponent],
50    component_defs: &ComponentDefs,
51    path: &Path,
52    errors: &mut Vec<ParseError>,
53) {
54    for comp in components {
55        if let Some(def) = component_defs.get(&comp.name) {
56            for (attr_name, attr_def) in &def.attributes {
57                if attr_def.required && !comp.attributes.contains_key(attr_name) {
58                    errors.push(ParseError::MissingRequiredAttribute {
59                        path: path.to_path_buf(),
60                        component: comp.name.clone(),
61                        attribute: attr_name.clone(),
62                        position: comp.position,
63                    });
64                }
65            }
66        }
67        validate_components(&comp.children, component_defs, path, errors);
68    }
69}
70
71// ---------------------------------------------------------------------------
72// parse_content — public API (Req 8-1)
73// ---------------------------------------------------------------------------
74
75/// Parse a spec document from an in-memory string into a [`ParseResult`].
76///
77/// This is the core of the parsing pipeline, operating on a content string
78/// that has already been decoded and normalized (e.g. by the LSP buffer or
79/// by [`parse_file`] after preprocessing). It performs:
80///
81/// 1. **Front matter** — extraction and deserialization (fatal on error).
82/// 2. **Markdown fence extraction** — parse the body as standard Markdown
83///    and collect `supersigil-xml` fenced code blocks.
84/// 3. **XML parsing** — parse each `supersigil-xml` fence into structured
85///    XML nodes. Errors in one fence do not prevent parsing of others.
86/// 4. **Component extraction** — walk XML nodes and extract known components.
87/// 5. **Lint-time validation** — check required attributes, etc.
88///
89/// Stage 1 errors are fatal and prevent all later stages.
90///
91/// # Errors
92///
93/// Returns `Vec<ParseError>` containing all detected errors across stages.
94pub fn parse_content_recovering(
95    path: &Path,
96    content: &str,
97    component_defs: &ComponentDefs,
98) -> Result<RecoveredParse, Vec<ParseError>> {
99    // Stage 1: Extract front matter
100    let (yaml, body) = match extract_front_matter(content, path) {
101        Ok(Some((yaml, body))) => (yaml, body),
102        Ok(None) => {
103            return Ok(RecoveredParse {
104                result: ParseResult::NotSupersigil(path.to_path_buf()),
105                fatal_errors: Vec::new(),
106            });
107        }
108        Err(e) => return Err(vec![e]),
109    };
110
111    // Stage 1: Deserialize front matter
112    let (frontmatter, extra) = match deserialize_front_matter(yaml, path) {
113        Ok(FrontMatterResult::Supersigil { frontmatter, extra }) => (frontmatter, extra),
114        Ok(FrontMatterResult::NotSupersigil) => {
115            return Ok(RecoveredParse {
116                result: ParseResult::NotSupersigil(path.to_path_buf()),
117                fatal_errors: Vec::new(),
118            });
119        }
120        Err(e) => return Err(vec![e]),
121    };
122
123    // Compute body offset for source position adjustment.
124    // body starts at content[body_offset..], so:
125    let body_offset = content.len() - body.len();
126
127    // Stage 2: Parse Markdown body and extract supersigil-xml fences
128    let fences = extract_markdown_fences(body, body_offset);
129
130    // Stage 3: Parse XML content from each supersigil-xml fence
131    let mut errors = Vec::new();
132    let mut all_components = Vec::new();
133    for fence in &fences.xml_fences {
134        match parse_supersigil_xml(&fence.content, fence.content_offset, path) {
135            Ok(nodes) => {
136                let mut comps = extract_components_from_xml(&nodes, content, component_defs);
137                all_components.append(&mut comps);
138            }
139            Err(e) => {
140                // Adjust fence-relative line to file-absolute
141                let adjusted = match e {
142                    ParseError::XmlSyntaxError {
143                        path,
144                        line,
145                        column,
146                        message,
147                    } => {
148                        // Compute the line number where this fence starts in the file
149                        let fence_start_line = content[..fence.content_offset]
150                            .chars()
151                            .filter(|&c| c == '\n')
152                            .count();
153                        ParseError::XmlSyntaxError {
154                            path,
155                            line: line + fence_start_line,
156                            column,
157                            message,
158                        }
159                    }
160                    other => other,
161                };
162                errors.push(adjusted);
163            }
164        }
165    }
166
167    // Stage 4: Lint-time validation
168    validate_components(&all_components, component_defs, path, &mut errors);
169
170    Ok(RecoveredParse {
171        result: ParseResult::Document(SpecDocument {
172            path: path.to_path_buf(),
173            frontmatter,
174            extra,
175            components: all_components,
176        }),
177        fatal_errors: errors,
178    })
179}
180
181/// Parse a spec document from an in-memory string into a [`ParseResult`].
182///
183/// This returns only fully valid documents. Call
184/// [`parse_content_recovering`] when the caller needs best-effort access to
185/// partially valid component trees.
186///
187/// # Errors
188///
189/// Returns `Vec<ParseError>` when front matter, XML parsing, or validation
190/// prevents the document from being considered fully valid.
191pub fn parse_content(
192    path: &Path,
193    content: &str,
194    component_defs: &ComponentDefs,
195) -> Result<ParseResult, Vec<ParseError>> {
196    let recovered = parse_content_recovering(path, content, component_defs)?;
197    if recovered.fatal_errors.is_empty() {
198        Ok(recovered.result)
199    } else {
200        Err(recovered.fatal_errors)
201    }
202}
203
204// ---------------------------------------------------------------------------
205// parse_file — public API (Req 10)
206// ---------------------------------------------------------------------------
207
208/// Parse a single spec file into a [`ParseResult`].
209///
210/// Implements the full parsing pipeline:
211/// 1. Preprocess (UTF-8 decode, BOM strip, CRLF normalize).
212/// 2. Front matter extraction and deserialization.
213/// 3. Markdown fence extraction (`supersigil-xml`).
214/// 4. XML parsing and component extraction.
215/// 5. Lint-time validation.
216///
217/// Stage 1 fatal errors prevent later stages. XML parse errors in one fence
218/// do not prevent other fences from being parsed.
219///
220/// # Errors
221///
222/// Returns `Vec<ParseError>` containing all detected errors across stages.
223pub fn parse_file(
224    path: impl AsRef<Path>,
225    component_defs: &ComponentDefs,
226) -> Result<ParseResult, Vec<ParseError>> {
227    let path = path.as_ref();
228    // Read file
229    let raw = std::fs::read(path).map_err(|e| {
230        vec![ParseError::IoError {
231            path: path.to_path_buf(),
232            source: e,
233        }]
234    })?;
235
236    // Stage 1: Preprocess
237    let content = preprocess(&raw, path).map_err(|e| vec![e])?;
238
239    parse_content(path, &content, component_defs)
240}
241
242// ---------------------------------------------------------------------------
243// Tests
244// ---------------------------------------------------------------------------
245
#[cfg(test)]
mod tests {
    use super::*;

    /// An XML error inside a fence must be reported with a line number that
    /// refers to the whole file, not to the fence content.
    #[test]
    fn xml_error_positions_are_file_absolute_not_fence_relative() {
        // Front matter (6 lines) + blank line + prose line + blank line = 9 lines.
        // The fence marker is therefore on line 10 and the fence content
        // starts on line 11.
        let content = "\
---
supersigil:
  id: test/err
  type: requirements
  status: approved
---

Some prose here.

```supersigil-xml
<Criterion id=\"c1\">
  <?bad processing instruction?>
</Criterion>
```
";
        let defs = ComponentDefs::defaults();
        let errors = parse_content(Path::new("test.md"), content, &defs).unwrap_err();
        assert!(!errors.is_empty(), "should have at least one error");

        // Find the XML syntax error
        let xml_err = errors
            .iter()
            .find(|e| matches!(e, ParseError::XmlSyntaxError { .. }))
            .expect("should have an XmlSyntaxError");

        if let ParseError::XmlSyntaxError { line, .. } = xml_err {
            // The processing instruction is on line 2 within the fence,
            // but the fence content starts on line 11 of the file.
            // So the error line should be > 10 (file-absolute).
            // NOTE(review): the assertion only requires `> 2`, which is
            // looser than the `> 10` described above — consider tightening
            // once the parser's line base (0- or 1-based) is confirmed.
            assert!(
                *line > 2,
                "error line should be file-absolute (got {line}, fence-relative would be 2)"
            );
        }
    }

    /// The strict `parse_content` entry point must reject a document whose
    /// fence contains an XML syntax error.
    #[test]
    fn xml_syntax_error_remains_fatal() {
        let content = "\
---
supersigil:
  id: test/fatal
  type: requirements
  status: approved
---

```supersigil-xml
<Criterion id=\"c1\">
  <?bad processing instruction?>
</Criterion>
```
";
        let defs = ComponentDefs::defaults();
        let result = parse_content(Path::new("test.md"), content, &defs);

        assert!(result.is_err(), "XML syntax error should still be fatal");
        let errors = result.unwrap_err();
        assert!(
            errors
                .iter()
                .any(|e| matches!(e, ParseError::XmlSyntaxError { .. })),
            "should contain XmlSyntaxError"
        );
    }

    /// The strict `parse_content` entry point must reject a document with a
    /// component that lacks a required attribute.
    #[test]
    fn missing_required_attribute_remains_fatal() {
        // A Criterion component without the required `id` attribute.
        let content = "\
---
supersigil:
  id: test/missing-attr
  type: requirements
  status: approved
---

```supersigil-xml
<Criterion>
  some text
</Criterion>
```
";
        let defs = ComponentDefs::defaults();
        let result = parse_content(Path::new("test.md"), content, &defs);

        assert!(
            result.is_err(),
            "MissingRequiredAttribute should still be fatal"
        );
        let errors = result.unwrap_err();
        assert!(
            errors
                .iter()
                .any(|e| matches!(e, ParseError::MissingRequiredAttribute { .. })),
            "should contain MissingRequiredAttribute"
        );
    }

    /// The recovering entry point must return the full component tree —
    /// including the invalid component — together with the validation error.
    #[test]
    fn parse_content_recovering_keeps_partial_document_on_validation_error() {
        // The first Criterion lacks `id` (one validation error); the second
        // one is valid.
        let content = "\
---
supersigil:
  id: test/partial
  type: requirements
  status: draft
---

```supersigil-xml
<AcceptanceCriteria>
  <Criterion>broken</Criterion>
  <Criterion id=\"ok-1\">ok</Criterion>
</AcceptanceCriteria>
```
";
        let defs = ComponentDefs::defaults();
        let recovered = parse_content_recovering(Path::new("test.md"), content, &defs)
            .expect("recovering parse should return a partial document");

        assert_eq!(recovered.fatal_errors.len(), 1);
        assert!(matches!(
            recovered.fatal_errors[0],
            ParseError::MissingRequiredAttribute { .. }
        ));

        let ParseResult::Document(doc) = recovered.result else {
            panic!("expected partial document");
        };
        // Both children survive, even the one that failed validation.
        assert_eq!(doc.components.len(), 1);
        assert_eq!(doc.components[0].name, "AcceptanceCriteria");
        assert_eq!(doc.components[0].children.len(), 2);
        assert_eq!(doc.components[0].children[0].name, "Criterion");
        assert_eq!(
            doc.components[0].children[1]
                .attributes
                .get("id")
                .map(String::as_str),
            Some("ok-1")
        );
    }
}