Skip to main content

supersigil_parser/
xml_extract.rs

1//! Component extraction from parsed XML nodes.
2//!
3//! Transforms `XmlNode` values (produced by [`crate::xml_parser`]) into
4//! [`ExtractedComponent`] values using the same `ComponentDefs`-based
5//! filtering as the previous extraction pipeline.
6//!
7//! **Key behaviors:**
8//! - Only known `PascalCase` elements (those in `ComponentDefs`) become components.
9//! - Unknown `PascalCase` elements are transparent wrappers — their children are
10//!   still traversed.
11//! - Lowercase elements are ignored (but their children are traversed).
12//! - Attributes are stored as `HashMap<String, String>` (raw strings).
//! - `body_text` is computed from `Text` children (direct, or nested inside
//!   transparent wrapper elements), trimmed, `None` if empty.
14//! - `code_blocks` is always empty in the current Markdown + XML format.
15//! - Nested child components are collected recursively.
16
17use std::collections::HashMap;
18
19use supersigil_core::{ComponentDefs, ExtractedComponent, SourcePosition};
20
21use crate::util::{is_pascal_case, line_col};
22use crate::xml_parser::XmlNode;
23
24/// Collect body text from the direct children of an XML element.
25///
26/// Concatenates `Text` node values, recursing into non-component wrapper
27/// elements (unknown `PascalCase` and lowercase). Known component children
28/// are excluded from the body text.
29/// Returns `(text, start_offset, end_offset)` where `text` is `None` if no text
30/// was found or the result is empty after trimming, `start_offset` is the byte
31/// offset of the first contributing text node, and `end_offset` is the raw source
32/// byte offset of the end of the last contributing text node.
33fn collect_body_text(
34    children: &[XmlNode],
35    defs: &ComponentDefs,
36) -> (Option<String>, Option<usize>, Option<usize>) {
37    let mut buf = String::new();
38    let mut first_offset: Option<usize> = None;
39    let mut last_end_offset: Option<usize> = None;
40    collect_text_recursive(
41        &mut buf,
42        &mut first_offset,
43        &mut last_end_offset,
44        children,
45        defs,
46    );
47    let trimmed = buf.trim();
48    if trimmed.is_empty() {
49        (None, None, None)
50    } else {
51        // Adjust offset to account for leading whitespace trimmed from the text.
52        let leading_ws = buf.len() - buf.trim_start().len();
53        let offset = first_offset.map(|o| o + leading_ws);
54        // Adjust end offset to account for trailing whitespace trimmed from the text.
55        let trailing_ws = buf.len() - buf.trim_end().len();
56        let end_offset = last_end_offset.map(|o| o - trailing_ws);
57        (Some(trimmed.to_owned()), offset, end_offset)
58    }
59}
60
61/// Recursively collect text values, skipping known component nodes.
62fn collect_text_recursive(
63    buf: &mut String,
64    first_offset: &mut Option<usize>,
65    last_end_offset: &mut Option<usize>,
66    nodes: &[XmlNode],
67    defs: &ComponentDefs,
68) {
69    for node in nodes {
70        match node {
71            XmlNode::Text {
72                content,
73                offset,
74                end_offset,
75            } => {
76                if first_offset.is_none() {
77                    *first_offset = Some(*offset);
78                }
79                *last_end_offset = Some(*end_offset);
80                buf.push_str(content);
81            }
82            XmlNode::Element { name, children, .. } => {
83                // Known components are child components — their text is excluded.
84                if defs.is_known(name) {
85                    continue;
86                }
87                // Unknown PascalCase or lowercase elements are transparent
88                // wrappers — recurse into their children.
89                collect_text_recursive(buf, first_offset, last_end_offset, children, defs);
90            }
91        }
92    }
93}
94
/// Build a `HashMap` from a slice of `(name, value)` attribute pairs.
///
/// Keys and values are cloned. If the same name appears more than once, the
/// value from the last occurrence is kept.
fn attributes_to_map(attrs: &[(String, String)]) -> HashMap<String, String> {
    let mut map = HashMap::with_capacity(attrs.len());
    for (key, value) in attrs {
        map.insert(key.clone(), value.clone());
    }
    map
}
99
100// ---------------------------------------------------------------------------
101// Extraction context
102// ---------------------------------------------------------------------------
103
104/// Shared context threaded through the recursive extraction pipeline.
105struct ExtractionCtx<'a> {
106    /// The full normalized file content, for line/column computation.
107    content: &'a str,
108    defs: &'a ComponentDefs,
109}
110
111// ---------------------------------------------------------------------------
112// Public entry point
113// ---------------------------------------------------------------------------
114
115/// Walk parsed XML nodes and extract known components as [`ExtractedComponent`]
116/// values.
117///
118/// `nodes` are the top-level `XmlNode` values from [`crate::parse_supersigil_xml`].
119/// `content` is the full normalized file content (for line/column computation).
120/// `component_defs` defines which `PascalCase` element names are known components.
121#[must_use]
122pub fn extract_components_from_xml(
123    nodes: &[XmlNode],
124    content: &str,
125    component_defs: &ComponentDefs,
126) -> Vec<ExtractedComponent> {
127    let ctx = ExtractionCtx {
128        content,
129        defs: component_defs,
130    };
131    let mut components = Vec::new();
132    collect_from_nodes(nodes, &ctx, &mut components);
133    components
134}
135
136// ---------------------------------------------------------------------------
137// Recursive helpers
138// ---------------------------------------------------------------------------
139
140/// Process a list of XML nodes, collecting known components into `out`.
141fn collect_from_nodes(
142    nodes: &[XmlNode],
143    ctx: &ExtractionCtx<'_>,
144    out: &mut Vec<ExtractedComponent>,
145) {
146    for node in nodes {
147        collect_component(node, ctx, out);
148    }
149}
150
151/// Process a single XML node. If it's a known component element, extract it;
152/// otherwise recurse into children looking for nested components.
153fn collect_component(node: &XmlNode, ctx: &ExtractionCtx<'_>, out: &mut Vec<ExtractedComponent>) {
154    match node {
155        XmlNode::Text { .. } => {}
156
157        XmlNode::Element {
158            name,
159            attributes,
160            children,
161            offset,
162            end_offset,
163        } => {
164            if !is_pascal_case(name) {
165                collect_from_nodes(children, ctx, out);
166                return;
167            }
168
169            if !ctx.defs.is_known(name) {
170                collect_from_nodes(children, ctx, out);
171                return;
172            }
173
174            // Known component — extract it.
175            let (line, column) = line_col(ctx.content, *offset);
176            let position = SourcePosition {
177                byte_offset: *offset,
178                line,
179                column,
180            };
181
182            let (end_line, end_column) = line_col(ctx.content, *end_offset);
183            let end_position = SourcePosition {
184                byte_offset: *end_offset,
185                line: end_line,
186                column: end_column,
187            };
188
189            let attrs = attributes_to_map(attributes);
190
191            let mut child_components = Vec::new();
192            collect_from_nodes(children, ctx, &mut child_components);
193
194            let (body_text, body_text_offset, body_text_end_offset) =
195                collect_body_text(children, ctx.defs);
196
197            out.push(ExtractedComponent {
198                name: name.clone(),
199                attributes: attrs,
200                children: child_components,
201                body_text,
202                body_text_offset,
203                body_text_end_offset,
204                code_blocks: Vec::new(),
205                position,
206                end_position,
207            });
208        }
209    }
210}
211
212// ---------------------------------------------------------------------------
213// Tests
214// ---------------------------------------------------------------------------
215
#[cfg(test)]
mod tests {
    //! Unit tests for component extraction.
    //!
    //! Most tests build `XmlNode` trees by hand rather than running the XML
    //! parser. Unless a test asserts on end positions, element `end_offset`
    //! values are left as a placeholder `0`.

    use super::*;

    /// Thin wrapper around the public entry point to keep test bodies short.
    fn extract(nodes: &[XmlNode], content: &str, defs: &ComponentDefs) -> Vec<ExtractedComponent> {
        extract_components_from_xml(nodes, content, defs)
    }

    /// Helper to build a `XmlNode::Text` for tests (offset 0, `end_offset` = len).
    fn text(s: &str) -> XmlNode {
        XmlNode::Text {
            content: s.into(),
            offset: 0,
            end_offset: s.len(),
        }
    }

    // -- Known component extraction ----------------------------------------

    /// A known element yields one component with its name, attributes, body
    /// text, and start/end positions.
    #[test]
    fn extracts_known_component() {
        let defs = ComponentDefs::defaults();
        let content = "0123456789<Criterion id=\"c1\">Some text</Criterion>";
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![("id".into(), "c1".into())],
            children: vec![text("Some text")],
            offset: 10,
            end_offset: content.len(),
        }];

        let result = extract(&nodes, content, &defs);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].name, "Criterion");
        assert_eq!(result[0].attributes["id"], "c1");
        assert_eq!(result[0].body_text.as_deref(), Some("Some text"));
        assert_eq!(result[0].position.byte_offset, 10);
        assert_eq!(result[0].end_position.byte_offset, content.len());
        assert_eq!(result[0].end_position.line, 1);
        assert_eq!(result[0].end_position.column, content.len() + 1);
    }

    /// Sibling top-level components are all extracted, in document order.
    #[test]
    fn extracts_multiple_top_level_components() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![
            XmlNode::Element {
                name: "Criterion".into(),
                attributes: vec![("id".into(), "c1".into())],
                children: vec![text("text")],
                offset: 0,
                end_offset: 0,
            },
            XmlNode::Element {
                name: "VerifiedBy".into(),
                attributes: vec![("refs".into(), "c1".into())],
                children: vec![],
                offset: 50,
                end_offset: 0,
            },
        ];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(result.len(), 2);
        assert_eq!(result[0].name, "Criterion");
        assert_eq!(result[1].name, "VerifiedBy");
    }

    // -- Unknown PascalCase transparency -----------------------------------

    /// An unknown `PascalCase` element is skipped, but components inside it
    /// are still found.
    #[test]
    fn unknown_pascal_case_is_transparent_wrapper() {
        let defs = ComponentDefs::defaults();
        // <Aside> is not a known component — children should be traversed.
        let nodes = vec![XmlNode::Element {
            name: "Aside".into(),
            attributes: vec![],
            children: vec![XmlNode::Element {
                name: "Criterion".into(),
                attributes: vec![("id".into(), "c1".into())],
                children: vec![],
                offset: 20,
                end_offset: 0,
            }],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        // Aside should NOT appear; Criterion should.
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].name, "Criterion");
    }

    /// Transparency holds through multiple levels of unknown wrappers.
    #[test]
    fn deeply_nested_unknown_wrappers_are_transparent() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "Wrapper".into(),
            attributes: vec![],
            children: vec![XmlNode::Element {
                name: "Inner".into(),
                attributes: vec![],
                children: vec![XmlNode::Element {
                    name: "Criterion".into(),
                    attributes: vec![("id".into(), "deep".into())],
                    children: vec![],
                    offset: 40,
                    end_offset: 0,
                }],
                offset: 20,
                end_offset: 0,
            }],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].name, "Criterion");
        assert_eq!(result[0].attributes["id"], "deep");
    }

    // -- Lowercase element ignoring ----------------------------------------

    /// A lowercase element is not a component, but its children are traversed.
    #[test]
    fn lowercase_elements_are_ignored() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "div".into(),
            attributes: vec![],
            children: vec![XmlNode::Element {
                name: "Criterion".into(),
                attributes: vec![("id".into(), "c1".into())],
                children: vec![],
                offset: 10,
                end_offset: 0,
            }],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        // div should not appear, but its child Criterion should.
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].name, "Criterion");
    }

    // -- Attribute extraction ----------------------------------------------

    /// Every attribute pair ends up in the component's attribute map unchanged.
    #[test]
    fn attributes_stored_as_raw_strings() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![
                ("id".into(), "c1".into()),
                ("strategy".into(), "tag".into()),
            ],
            children: vec![],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(result[0].attributes.len(), 2);
        assert_eq!(result[0].attributes["id"], "c1");
        assert_eq!(result[0].attributes["strategy"], "tag");
    }

    /// An element with no children has no child components and no body text.
    #[test]
    fn self_closing_element_has_empty_children_and_no_body_text() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "VerifiedBy".into(),
            attributes: vec![("refs".into(), "c1".into())],
            children: vec![],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(result.len(), 1);
        assert!(result[0].children.is_empty());
        assert_eq!(result[0].body_text, None);
    }

    // -- Body text computation ---------------------------------------------

    /// Body text is the text child's content with surrounding whitespace trimmed.
    #[test]
    fn body_text_from_text_children() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![("id".into(), "c1".into())],
            children: vec![text("\n  The system shall do something.\n")],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(
            result[0].body_text.as_deref(),
            Some("The system shall do something.")
        );
    }

    /// Whitespace-only text yields `None` rather than an empty string.
    #[test]
    fn body_text_none_for_whitespace_only() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![("id".into(), "c1".into())],
            children: vec![text("   \n  \n  ")],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(result[0].body_text, None);
    }

    /// Text belonging to a known child component is not part of the parent's
    /// body text.
    #[test]
    fn body_text_excludes_known_child_components() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "AcceptanceCriteria".into(),
            attributes: vec![],
            children: vec![
                text("Parent text"),
                XmlNode::Element {
                    name: "Criterion".into(),
                    attributes: vec![("id".into(), "c1".into())],
                    children: vec![text("Child text")],
                    offset: 30,
                    end_offset: 0,
                },
            ],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].name, "AcceptanceCriteria");
        // Body text should contain "Parent text" but NOT "Child text"
        assert_eq!(result[0].body_text.as_deref(), Some("Parent text"));
    }

    /// Text nested inside an unknown wrapper element counts toward body text.
    #[test]
    fn body_text_includes_text_from_unknown_wrapper() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![("id".into(), "c1".into())],
            children: vec![XmlNode::Element {
                name: "Emphasis".into(),
                attributes: vec![],
                children: vec![text("important")],
                offset: 20,
                end_offset: 0,
            }],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        // Emphasis is unknown PascalCase — transparent for body text.
        assert_eq!(result[0].body_text.as_deref(), Some("important"));
    }

    // -- Nested children ---------------------------------------------------

    /// Known components nested in a known component become its `children`,
    /// each with its own attributes and body text.
    #[test]
    fn nested_child_components_collected() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "AcceptanceCriteria".into(),
            attributes: vec![],
            children: vec![
                XmlNode::Element {
                    name: "Criterion".into(),
                    attributes: vec![("id".into(), "c1".into())],
                    children: vec![text("First")],
                    offset: 20,
                    end_offset: 0,
                },
                XmlNode::Element {
                    name: "Criterion".into(),
                    attributes: vec![("id".into(), "c2".into())],
                    children: vec![text("Second")],
                    offset: 60,
                    end_offset: 0,
                },
            ],
            offset: 0,
            end_offset: 0,
        }];
        let content = &"x".repeat(100);

        let result = extract(&nodes, content, &defs);
        assert_eq!(result.len(), 1);
        assert_eq!(result[0].name, "AcceptanceCriteria");
        assert_eq!(result[0].children.len(), 2);
        assert_eq!(result[0].children[0].name, "Criterion");
        assert_eq!(result[0].children[0].attributes["id"], "c1");
        assert_eq!(result[0].children[0].body_text.as_deref(), Some("First"));
        assert_eq!(result[0].children[1].name, "Criterion");
        assert_eq!(result[0].children[1].attributes["id"], "c2");
        assert_eq!(result[0].children[1].body_text.as_deref(), Some("Second"));
    }

    // -- Position computation ----------------------------------------------

    /// An offset at the start of the third line maps to line 3, column 1.
    #[test]
    fn position_computed_from_byte_offset() {
        let defs = ComponentDefs::defaults();
        // Content: "line1\nline2\n<Criterion>" — offset 12 is line 3, column 1.
        let content = "line1\nline2\n<Criterion id=\"c1\" />";
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![("id".into(), "c1".into())],
            children: vec![],
            offset: 12,
            end_offset: 0,
        }];

        let result = extract(&nodes, content, &defs);
        assert_eq!(result[0].position.byte_offset, 12);
        assert_eq!(result[0].position.line, 3);
        assert_eq!(result[0].position.column, 1);
    }

    /// An offset after two leading spaces on the second line maps to line 2,
    /// column 3.
    #[test]
    fn position_mid_line() {
        let defs = ComponentDefs::defaults();
        // Offset 9 in "abcdef\n  <Cr" is line 2, column 3.
        let content = "abcdef\n  <Criterion />";
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![("id".into(), "c1".into())],
            children: vec![],
            offset: 9, // "abcdef\n  " = 7 + 2 = 9
            end_offset: 0,
        }];

        let result = extract(&nodes, content, &defs);
        assert_eq!(result[0].position.byte_offset, 9);
        assert_eq!(result[0].position.line, 2);
        assert_eq!(result[0].position.column, 3);
    }

    // -- Empty input -------------------------------------------------------

    /// No nodes in, no components out.
    #[test]
    fn empty_nodes_produces_empty_result() {
        let defs = ComponentDefs::defaults();
        let result = extract(&[], "", &defs);
        assert!(result.is_empty());
    }

    // -- Text-only nodes at top level --------------------------------------

    /// Top-level text nodes never become components.
    #[test]
    fn text_only_nodes_produce_no_components() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![text("just some text")];
        let result = extract(&nodes, "just some text", &defs);
        assert!(result.is_empty());
    }

    // -- Realistic example -------------------------------------------------

    /// End-to-end shape check on a spec-like document: a parent with one
    /// nested criterion, followed by a sibling `VerifiedBy`.
    #[test]
    fn realistic_spec_extraction() {
        let defs = ComponentDefs::defaults();
        let content = r#"---
supersigil:
  id: test-spec
---

```supersigil-xml
<AcceptanceCriteria>
  <Criterion id="perf-latency" strategy="tag">
    P99 latency must be under 100ms for API requests.
  </Criterion>
</AcceptanceCriteria>
<VerifiedBy refs="perf-latency" />
```
"#;
        // Simulate offsets as if the XML parser produced them.
        // The actual byte offsets would be computed by the XML parser;
        // here we just test the extraction logic.
        let nodes = vec![
            XmlNode::Element {
                name: "AcceptanceCriteria".into(),
                attributes: vec![],
                children: vec![XmlNode::Element {
                    name: "Criterion".into(),
                    attributes: vec![
                        ("id".into(), "perf-latency".into()),
                        ("strategy".into(), "tag".into()),
                    ],
                    children: vec![text(
                        "\n    P99 latency must be under 100ms for API requests.\n  ",
                    )],
                    offset: 70,
                    end_offset: 0,
                }],
                offset: 50,
                end_offset: 0,
            },
            XmlNode::Element {
                name: "VerifiedBy".into(),
                attributes: vec![("refs".into(), "perf-latency".into())],
                children: vec![],
                offset: 160,
                end_offset: 0,
            },
        ];

        let result = extract(&nodes, content, &defs);
        assert_eq!(result.len(), 2);

        // AcceptanceCriteria
        assert_eq!(result[0].name, "AcceptanceCriteria");
        assert!(result[0].attributes.is_empty());
        assert_eq!(result[0].children.len(), 1);

        // Nested Criterion
        let criterion = &result[0].children[0];
        assert_eq!(criterion.name, "Criterion");
        assert_eq!(criterion.attributes["id"], "perf-latency");
        assert_eq!(criterion.attributes["strategy"], "tag");
        assert_eq!(
            criterion.body_text.as_deref(),
            Some("P99 latency must be under 100ms for API requests.")
        );

        // VerifiedBy
        assert_eq!(result[1].name, "VerifiedBy");
        assert_eq!(result[1].attributes["refs"], "perf-latency");
        assert_eq!(result[1].body_text, None);
        assert!(result[1].children.is_empty());
    }

    // -- Direct public API call -------------------------------------------

    /// Calls the public function directly instead of going through `extract`.
    #[test]
    fn public_api_extracts_components() {
        let defs = ComponentDefs::defaults();
        let nodes = vec![XmlNode::Element {
            name: "Criterion".into(),
            attributes: vec![("id".into(), "c1".into())],
            children: vec![],
            offset: 0,
            end_offset: 0,
        }];

        let result = extract_components_from_xml(&nodes, "x", &defs);

        assert_eq!(result.len(), 1);
    }
}
689}