Skip to main content

nu_command/formats/from/
xml.rs

1use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLUMN_TAG_NAME};
2use indexmap::IndexMap;
3use nu_engine::command_prelude::*;
4use nu_protocol::{
5    DEFAULT_ERROR_CONTEXT, Signals, shell_error::generic::GenericError, truncated_source_window,
6};
7
8use roxmltree::{NodeType, ParsingOptions, TextPos};
9
10#[derive(Clone)]
11pub struct FromXml;
12
13impl Command for FromXml {
14    fn name(&self) -> &str {
15        "from xml"
16    }
17
18    fn signature(&self) -> Signature {
19        Signature::build("from xml")
20            .input_output_types(vec![(Type::String, Type::record())])
21            .switch("keep-comments", "Add comment nodes to result.", None)
22            .switch(
23                "allow-dtd",
24                "Allow parsing documents with DTDs (may result in exponential entity expansion).",
25                None,
26            )
27            .switch(
28                "keep-pi",
29                "Add processing instruction nodes to result.",
30                None,
31            )
32            .category(Category::Formats)
33    }
34
35    fn description(&self) -> &str {
36        "Parse text as .xml and create record."
37    }
38
39    fn extra_description(&self) -> &str {
40        r#"Every XML entry is represented via a record with tag, attribute and content fields.
41To represent different types of entries different values are written to this fields:
421. Tag entry: `{tag: <tag name> attrs: {<attr name>: "<string value>" ...} content: [<entries>]}`
432. Comment entry: `{tag: '!' attrs: null content: "<comment string>"}`
443. Processing instruction (PI): `{tag: '?<pi name>' attrs: null content: "<pi content string>"}`
454. Text: `{tag: null attrs: null content: "<text>"}`.
46
47Unlike to xml command all null values are always present and text is never represented via plain
48string. This way content of every tag is always a table and is easier to parse"#
49    }
50
51    fn run(
52        &self,
53        engine_state: &EngineState,
54        stack: &mut Stack,
55        call: &Call,
56        input: PipelineData,
57    ) -> Result<PipelineData, ShellError> {
58        let head = call.head;
59        let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
60        let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
61        let allow_dtd = call.has_flag(engine_state, stack, "allow-dtd")?;
62        let info = ParsingInfo {
63            span: head,
64            keep_comments,
65            keep_processing_instructions,
66            allow_dtd,
67        };
68        from_xml(input, &info, engine_state.signals())
69    }
70
71    fn examples(&self) -> Vec<Example<'_>> {
72        vec![Example {
73            example: r#"'<?xml version="1.0" encoding="UTF-8"?>
74<note>
75  <remember>Event</remember>
76</note>' | from xml"#,
77            description: "Converts xml formatted string to record.",
78            result: Some(Value::test_record(record! {
79                COLUMN_TAG_NAME =>     Value::test_string("note"),
80                COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
81                COLUMN_CONTENT_NAME => Value::test_list(vec![
82                Value::test_record(record! {
83                    COLUMN_TAG_NAME =>     Value::test_string("remember"),
84                    COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
85                    COLUMN_CONTENT_NAME => Value::test_list(vec![
86                    Value::test_record(record! {
87                        COLUMN_TAG_NAME =>     Value::test_nothing(),
88                        COLUMN_ATTRS_NAME =>   Value::test_nothing(),
89                        COLUMN_CONTENT_NAME => Value::test_string("Event"),
90                        })],
91                    ),
92                    })],
93                ),
94            })),
95        }]
96    }
97}
98
99struct ParsingInfo {
100    span: Span,
101    keep_comments: bool,
102    keep_processing_instructions: bool,
103    allow_dtd: bool,
104}
105
106fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
107    let mut collected = IndexMap::new();
108    for a in attributes {
109        collected.insert(String::from(a.name()), Value::string(a.value(), info.span));
110    }
111    Value::record(collected.into_iter().collect(), info.span)
112}
113
114fn element_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Value {
115    let span = info.span;
116    let mut node = IndexMap::new();
117
118    let tag = n.tag_name().name().trim().to_string();
119    let tag = Value::string(tag, span);
120
121    let content: Vec<Value> = n
122        .children()
123        .filter_map(|node| from_node_to_value(&node, info))
124        .collect();
125    let content = Value::list(content, span);
126
127    let attributes = from_attributes_to_value(&n.attributes().collect::<Vec<_>>(), info);
128
129    node.insert(String::from(COLUMN_TAG_NAME), tag);
130    node.insert(String::from(COLUMN_ATTRS_NAME), attributes);
131    node.insert(String::from(COLUMN_CONTENT_NAME), content);
132
133    Value::record(node.into_iter().collect(), span)
134}
135
136fn text_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
137    let span = info.span;
138    let text = n.text().expect("Non-text node supplied to text_to_value");
139    let text = text.trim();
140    if text.is_empty() {
141        None
142    } else {
143        let mut node = IndexMap::new();
144        let content = Value::string(String::from(text), span);
145
146        node.insert(String::from(COLUMN_TAG_NAME), Value::nothing(span));
147        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
148        node.insert(String::from(COLUMN_CONTENT_NAME), content);
149
150        Some(Value::record(node.into_iter().collect(), span))
151    }
152}
153
154fn comment_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
155    if info.keep_comments {
156        let span = info.span;
157        let text = n
158            .text()
159            .expect("Non-comment node supplied to comment_to_value");
160
161        let mut node = IndexMap::new();
162        let content = Value::string(String::from(text), span);
163
164        node.insert(String::from(COLUMN_TAG_NAME), Value::string("!", span));
165        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
166        node.insert(String::from(COLUMN_CONTENT_NAME), content);
167
168        Some(Value::record(node.into_iter().collect(), span))
169    } else {
170        None
171    }
172}
173
174fn processing_instruction_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
175    if info.keep_processing_instructions {
176        let span = info.span;
177        let pi = n.pi()?;
178
179        let mut node = IndexMap::new();
180        // Add '?' before target to differentiate tags from pi targets
181        let tag = format!("?{}", pi.target);
182        let tag = Value::string(tag, span);
183        let content = pi
184            .value
185            .map_or_else(|| Value::nothing(span), |x| Value::string(x, span));
186
187        node.insert(String::from(COLUMN_TAG_NAME), tag);
188        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
189        node.insert(String::from(COLUMN_CONTENT_NAME), content);
190
191        Some(Value::record(node.into_iter().collect(), span))
192    } else {
193        None
194    }
195}
196
197fn from_node_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
198    match n.node_type() {
199        NodeType::Element => Some(element_to_value(n, info)),
200        NodeType::Text => text_to_value(n, info),
201        NodeType::Comment => comment_to_value(n, info),
202        NodeType::PI => processing_instruction_to_value(n, info),
203        _ => None,
204    }
205}
206
207fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value {
208    element_to_value(&d.root_element(), info)
209}
210
211fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
212    let options = ParsingOptions {
213        allow_dtd: info.allow_dtd,
214        ..Default::default()
215    };
216
217    let parsed = roxmltree::Document::parse_with_options(s, options)?;
218    Ok(from_document_to_value(&parsed, info))
219}
220
221fn from_xml(
222    input: PipelineData,
223    info: &ParsingInfo,
224    signals: &Signals,
225) -> Result<PipelineData, ShellError> {
226    let (concat_string, span, metadata) = input.collect_string_strict(info.span)?;
227
228    match from_xml_string_to_value(&concat_string, info) {
229        Ok(x) => {
230            Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
231        }
232        Err(err) => Err(process_xml_parse_error(concat_string, err, span, signals)),
233    }
234}
235
236fn process_xml_parse_error(
237    source: impl AsRef<str>,
238    err: roxmltree::Error,
239    span: Span,
240    signals: &Signals,
241) -> ShellError {
242    let source = source.as_ref();
243    match err {
244        roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_err(
245            source,
246            span,
247            signals,
248            "The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
249            pos,
250        ),
251        roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_err(
252            source,
253            span,
254            signals,
255            "Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace  URI.",
256            pos,
257        ),
258        roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_err(
259            source,
260            span,
261            signals,
262            "The http://www.w3.org/2000/xmlns/  URI must not be declared.",
263            pos,
264        ),
265        roxmltree::Error::InvalidElementNamePrefix(pos) => make_xml_err(
266            source,
267            span,
268            signals,
269            "xmlns can't be used as an element prefix.",
270            pos,
271        ),
272        roxmltree::Error::DuplicatedNamespace(namespace, pos) => make_xml_err(
273            source,
274            span,
275            signals,
276            format!("Namespace {namespace} was already defined on this element."),
277            pos,
278        ),
279        roxmltree::Error::UnknownNamespace(prefix, pos) => make_xml_err(
280            source,
281            span,
282            signals,
283            format!("Unknown prefix {prefix}"),
284            pos,
285        ),
286        roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => make_xml_err(
287            source,
288            span,
289            signals,
290            format!("Unexpected close tag {actual}, expected {expected}"),
291            pos,
292        ),
293        roxmltree::Error::UnexpectedEntityCloseTag(pos) => make_xml_err(
294            source,
295            span,
296            signals,
297            "Entity value starts with a close tag.",
298            pos,
299        ),
300        roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_err(
301            source,
302            span,
303            signals,
304            format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
305            pos,
306        ),
307        roxmltree::Error::MalformedEntityReference(pos) => {
308            make_xml_err(source, span, signals, "Malformed entity reference.", pos)
309        }
310        roxmltree::Error::EntityReferenceLoop(pos) => make_xml_err(
311            source,
312            span,
313            signals,
314            "Possible entity reference loop.",
315            pos,
316        ),
317        roxmltree::Error::InvalidAttributeValue(pos) => make_xml_err(
318            source,
319            span,
320            signals,
321            "Attribute value cannot have a < character.",
322            pos,
323        ),
324        roxmltree::Error::DuplicatedAttribute(attribute, pos) => make_xml_err(
325            source,
326            span,
327            signals,
328            format!("Element has a duplicated attribute: {attribute}"),
329            pos,
330        ),
331        roxmltree::Error::NoRootNode => {
332            make_xml_error("The XML document must have at least one element.", span)
333        }
334        roxmltree::Error::UnclosedRootNode => {
335            make_xml_error("The root node was opened but never closed.", span)
336        }
337        roxmltree::Error::DtdDetected => make_xml_error(
338            "XML document with DTD detected.\nDTDs are disabled by default to prevent denial-of-service attacks (use `from xml --allow-dtd` to bypass this functionality)",
339            span,
340        ),
341        roxmltree::Error::NodesLimitReached => make_xml_error("Node limit was reached.", span),
342        roxmltree::Error::AttributesLimitReached => make_xml_error("Attribute limit reached", span),
343        roxmltree::Error::NamespacesLimitReached => make_xml_error("Namespace limit reached", span),
344        roxmltree::Error::UnexpectedDeclaration(pos) => make_xml_err(
345            source,
346            span,
347            signals,
348            "An XML document can have only one XML declaration and it must be at the start of the document.",
349            pos,
350        ),
351        roxmltree::Error::InvalidName(pos) => {
352            make_xml_err(source, span, signals, "Invalid name.", pos)
353        }
354        roxmltree::Error::NonXmlChar(_, pos) => make_xml_err(
355            source,
356            span,
357            signals,
358            "Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>",
359            pos,
360        ),
361        roxmltree::Error::InvalidChar(expected, actual, pos) => make_xml_err(
362            source,
363            span,
364            signals,
365            format!(
366                "Unexpected character {}, expected {}",
367                actual as char, expected as char
368            ),
369            pos,
370        ),
371        roxmltree::Error::InvalidChar2(expected, actual, pos) => make_xml_err(
372            source,
373            span,
374            signals,
375            format!(
376                "Unexpected character {}, expected {}",
377                actual as char, expected
378            ),
379            pos,
380        ),
381        roxmltree::Error::InvalidString(_, pos) => make_xml_err(
382            source,
383            span,
384            signals,
385            "Invalid/unexpected string in XML.",
386            pos,
387        ),
388        roxmltree::Error::InvalidExternalID(pos) => {
389            make_xml_err(source, span, signals, "Invalid ExternalID in the DTD.", pos)
390        }
391        roxmltree::Error::EntityResolver(pos, msg) => make_xml_err(
392            source,
393            span,
394            signals,
395            format!("Resolving the given entity yielded an error: {msg}."),
396            pos,
397        ),
398        roxmltree::Error::InvalidComment(pos) => make_xml_err(
399            source,
400            span,
401            signals,
402            "A comment cannot contain `--` or end with `-`.",
403            pos,
404        ),
405        roxmltree::Error::InvalidCharacterData(pos) => make_xml_err(
406            source,
407            span,
408            signals,
409            "Character Data node contains an invalid data. Currently, only `]]>` is not allowed.",
410            pos,
411        ),
412        roxmltree::Error::UnknownToken(pos) => {
413            make_xml_err(source, span, signals, "Unknown token in XML.", pos)
414        }
415        roxmltree::Error::UnexpectedEndOfStream => {
416            make_xml_error("Unexpected end of stream while parsing XML.", span)
417        }
418    }
419}
420
421fn make_xml_err(
422    source: &str,
423    span: Span,
424    signals: &Signals,
425    msg: impl Into<String>,
426    pos: TextPos,
427) -> ShellError {
428    match Span::try_from_row_column(pos.row as usize, pos.col as usize, source, &span, signals) {
429        Ok(byte_span) => {
430            let (src, label_span) =
431                truncated_source_window(source, byte_span, DEFAULT_ERROR_CONTEXT);
432            ShellError::OutsideSpannedLabeledError {
433                src,
434                error: "Failed to parse XML".into(),
435                msg: msg.into(),
436                span: label_span,
437            }
438        }
439        Err(e) => e,
440    }
441}
442
443fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
444    ShellError::Generic(GenericError::new("Failed to parse XML", msg.into(), span))
445}
446
#[cfg(test)]
mod tests {
    use crate::{Metadata, MetadataSet, Reject};
    use roxmltree::ParsingOptions;

    use super::*;

    use indexmap::{IndexMap, indexmap};
    use nu_cmd_lang::eval_pipeline_without_terminal_expression;

    /// Shorthand for a test string value.
    fn string(input: impl Into<String>) -> Value {
        Value::test_string(input)
    }

    /// Builds the expected `attrs` record from name/value pairs.
    fn attributes(entries: IndexMap<&str, &str>) -> Value {
        let record: Record = entries
            .into_iter()
            .map(|(name, value)| (String::from(name), string(value)))
            .collect();
        Value::test_record(record)
    }

    /// Builds the expected `content` list.
    fn table(list: &[Value]) -> Value {
        let span = Span::test_data();
        Value::list(list.to_vec(), span)
    }

    /// Builds the expected record for an element node.
    fn content_tag(
        tag: impl Into<String>,
        attrs: IndexMap<&str, &str>,
        content: &[Value],
    ) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME =>     string(tag),
            COLUMN_ATTRS_NAME =>   attributes(attrs),
            COLUMN_CONTENT_NAME => table(content),
        })
    }

    /// Builds the expected record for a text node.
    fn content_string(value: impl Into<String>) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME =>     Value::test_nothing(),
            COLUMN_ATTRS_NAME =>   Value::test_nothing(),
            COLUMN_CONTENT_NAME => string(value),
        })
    }

    /// Parsing options used by the unit tests: every optional feature switched off.
    fn default_info() -> ParsingInfo {
        ParsingInfo {
            span: Span::test_data(),
            keep_comments: false,
            keep_processing_instructions: false,
            allow_dtd: false,
        }
    }

    /// Parses `xml` with every optional feature switched off.
    fn parse(xml: &str) -> Result<Value, roxmltree::Error> {
        from_xml_string_to_value(xml, &default_info())
    }

    #[test]
    fn parses_empty_element() -> Result<(), roxmltree::Error> {
        let source = "<nu></nu>";

        let expected = content_tag("nu", indexmap! {}, &[]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_text() -> Result<(), roxmltree::Error> {
        let source = "<nu>La era de los tres caballeros</nu>";

        let expected = content_tag(
            "nu",
            indexmap! {},
            &[content_string("La era de los tres caballeros")],
        );
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_elements() -> Result<(), roxmltree::Error> {
        let source = "\
<nu>
    <dev>Andrés</dev>
    <dev>JT</dev>
    <dev>Yehuda</dev>
</nu>";

        let devs = [
            content_tag("dev", indexmap! {}, &[content_string("Andrés")]),
            content_tag("dev", indexmap! {}, &[content_string("JT")]),
            content_tag("dev", indexmap! {}, &[content_string("Yehuda")]),
        ];
        let expected = content_tag("nu", indexmap! {}, &devs);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
</nu>";

        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute_and_element() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
    <version>2.0</version>
</nu>";

        let version = content_tag("version", indexmap! {}, &[content_string("2.0")]);
        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[version]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_multiple_attributes() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\" age=\"25\">
</nu>";

        let expected = content_tag("nu", indexmap! {"version" => "2.0", "age" => "25"}, &[]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn test_examples() -> nu_test_support::Result {
        nu_test_support::test().examples(FromXml)
    }

    #[test]
    fn test_content_type_metadata() {
        let mut engine_state = Box::new(EngineState::new());

        // Register the commands the pipeline below relies on.
        let delta = {
            let mut working_set = StateWorkingSet::new(&engine_state);

            working_set.add_decl(Box::new(FromXml {}));
            working_set.add_decl(Box::new(Metadata {}));
            working_set.add_decl(Box::new(MetadataSet {}));
            working_set.add_decl(Box::new(Reject {}));

            working_set.render()
        };
        engine_state
            .merge_delta(delta)
            .expect("Error merging delta");

        let cmd = r#"'<?xml version="1.0" encoding="UTF-8"?>
<note>
  <remember>Event</remember>
</note>' | metadata set --content-type 'application/xml' --path-columns [name] | from xml | metadata | reject span | $in"#;

        let result = eval_pipeline_without_terminal_expression(
            cmd,
            std::env::temp_dir().as_ref(),
            &mut engine_state,
        );

        // Only the path columns are expected to remain in the metadata after `from xml`.
        let expected = Value::test_record(
            record!("path_columns" => Value::test_list(vec![Value::test_string("name")])),
        );
        assert_eq!(expected, result.expect("There should be a result"))
    }

    #[test]
    fn xml_error_source_is_bounded() {
        // Build a large XML input with an error near the end
        let mut input = String::from("<root>");
        input.push_str(&"<item>value</item>".repeat(5000));
        input.push_str("<bad"); // Unclosed tag at the end (error)

        let signals = Signals::empty();
        let options = ParsingOptions {
            allow_dtd: true,
            ..Default::default()
        };
        let parse_result = roxmltree::Document::parse_with_options(&input, options);
        assert!(parse_result.is_err(), "should fail to parse");

        let err = process_xml_parse_error(
            &input,
            parse_result.unwrap_err(),
            Span::test_data(),
            &signals,
        );
        match &err {
            ShellError::OutsideSpannedLabeledError { src, .. } => {
                let len = src.len();
                assert!(
                    len < 20_000,
                    "error source should be bounded, got {} bytes",
                    len
                );
            }
            ShellError::Generic(_) => (), // Generic errors without source are also OK
            other => panic!("expected OutsideSpannedLabeledError or Generic, got {other:?}"),
        }
    }

    #[test]
    fn xml_parse_success_not_affected() {
        let source = r#"<?xml version="1.0"?><root><item>value</item></root>"#;
        let result = from_xml_string_to_value(source, &default_info());
        assert!(result.is_ok(), "valid XML should still parse");
    }
}