nu_command/formats/from/
xml.rs

1use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLUMN_TAG_NAME};
2use indexmap::IndexMap;
3use nu_engine::command_prelude::*;
4
5use roxmltree::{NodeType, ParsingOptions, TextPos};
6
/// The `from xml` command: turns XML text received on the pipeline into a record.
#[derive(Clone)]
pub struct FromXml;
9
10impl Command for FromXml {
11    fn name(&self) -> &str {
12        "from xml"
13    }
14
15    fn signature(&self) -> Signature {
16        Signature::build("from xml")
17            .input_output_types(vec![(Type::String, Type::record())])
18            .switch("keep-comments", "add comment nodes to result", None)
19            .switch(
20                "allow-dtd",
21                "allow parsing documents with DTDs (may result in exponential entity expansion)",
22                None,
23            )
24            .switch(
25                "keep-pi",
26                "add processing instruction nodes to result",
27                None,
28            )
29            .category(Category::Formats)
30    }
31
32    fn description(&self) -> &str {
33        "Parse text as .xml and create record."
34    }
35
36    fn extra_description(&self) -> &str {
37        r#"Every XML entry is represented via a record with tag, attribute and content fields.
38To represent different types of entries different values are written to this fields:
391. Tag entry: `{tag: <tag name> attrs: {<attr name>: "<string value>" ...} content: [<entries>]}`
402. Comment entry: `{tag: '!' attrs: null content: "<comment string>"}`
413. Processing instruction (PI): `{tag: '?<pi name>' attrs: null content: "<pi content string>"}`
424. Text: `{tag: null attrs: null content: "<text>"}`.
43
44Unlike to xml command all null values are always present and text is never represented via plain
45string. This way content of every tag is always a table and is easier to parse"#
46    }
47
48    fn run(
49        &self,
50        engine_state: &EngineState,
51        stack: &mut Stack,
52        call: &Call,
53        input: PipelineData,
54    ) -> Result<PipelineData, ShellError> {
55        let head = call.head;
56        let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
57        let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
58        let allow_dtd = call.has_flag(engine_state, stack, "allow-dtd")?;
59        let info = ParsingInfo {
60            span: head,
61            keep_comments,
62            keep_processing_instructions,
63            allow_dtd,
64        };
65        from_xml(input, &info)
66    }
67
68    fn examples(&self) -> Vec<Example> {
69        vec![Example {
70            example: r#"'<?xml version="1.0" encoding="UTF-8"?>
71<note>
72  <remember>Event</remember>
73</note>' | from xml"#,
74            description: "Converts xml formatted string to record",
75            result: Some(Value::test_record(record! {
76                COLUMN_TAG_NAME =>     Value::test_string("note"),
77                COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
78                COLUMN_CONTENT_NAME => Value::test_list(vec![
79                Value::test_record(record! {
80                    COLUMN_TAG_NAME =>     Value::test_string("remember"),
81                    COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
82                    COLUMN_CONTENT_NAME => Value::test_list(vec![
83                    Value::test_record(record! {
84                        COLUMN_TAG_NAME =>     Value::test_nothing(),
85                        COLUMN_ATTRS_NAME =>   Value::test_nothing(),
86                        COLUMN_CONTENT_NAME => Value::test_string("Event"),
87                        })],
88                    ),
89                    })],
90                ),
91            })),
92        }]
93    }
94}
95
96struct ParsingInfo {
97    span: Span,
98    keep_comments: bool,
99    keep_processing_instructions: bool,
100    allow_dtd: bool,
101}
102
103fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
104    let mut collected = IndexMap::new();
105    for a in attributes {
106        collected.insert(String::from(a.name()), Value::string(a.value(), info.span));
107    }
108    Value::record(collected.into_iter().collect(), info.span)
109}
110
111fn element_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Value {
112    let span = info.span;
113    let mut node = IndexMap::new();
114
115    let tag = n.tag_name().name().trim().to_string();
116    let tag = Value::string(tag, span);
117
118    let content: Vec<Value> = n
119        .children()
120        .filter_map(|node| from_node_to_value(&node, info))
121        .collect();
122    let content = Value::list(content, span);
123
124    let attributes = from_attributes_to_value(&n.attributes().collect::<Vec<_>>(), info);
125
126    node.insert(String::from(COLUMN_TAG_NAME), tag);
127    node.insert(String::from(COLUMN_ATTRS_NAME), attributes);
128    node.insert(String::from(COLUMN_CONTENT_NAME), content);
129
130    Value::record(node.into_iter().collect(), span)
131}
132
133fn text_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
134    let span = info.span;
135    let text = n.text().expect("Non-text node supplied to text_to_value");
136    let text = text.trim();
137    if text.is_empty() {
138        None
139    } else {
140        let mut node = IndexMap::new();
141        let content = Value::string(String::from(text), span);
142
143        node.insert(String::from(COLUMN_TAG_NAME), Value::nothing(span));
144        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
145        node.insert(String::from(COLUMN_CONTENT_NAME), content);
146
147        Some(Value::record(node.into_iter().collect(), span))
148    }
149}
150
151fn comment_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
152    if info.keep_comments {
153        let span = info.span;
154        let text = n
155            .text()
156            .expect("Non-comment node supplied to comment_to_value");
157
158        let mut node = IndexMap::new();
159        let content = Value::string(String::from(text), span);
160
161        node.insert(String::from(COLUMN_TAG_NAME), Value::string("!", span));
162        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
163        node.insert(String::from(COLUMN_CONTENT_NAME), content);
164
165        Some(Value::record(node.into_iter().collect(), span))
166    } else {
167        None
168    }
169}
170
171fn processing_instruction_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
172    if info.keep_processing_instructions {
173        let span = info.span;
174        let pi = n.pi()?;
175
176        let mut node = IndexMap::new();
177        // Add '?' before target to differentiate tags from pi targets
178        let tag = format!("?{}", pi.target);
179        let tag = Value::string(tag, span);
180        let content = pi
181            .value
182            .map_or_else(|| Value::nothing(span), |x| Value::string(x, span));
183
184        node.insert(String::from(COLUMN_TAG_NAME), tag);
185        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
186        node.insert(String::from(COLUMN_CONTENT_NAME), content);
187
188        Some(Value::record(node.into_iter().collect(), span))
189    } else {
190        None
191    }
192}
193
194fn from_node_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
195    match n.node_type() {
196        NodeType::Element => Some(element_to_value(n, info)),
197        NodeType::Text => text_to_value(n, info),
198        NodeType::Comment => comment_to_value(n, info),
199        NodeType::PI => processing_instruction_to_value(n, info),
200        _ => None,
201    }
202}
203
204fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value {
205    element_to_value(&d.root_element(), info)
206}
207
208fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
209    let options = ParsingOptions {
210        allow_dtd: info.allow_dtd,
211        ..Default::default()
212    };
213
214    let parsed = roxmltree::Document::parse_with_options(s, options)?;
215    Ok(from_document_to_value(&parsed, info))
216}
217
218fn from_xml(input: PipelineData, info: &ParsingInfo) -> Result<PipelineData, ShellError> {
219    let (concat_string, span, metadata) = input.collect_string_strict(info.span)?;
220
221    match from_xml_string_to_value(&concat_string, info) {
222        Ok(x) => {
223            Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
224        }
225        Err(err) => Err(process_xml_parse_error(concat_string, err, span)),
226    }
227}
228
229fn process_xml_parse_error(source: String, err: roxmltree::Error, span: Span) -> ShellError {
230    match err {
231        roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_error_spanned(
232            "The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
233            source,
234            pos,
235        ),
236        roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_error_spanned(
237            "Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace  URI.",
238            source,
239            pos,
240        ),
241        roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_error_spanned(
242            "The http://www.w3.org/2000/xmlns/  URI must not be declared.",
243            source,
244            pos,
245        ),
246        roxmltree::Error::InvalidElementNamePrefix(pos) => {
247            make_xml_error_spanned("xmlns can't be used as an element prefix.", source, pos)
248        }
249        roxmltree::Error::DuplicatedNamespace(namespace, pos) => make_xml_error_spanned(
250            format!("Namespace {namespace} was already defined on this element."),
251            source,
252            pos,
253        ),
254        roxmltree::Error::UnknownNamespace(prefix, pos) => {
255            make_xml_error_spanned(format!("Unknown prefix {}", prefix), source, pos)
256        }
257        roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => make_xml_error_spanned(
258            format!("Unexpected close tag {actual}, expected {expected}"),
259            source,
260            pos,
261        ),
262        roxmltree::Error::UnexpectedEntityCloseTag(pos) => {
263            make_xml_error_spanned("Entity value starts with a close tag.", source, pos)
264        }
265        roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_error_spanned(
266            format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
267            source,
268            pos,
269        ),
270        roxmltree::Error::MalformedEntityReference(pos) => {
271            make_xml_error_spanned("Malformed entity reference.", source, pos)
272        }
273        roxmltree::Error::EntityReferenceLoop(pos) => {
274            make_xml_error_spanned("Possible entity reference loop.", source, pos)
275        }
276        roxmltree::Error::InvalidAttributeValue(pos) => {
277            make_xml_error_spanned("Attribute value cannot have a < character.", source, pos)
278        }
279        roxmltree::Error::DuplicatedAttribute(attribute, pos) => make_xml_error_spanned(
280            format!("Element has a duplicated attribute: {attribute}"),
281            source,
282            pos,
283        ),
284        roxmltree::Error::NoRootNode => {
285            make_xml_error("The XML document must have at least one element.", span)
286        }
287        roxmltree::Error::UnclosedRootNode => {
288            make_xml_error("The root node was opened but never closed.", span)
289        }
290        roxmltree::Error::DtdDetected => make_xml_error(
291            "XML document with DTD detected.\nDTDs are disabled by default to prevent denial-of-service attacks (use --allow-dtd to parse anyway)",
292            span,
293        ),
294        roxmltree::Error::NodesLimitReached => make_xml_error("Node limit was reached.", span),
295        roxmltree::Error::AttributesLimitReached => make_xml_error("Attribute limit reached", span),
296        roxmltree::Error::NamespacesLimitReached => make_xml_error("Namespace limit reached", span),
297        roxmltree::Error::UnexpectedDeclaration(pos) => make_xml_error_spanned(
298            "An XML document can have only one XML declaration and it must be at the start of the document.",
299            source,
300            pos,
301        ),
302        roxmltree::Error::InvalidName(pos) => make_xml_error_spanned("Invalid name.", source, pos),
303        roxmltree::Error::NonXmlChar(_, pos) => make_xml_error_spanned(
304            "Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>",
305            source,
306            pos,
307        ),
308        roxmltree::Error::InvalidChar(expected, actual, pos) => make_xml_error_spanned(
309            format!(
310                "Unexpected character {}, expected {}",
311                actual as char, expected as char
312            ),
313            source,
314            pos,
315        ),
316        roxmltree::Error::InvalidChar2(expected, actual, pos) => make_xml_error_spanned(
317            format!(
318                "Unexpected character {}, expected {}",
319                actual as char, expected
320            ),
321            source,
322            pos,
323        ),
324        roxmltree::Error::InvalidString(_, pos) => {
325            make_xml_error_spanned("Invalid/unexpected string in XML.", source, pos)
326        }
327        roxmltree::Error::InvalidExternalID(pos) => {
328            make_xml_error_spanned("Invalid ExternalID in the DTD.", source, pos)
329        }
330        roxmltree::Error::InvalidComment(pos) => make_xml_error_spanned(
331            "A comment cannot contain `--` or end with `-`.",
332            source,
333            pos,
334        ),
335        roxmltree::Error::InvalidCharacterData(pos) => make_xml_error_spanned(
336            "Character Data node contains an invalid data. Currently, only `]]>` is not allowed.",
337            source,
338            pos,
339        ),
340        roxmltree::Error::UnknownToken(pos) => {
341            make_xml_error_spanned("Unknown token in XML.", source, pos)
342        }
343        roxmltree::Error::UnexpectedEndOfStream => {
344            make_xml_error("Unexpected end of stream while parsing XML.", span)
345        }
346    }
347}
348
349fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
350    ShellError::GenericError {
351        error: "Failed to parse XML".into(),
352        msg: msg.into(),
353        help: None,
354        span: Some(span),
355        inner: vec![],
356    }
357}
358
359fn make_xml_error_spanned(msg: impl Into<String>, src: String, pos: TextPos) -> ShellError {
360    let span = Span::from_row_column(pos.row as usize, pos.col as usize, &src);
361    ShellError::OutsideSpannedLabeledError {
362        src,
363        error: "Failed to parse XML".into(),
364        msg: msg.into(),
365        span,
366    }
367}
368
#[cfg(test)]
mod tests {
    use crate::Metadata;
    use crate::MetadataSet;

    use super::*;

    use indexmap::IndexMap;
    use indexmap::indexmap;
    use nu_cmd_lang::eval_pipeline_without_terminal_expression;

    /// Shorthand for a test string value.
    fn string(input: impl Into<String>) -> Value {
        Value::test_string(input)
    }

    /// Builds the expected `attrs` record from name/value pairs.
    fn attributes(entries: IndexMap<&str, &str>) -> Value {
        let record: Record = entries
            .into_iter()
            .map(|(name, value)| (String::from(name), string(value)))
            .collect();
        Value::test_record(record)
    }

    /// Builds the expected `content` list from already-built entries.
    fn table(list: &[Value]) -> Value {
        Value::test_list(list.to_vec())
    }

    /// Expected record for an element entry.
    fn content_tag(
        tag: impl Into<String>,
        attrs: IndexMap<&str, &str>,
        content: &[Value],
    ) -> Value {
        let tag = string(tag);
        let attrs = attributes(attrs);
        let content = table(content);
        Value::test_record(record! {
            COLUMN_TAG_NAME =>     tag,
            COLUMN_ATTRS_NAME =>   attrs,
            COLUMN_CONTENT_NAME => content,
        })
    }

    /// Expected record for a text entry: null tag, null attrs, string content.
    fn content_string(value: impl Into<String>) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME =>     Value::test_nothing(),
            COLUMN_ATTRS_NAME =>   Value::test_nothing(),
            COLUMN_CONTENT_NAME => string(value),
        })
    }

    /// Parses `xml` with every optional feature switched off.
    fn parse(xml: &str) -> Result<Value, roxmltree::Error> {
        let info = ParsingInfo {
            span: Span::test_data(),
            keep_comments: false,
            keep_processing_instructions: false,
            allow_dtd: false,
        };
        from_xml_string_to_value(xml, &info)
    }

    #[test]
    fn parses_empty_element() -> Result<(), roxmltree::Error> {
        let source = "<nu></nu>";

        let expected = content_tag("nu", indexmap! {}, &[]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_text() -> Result<(), roxmltree::Error> {
        let source = "<nu>La era de los tres caballeros</nu>";

        let expected = content_tag(
            "nu",
            indexmap! {},
            &[content_string("La era de los tres caballeros")],
        );
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_elements() -> Result<(), roxmltree::Error> {
        let source = "\
<nu>
    <dev>Andrés</dev>
    <dev>JT</dev>
    <dev>Yehuda</dev>
</nu>";

        let devs = [
            content_tag("dev", indexmap! {}, &[content_string("Andrés")]),
            content_tag("dev", indexmap! {}, &[content_string("JT")]),
            content_tag("dev", indexmap! {}, &[content_string("Yehuda")]),
        ];
        let expected = content_tag("nu", indexmap! {}, &devs);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
</nu>";

        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute_and_element() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
    <version>2.0</version>
</nu>";

        let inner = content_tag("version", indexmap! {}, &[content_string("2.0")]);
        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[inner]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_multiple_attributes() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\" age=\"25\">
</nu>";

        let expected = content_tag("nu", indexmap! {"version" => "2.0", "age" => "25"}, &[]);
        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn test_examples() {
        use crate::test_examples;

        test_examples(FromXml {})
    }

    /// `from xml` must clear the content type while keeping the rest of the metadata.
    #[test]
    fn test_content_type_metadata() {
        let mut engine_state = Box::new(EngineState::new());

        // Register just the commands the pipeline below needs.
        let delta = {
            let mut working_set = StateWorkingSet::new(&engine_state);

            working_set.add_decl(Box::new(FromXml {}));
            working_set.add_decl(Box::new(Metadata {}));
            working_set.add_decl(Box::new(MetadataSet {}));

            working_set.render()
        };

        engine_state
            .merge_delta(delta)
            .expect("Error merging delta");

        let cmd = r#"'<?xml version="1.0" encoding="UTF-8"?>
<note>
  <remember>Event</remember>
</note>' | metadata set --content-type 'application/xml' --datasource-ls | from xml | metadata | $in"#;
        let result = eval_pipeline_without_terminal_expression(
            cmd,
            std::env::temp_dir().as_ref(),
            &mut engine_state,
        );

        let expected = Value::test_record(record!("source" => Value::test_string("ls")));
        assert_eq!(expected, result.expect("There should be a result"))
    }
}