nu_command/formats/from/
xml.rs

1use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLUMN_TAG_NAME};
2use indexmap::IndexMap;
3use nu_engine::command_prelude::*;
4
5use roxmltree::{NodeType, ParsingOptions, TextPos};
6
/// The `from xml` command.
///
/// The type carries no state; all behavior lives in its `Command` implementation.
#[derive(Clone)]
pub struct FromXml;
9
10impl Command for FromXml {
11    fn name(&self) -> &str {
12        "from xml"
13    }
14
15    fn signature(&self) -> Signature {
16        Signature::build("from xml")
17            .input_output_types(vec![(Type::String, Type::record())])
18            .switch("keep-comments", "add comment nodes to result", None)
19            .switch(
20                "allow-dtd",
21                "allow parsing documents with DTDs (may result in exponential entity expansion)",
22                None,
23            )
24            .switch(
25                "keep-pi",
26                "add processing instruction nodes to result",
27                None,
28            )
29            .category(Category::Formats)
30    }
31
32    fn description(&self) -> &str {
33        "Parse text as .xml and create record."
34    }
35
36    fn extra_description(&self) -> &str {
37        r#"Every XML entry is represented via a record with tag, attribute and content fields.
38To represent different types of entries different values are written to this fields:
391. Tag entry: `{tag: <tag name> attrs: {<attr name>: "<string value>" ...} content: [<entries>]}`
402. Comment entry: `{tag: '!' attrs: null content: "<comment string>"}`
413. Processing instruction (PI): `{tag: '?<pi name>' attrs: null content: "<pi content string>"}`
424. Text: `{tag: null attrs: null content: "<text>"}`.
43
44Unlike to xml command all null values are always present and text is never represented via plain
45string. This way content of every tag is always a table and is easier to parse"#
46    }
47
48    fn run(
49        &self,
50        engine_state: &EngineState,
51        stack: &mut Stack,
52        call: &Call,
53        input: PipelineData,
54    ) -> Result<PipelineData, ShellError> {
55        let head = call.head;
56        let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
57        let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
58        let allow_dtd = call.has_flag(engine_state, stack, "allow-dtd")?;
59        let info = ParsingInfo {
60            span: head,
61            keep_comments,
62            keep_processing_instructions,
63            allow_dtd,
64        };
65        from_xml(input, &info)
66    }
67
68    fn examples(&self) -> Vec<Example> {
69        vec![Example {
70            example: r#"'<?xml version="1.0" encoding="UTF-8"?>
71<note>
72  <remember>Event</remember>
73</note>' | from xml"#,
74            description: "Converts xml formatted string to record",
75            result: Some(Value::test_record(record! {
76                COLUMN_TAG_NAME =>     Value::test_string("note"),
77                COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
78                COLUMN_CONTENT_NAME => Value::test_list(vec![
79                Value::test_record(record! {
80                    COLUMN_TAG_NAME =>     Value::test_string("remember"),
81                    COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
82                    COLUMN_CONTENT_NAME => Value::test_list(vec![
83                    Value::test_record(record! {
84                        COLUMN_TAG_NAME =>     Value::test_nothing(),
85                        COLUMN_ATTRS_NAME =>   Value::test_nothing(),
86                        COLUMN_CONTENT_NAME => Value::test_string("Event"),
87                        })],
88                    ),
89                    })],
90                ),
91            })),
92        }]
93    }
94}
95
96struct ParsingInfo {
97    span: Span,
98    keep_comments: bool,
99    keep_processing_instructions: bool,
100    allow_dtd: bool,
101}
102
103fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
104    let mut collected = IndexMap::new();
105    for a in attributes {
106        collected.insert(String::from(a.name()), Value::string(a.value(), info.span));
107    }
108    Value::record(collected.into_iter().collect(), info.span)
109}
110
111fn element_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Value {
112    let span = info.span;
113    let mut node = IndexMap::new();
114
115    let tag = n.tag_name().name().trim().to_string();
116    let tag = Value::string(tag, span);
117
118    let content: Vec<Value> = n
119        .children()
120        .filter_map(|node| from_node_to_value(&node, info))
121        .collect();
122    let content = Value::list(content, span);
123
124    let attributes = from_attributes_to_value(&n.attributes().collect::<Vec<_>>(), info);
125
126    node.insert(String::from(COLUMN_TAG_NAME), tag);
127    node.insert(String::from(COLUMN_ATTRS_NAME), attributes);
128    node.insert(String::from(COLUMN_CONTENT_NAME), content);
129
130    Value::record(node.into_iter().collect(), span)
131}
132
133fn text_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
134    let span = info.span;
135    let text = n.text().expect("Non-text node supplied to text_to_value");
136    let text = text.trim();
137    if text.is_empty() {
138        None
139    } else {
140        let mut node = IndexMap::new();
141        let content = Value::string(String::from(text), span);
142
143        node.insert(String::from(COLUMN_TAG_NAME), Value::nothing(span));
144        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
145        node.insert(String::from(COLUMN_CONTENT_NAME), content);
146
147        Some(Value::record(node.into_iter().collect(), span))
148    }
149}
150
151fn comment_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
152    if info.keep_comments {
153        let span = info.span;
154        let text = n
155            .text()
156            .expect("Non-comment node supplied to comment_to_value");
157
158        let mut node = IndexMap::new();
159        let content = Value::string(String::from(text), span);
160
161        node.insert(String::from(COLUMN_TAG_NAME), Value::string("!", span));
162        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
163        node.insert(String::from(COLUMN_CONTENT_NAME), content);
164
165        Some(Value::record(node.into_iter().collect(), span))
166    } else {
167        None
168    }
169}
170
171fn processing_instruction_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
172    if info.keep_processing_instructions {
173        let span = info.span;
174        let pi = n.pi()?;
175
176        let mut node = IndexMap::new();
177        // Add '?' before target to differentiate tags from pi targets
178        let tag = format!("?{}", pi.target);
179        let tag = Value::string(tag, span);
180        let content = pi
181            .value
182            .map_or_else(|| Value::nothing(span), |x| Value::string(x, span));
183
184        node.insert(String::from(COLUMN_TAG_NAME), tag);
185        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
186        node.insert(String::from(COLUMN_CONTENT_NAME), content);
187
188        Some(Value::record(node.into_iter().collect(), span))
189    } else {
190        None
191    }
192}
193
194fn from_node_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
195    match n.node_type() {
196        NodeType::Element => Some(element_to_value(n, info)),
197        NodeType::Text => text_to_value(n, info),
198        NodeType::Comment => comment_to_value(n, info),
199        NodeType::PI => processing_instruction_to_value(n, info),
200        _ => None,
201    }
202}
203
204fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value {
205    element_to_value(&d.root_element(), info)
206}
207
208fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
209    let options = ParsingOptions {
210        allow_dtd: info.allow_dtd,
211        ..Default::default()
212    };
213
214    let parsed = roxmltree::Document::parse_with_options(s, options)?;
215    Ok(from_document_to_value(&parsed, info))
216}
217
218fn from_xml(input: PipelineData, info: &ParsingInfo) -> Result<PipelineData, ShellError> {
219    let (concat_string, span, metadata) = input.collect_string_strict(info.span)?;
220
221    match from_xml_string_to_value(&concat_string, info) {
222        Ok(x) => {
223            Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
224        }
225        Err(err) => Err(process_xml_parse_error(concat_string, err, span)),
226    }
227}
228
229fn process_xml_parse_error(source: String, err: roxmltree::Error, span: Span) -> ShellError {
230    match err {
231        roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_error_spanned(
232            "The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
233            source, pos,
234        ),
235        roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_error_spanned(
236            "Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace  URI.",
237            source, pos,
238        ),
239        roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_error_spanned(
240            "The http://www.w3.org/2000/xmlns/  URI must not be declared.",
241            source, pos,
242        ),
243        roxmltree::Error::InvalidElementNamePrefix(pos) => {
244            make_xml_error_spanned("xmlns can't be used as an element prefix.", source, pos)
245        }
246        roxmltree::Error::DuplicatedNamespace(namespace, pos) => {
247            make_xml_error_spanned(format!("Namespace {namespace} was already defined on this element."), source, pos)
248        }
249        roxmltree::Error::UnknownNamespace(prefix, pos) => {
250            make_xml_error_spanned(format!("Unknown prefix {}", prefix), source, pos)
251        }
252        roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => {
253            make_xml_error_spanned(format!("Unexpected close tag {actual}, expected {expected}"), source, pos)
254        }
255        roxmltree::Error::UnexpectedEntityCloseTag(pos) => {
256            make_xml_error_spanned("Entity value starts with a close tag.", source, pos)
257        }
258        roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_error_spanned(
259            format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
260            source, pos,
261        ),
262        roxmltree::Error::MalformedEntityReference(pos) => {
263            make_xml_error_spanned("Malformed entity reference.", source, pos)
264        }
265        roxmltree::Error::EntityReferenceLoop(pos) => {
266            make_xml_error_spanned("Possible entity reference loop.", source, pos)
267        }
268        roxmltree::Error::InvalidAttributeValue(pos) => {
269            make_xml_error_spanned("Attribute value cannot have a < character.", source, pos)
270        }
271        roxmltree::Error::DuplicatedAttribute(attribute, pos) => {
272            make_xml_error_spanned(format!("Element has a duplicated attribute: {attribute}"), source, pos)
273        }
274        roxmltree::Error::NoRootNode => {
275            make_xml_error("The XML document must have at least one element.", span)
276        }
277        roxmltree::Error::UnclosedRootNode => {
278            make_xml_error("The root node was opened but never closed.", span)
279        }
280        roxmltree::Error::DtdDetected => make_xml_error(
281            "XML document with DTD detected.\nDTDs are disabled by default to prevent denial-of-service attacks (use --allow-dtd to parse anyway)",
282            span
283        ),
284        roxmltree::Error::NodesLimitReached => {
285            make_xml_error("Node limit was reached.", span)
286        }
287        roxmltree::Error::AttributesLimitReached => {
288            make_xml_error("Attribute limit reached", span)
289        }
290        roxmltree::Error::NamespacesLimitReached => {
291            make_xml_error("Namespace limit reached", span)
292        }
293        roxmltree::Error::UnexpectedDeclaration(pos) => {
294            make_xml_error_spanned("An XML document can have only one XML declaration and it must be at the start of the document.", source, pos)
295        }
296        roxmltree::Error::InvalidName(pos) => {
297            make_xml_error_spanned("Invalid name.", source, pos)
298        }
299        roxmltree::Error::NonXmlChar(_, pos) => {
300            make_xml_error_spanned("Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>", source, pos)
301        }
302        roxmltree::Error::InvalidChar(expected, actual, pos) => {
303            make_xml_error_spanned(
304                format!("Unexpected character {}, expected {}", actual as char, expected as char),
305                source,
306                pos
307            )
308        }
309        roxmltree::Error::InvalidChar2(expected, actual, pos) => {
310            make_xml_error_spanned(
311                format!("Unexpected character {}, expected {}", actual as char, expected),
312                source,
313                pos
314            )
315        }
316        roxmltree::Error::InvalidString(_, pos) => {
317            make_xml_error_spanned("Invalid/unexpected string in XML.", source, pos)
318        }
319        roxmltree::Error::InvalidExternalID(pos) => {
320            make_xml_error_spanned("Invalid ExternalID in the DTD.", source, pos)
321        }
322        roxmltree::Error::InvalidComment(pos) => {
323            make_xml_error_spanned("A comment cannot contain `--` or end with `-`.", source, pos)
324        }
325        roxmltree::Error::InvalidCharacterData(pos) => {
326            make_xml_error_spanned("Character Data node contains an invalid data. Currently, only `]]>` is not allowed.", source, pos)
327        }
328        roxmltree::Error::UnknownToken(pos) => {
329            make_xml_error_spanned("Unknown token in XML.", source, pos)
330        }
331        roxmltree::Error::UnexpectedEndOfStream => {
332            make_xml_error("Unexpected end of stream while parsing XML.", span)
333        }
334    }
335}
336
337fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
338    ShellError::GenericError {
339        error: "Failed to parse XML".into(),
340        msg: msg.into(),
341        help: None,
342        span: Some(span),
343        inner: vec![],
344    }
345}
346
347fn make_xml_error_spanned(msg: impl Into<String>, src: String, pos: TextPos) -> ShellError {
348    let span = Span::from_row_column(pos.row as usize, pos.col as usize, &src);
349    ShellError::OutsideSpannedLabeledError {
350        src,
351        error: "Failed to parse XML".into(),
352        msg: msg.into(),
353        span,
354    }
355}
356
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{Metadata, MetadataSet};

    use indexmap::{indexmap, IndexMap};
    use nu_cmd_lang::eval_pipeline_without_terminal_expression;

    /// Shorthand for a string value carrying the test span.
    fn string(input: impl Into<String>) -> Value {
        Value::test_string(input)
    }

    /// Expected attribute record built from `name => value` pairs.
    fn attributes(entries: IndexMap<&str, &str>) -> Value {
        let fields = entries
            .into_iter()
            .map(|(name, value)| (String::from(name), string(value)))
            .collect();
        Value::test_record(fields)
    }

    /// Expected `content` list of an element.
    fn table(list: &[Value]) -> Value {
        let items = list.to_vec();
        Value::list(items, Span::test_data())
    }

    /// Expected record for an element with the given tag, attributes and children.
    fn content_tag(
        tag: impl Into<String>,
        attrs: IndexMap<&str, &str>,
        content: &[Value],
    ) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME =>     string(tag),
            COLUMN_ATTRS_NAME =>   attributes(attrs),
            COLUMN_CONTENT_NAME => table(content),
        })
    }

    /// Expected record for a text node: null tag, null attributes, string content.
    fn content_string(value: impl Into<String>) -> Value {
        let null = || Value::nothing(Span::test_data());
        Value::test_record(record! {
            COLUMN_TAG_NAME =>     null(),
            COLUMN_ATTRS_NAME =>   null(),
            COLUMN_CONTENT_NAME => string(value),
        })
    }

    /// Runs the converter with every optional feature switched off.
    fn parse(xml: &str) -> Result<Value, roxmltree::Error> {
        from_xml_string_to_value(
            xml,
            &ParsingInfo {
                span: Span::test_data(),
                keep_comments: false,
                keep_processing_instructions: false,
                allow_dtd: false,
            },
        )
    }

    #[test]
    fn parses_empty_element() -> Result<(), roxmltree::Error> {
        let expected = content_tag("nu", indexmap! {}, &[]);

        assert_eq!(parse("<nu></nu>")?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_text() -> Result<(), roxmltree::Error> {
        let xml = "<nu>La era de los tres caballeros</nu>";
        let expected = content_tag(
            "nu",
            indexmap! {},
            &[content_string("La era de los tres caballeros")],
        );

        assert_eq!(parse(xml)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_elements() -> Result<(), roxmltree::Error> {
        let xml = "\
<nu>
    <dev>Andrés</dev>
    <dev>JT</dev>
    <dev>Yehuda</dev>
</nu>";
        let devs = [
            content_tag("dev", indexmap! {}, &[content_string("Andrés")]),
            content_tag("dev", indexmap! {}, &[content_string("JT")]),
            content_tag("dev", indexmap! {}, &[content_string("Yehuda")]),
        ];
        let expected = content_tag("nu", indexmap! {}, &devs);

        assert_eq!(parse(xml)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute() -> Result<(), roxmltree::Error> {
        let xml = "\
<nu version=\"2.0\">
</nu>";
        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[]);

        assert_eq!(parse(xml)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute_and_element() -> Result<(), roxmltree::Error> {
        let xml = "\
<nu version=\"2.0\">
    <version>2.0</version>
</nu>";
        let version = content_tag("version", indexmap! {}, &[content_string("2.0")]);
        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[version]);

        assert_eq!(parse(xml)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_multiple_attributes() -> Result<(), roxmltree::Error> {
        let xml = "\
<nu version=\"2.0\" age=\"25\">
</nu>";
        let expected = content_tag("nu", indexmap! {"version" => "2.0", "age" => "25"}, &[]);

        assert_eq!(parse(xml)?, expected);

        Ok(())
    }

    #[test]
    fn test_examples() {
        use crate::test_examples;

        test_examples(FromXml {})
    }

    /// After `from xml`, the metadata should hold only the data source; the
    /// content type set upstream must be gone.
    #[test]
    fn test_content_type_metadata() {
        let mut engine_state = Box::new(EngineState::new());

        // Register just the commands the pipeline below uses.
        let delta = {
            let mut working_set = StateWorkingSet::new(&engine_state);

            working_set.add_decl(Box::new(FromXml {}));
            working_set.add_decl(Box::new(Metadata {}));
            working_set.add_decl(Box::new(MetadataSet {}));

            working_set.render()
        };
        engine_state
            .merge_delta(delta)
            .expect("Error merging delta");

        let cmd = r#"'<?xml version="1.0" encoding="UTF-8"?>
<note>
  <remember>Event</remember>
</note>' | metadata set --content-type 'application/xml' --datasource-ls | from xml | metadata | $in"#;

        let result = eval_pipeline_without_terminal_expression(
            cmd,
            std::env::temp_dir().as_ref(),
            &mut engine_state,
        );
        let expected = Value::test_record(record!("source" => Value::test_string("ls")));

        assert_eq!(expected, result.expect("There should be a result"))
    }
}