Skip to main content

nu_command/formats/from/
xml.rs

1use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLUMN_TAG_NAME};
2use indexmap::IndexMap;
3use nu_engine::command_prelude::*;
4use nu_protocol::shell_error::generic::GenericError;
5
6use roxmltree::{NodeType, ParsingOptions, TextPos};
7
8#[derive(Clone)]
9pub struct FromXml;
10
11impl Command for FromXml {
12    fn name(&self) -> &str {
13        "from xml"
14    }
15
16    fn signature(&self) -> Signature {
17        Signature::build("from xml")
18            .input_output_types(vec![(Type::String, Type::record())])
19            .switch("keep-comments", "Add comment nodes to result.", None)
20            .switch(
21                "allow-dtd",
22                "Allow parsing documents with DTDs (may result in exponential entity expansion).",
23                None,
24            )
25            .switch(
26                "keep-pi",
27                "Add processing instruction nodes to result.",
28                None,
29            )
30            .category(Category::Formats)
31    }
32
33    fn description(&self) -> &str {
34        "Parse text as .xml and create record."
35    }
36
37    fn extra_description(&self) -> &str {
38        r#"Every XML entry is represented via a record with tag, attribute and content fields.
39To represent different types of entries different values are written to this fields:
401. Tag entry: `{tag: <tag name> attrs: {<attr name>: "<string value>" ...} content: [<entries>]}`
412. Comment entry: `{tag: '!' attrs: null content: "<comment string>"}`
423. Processing instruction (PI): `{tag: '?<pi name>' attrs: null content: "<pi content string>"}`
434. Text: `{tag: null attrs: null content: "<text>"}`.
44
45Unlike to xml command all null values are always present and text is never represented via plain
46string. This way content of every tag is always a table and is easier to parse"#
47    }
48
49    fn run(
50        &self,
51        engine_state: &EngineState,
52        stack: &mut Stack,
53        call: &Call,
54        input: PipelineData,
55    ) -> Result<PipelineData, ShellError> {
56        let head = call.head;
57        let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
58        let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
59        let allow_dtd = call.has_flag(engine_state, stack, "allow-dtd")?;
60        let info = ParsingInfo {
61            span: head,
62            keep_comments,
63            keep_processing_instructions,
64            allow_dtd,
65        };
66        from_xml(input, &info)
67    }
68
69    fn examples(&self) -> Vec<Example<'_>> {
70        vec![Example {
71            example: r#"'<?xml version="1.0" encoding="UTF-8"?>
72<note>
73  <remember>Event</remember>
74</note>' | from xml"#,
75            description: "Converts xml formatted string to record.",
76            result: Some(Value::test_record(record! {
77                COLUMN_TAG_NAME =>     Value::test_string("note"),
78                COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
79                COLUMN_CONTENT_NAME => Value::test_list(vec![
80                Value::test_record(record! {
81                    COLUMN_TAG_NAME =>     Value::test_string("remember"),
82                    COLUMN_ATTRS_NAME =>   Value::test_record(Record::new()),
83                    COLUMN_CONTENT_NAME => Value::test_list(vec![
84                    Value::test_record(record! {
85                        COLUMN_TAG_NAME =>     Value::test_nothing(),
86                        COLUMN_ATTRS_NAME =>   Value::test_nothing(),
87                        COLUMN_CONTENT_NAME => Value::test_string("Event"),
88                        })],
89                    ),
90                    })],
91                ),
92            })),
93        }]
94    }
95}
96
97struct ParsingInfo {
98    span: Span,
99    keep_comments: bool,
100    keep_processing_instructions: bool,
101    allow_dtd: bool,
102}
103
104fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
105    let mut collected = IndexMap::new();
106    for a in attributes {
107        collected.insert(String::from(a.name()), Value::string(a.value(), info.span));
108    }
109    Value::record(collected.into_iter().collect(), info.span)
110}
111
112fn element_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Value {
113    let span = info.span;
114    let mut node = IndexMap::new();
115
116    let tag = n.tag_name().name().trim().to_string();
117    let tag = Value::string(tag, span);
118
119    let content: Vec<Value> = n
120        .children()
121        .filter_map(|node| from_node_to_value(&node, info))
122        .collect();
123    let content = Value::list(content, span);
124
125    let attributes = from_attributes_to_value(&n.attributes().collect::<Vec<_>>(), info);
126
127    node.insert(String::from(COLUMN_TAG_NAME), tag);
128    node.insert(String::from(COLUMN_ATTRS_NAME), attributes);
129    node.insert(String::from(COLUMN_CONTENT_NAME), content);
130
131    Value::record(node.into_iter().collect(), span)
132}
133
134fn text_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
135    let span = info.span;
136    let text = n.text().expect("Non-text node supplied to text_to_value");
137    let text = text.trim();
138    if text.is_empty() {
139        None
140    } else {
141        let mut node = IndexMap::new();
142        let content = Value::string(String::from(text), span);
143
144        node.insert(String::from(COLUMN_TAG_NAME), Value::nothing(span));
145        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
146        node.insert(String::from(COLUMN_CONTENT_NAME), content);
147
148        Some(Value::record(node.into_iter().collect(), span))
149    }
150}
151
152fn comment_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
153    if info.keep_comments {
154        let span = info.span;
155        let text = n
156            .text()
157            .expect("Non-comment node supplied to comment_to_value");
158
159        let mut node = IndexMap::new();
160        let content = Value::string(String::from(text), span);
161
162        node.insert(String::from(COLUMN_TAG_NAME), Value::string("!", span));
163        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
164        node.insert(String::from(COLUMN_CONTENT_NAME), content);
165
166        Some(Value::record(node.into_iter().collect(), span))
167    } else {
168        None
169    }
170}
171
172fn processing_instruction_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
173    if info.keep_processing_instructions {
174        let span = info.span;
175        let pi = n.pi()?;
176
177        let mut node = IndexMap::new();
178        // Add '?' before target to differentiate tags from pi targets
179        let tag = format!("?{}", pi.target);
180        let tag = Value::string(tag, span);
181        let content = pi
182            .value
183            .map_or_else(|| Value::nothing(span), |x| Value::string(x, span));
184
185        node.insert(String::from(COLUMN_TAG_NAME), tag);
186        node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
187        node.insert(String::from(COLUMN_CONTENT_NAME), content);
188
189        Some(Value::record(node.into_iter().collect(), span))
190    } else {
191        None
192    }
193}
194
195fn from_node_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
196    match n.node_type() {
197        NodeType::Element => Some(element_to_value(n, info)),
198        NodeType::Text => text_to_value(n, info),
199        NodeType::Comment => comment_to_value(n, info),
200        NodeType::PI => processing_instruction_to_value(n, info),
201        _ => None,
202    }
203}
204
205fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value {
206    element_to_value(&d.root_element(), info)
207}
208
209fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
210    let options = ParsingOptions {
211        allow_dtd: info.allow_dtd,
212        ..Default::default()
213    };
214
215    let parsed = roxmltree::Document::parse_with_options(s, options)?;
216    Ok(from_document_to_value(&parsed, info))
217}
218
219fn from_xml(input: PipelineData, info: &ParsingInfo) -> Result<PipelineData, ShellError> {
220    let (concat_string, span, metadata) = input.collect_string_strict(info.span)?;
221
222    match from_xml_string_to_value(&concat_string, info) {
223        Ok(x) => {
224            Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
225        }
226        Err(err) => Err(process_xml_parse_error(concat_string, err, span)),
227    }
228}
229
230fn process_xml_parse_error(source: String, err: roxmltree::Error, span: Span) -> ShellError {
231    match err {
232        roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_error_spanned(
233            "The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
234            source,
235            pos,
236        ),
237        roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_error_spanned(
238            "Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace  URI.",
239            source,
240            pos,
241        ),
242        roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_error_spanned(
243            "The http://www.w3.org/2000/xmlns/  URI must not be declared.",
244            source,
245            pos,
246        ),
247        roxmltree::Error::InvalidElementNamePrefix(pos) => {
248            make_xml_error_spanned("xmlns can't be used as an element prefix.", source, pos)
249        }
250        roxmltree::Error::DuplicatedNamespace(namespace, pos) => make_xml_error_spanned(
251            format!("Namespace {namespace} was already defined on this element."),
252            source,
253            pos,
254        ),
255        roxmltree::Error::UnknownNamespace(prefix, pos) => {
256            make_xml_error_spanned(format!("Unknown prefix {prefix}"), source, pos)
257        }
258        roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => make_xml_error_spanned(
259            format!("Unexpected close tag {actual}, expected {expected}"),
260            source,
261            pos,
262        ),
263        roxmltree::Error::UnexpectedEntityCloseTag(pos) => {
264            make_xml_error_spanned("Entity value starts with a close tag.", source, pos)
265        }
266        roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_error_spanned(
267            format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
268            source,
269            pos,
270        ),
271        roxmltree::Error::MalformedEntityReference(pos) => {
272            make_xml_error_spanned("Malformed entity reference.", source, pos)
273        }
274        roxmltree::Error::EntityReferenceLoop(pos) => {
275            make_xml_error_spanned("Possible entity reference loop.", source, pos)
276        }
277        roxmltree::Error::InvalidAttributeValue(pos) => {
278            make_xml_error_spanned("Attribute value cannot have a < character.", source, pos)
279        }
280        roxmltree::Error::DuplicatedAttribute(attribute, pos) => make_xml_error_spanned(
281            format!("Element has a duplicated attribute: {attribute}"),
282            source,
283            pos,
284        ),
285        roxmltree::Error::NoRootNode => {
286            make_xml_error("The XML document must have at least one element.", span)
287        }
288        roxmltree::Error::UnclosedRootNode => {
289            make_xml_error("The root node was opened but never closed.", span)
290        }
291        roxmltree::Error::DtdDetected => make_xml_error(
292            "XML document with DTD detected.\nDTDs are disabled by default to prevent denial-of-service attacks (use `from xml --allow-dtd` to bypass this functionality)",
293            span,
294        ),
295        roxmltree::Error::NodesLimitReached => make_xml_error("Node limit was reached.", span),
296        roxmltree::Error::AttributesLimitReached => make_xml_error("Attribute limit reached", span),
297        roxmltree::Error::NamespacesLimitReached => make_xml_error("Namespace limit reached", span),
298        roxmltree::Error::UnexpectedDeclaration(pos) => make_xml_error_spanned(
299            "An XML document can have only one XML declaration and it must be at the start of the document.",
300            source,
301            pos,
302        ),
303        roxmltree::Error::InvalidName(pos) => make_xml_error_spanned("Invalid name.", source, pos),
304        roxmltree::Error::NonXmlChar(_, pos) => make_xml_error_spanned(
305            "Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>",
306            source,
307            pos,
308        ),
309        roxmltree::Error::InvalidChar(expected, actual, pos) => make_xml_error_spanned(
310            format!(
311                "Unexpected character {}, expected {}",
312                actual as char, expected as char
313            ),
314            source,
315            pos,
316        ),
317        roxmltree::Error::InvalidChar2(expected, actual, pos) => make_xml_error_spanned(
318            format!(
319                "Unexpected character {}, expected {}",
320                actual as char, expected
321            ),
322            source,
323            pos,
324        ),
325        roxmltree::Error::InvalidString(_, pos) => {
326            make_xml_error_spanned("Invalid/unexpected string in XML.", source, pos)
327        }
328        roxmltree::Error::InvalidExternalID(pos) => {
329            make_xml_error_spanned("Invalid ExternalID in the DTD.", source, pos)
330        }
331        roxmltree::Error::InvalidComment(pos) => make_xml_error_spanned(
332            "A comment cannot contain `--` or end with `-`.",
333            source,
334            pos,
335        ),
336        roxmltree::Error::InvalidCharacterData(pos) => make_xml_error_spanned(
337            "Character Data node contains an invalid data. Currently, only `]]>` is not allowed.",
338            source,
339            pos,
340        ),
341        roxmltree::Error::UnknownToken(pos) => {
342            make_xml_error_spanned("Unknown token in XML.", source, pos)
343        }
344        roxmltree::Error::UnexpectedEndOfStream => {
345            make_xml_error("Unexpected end of stream while parsing XML.", span)
346        }
347    }
348}
349
350fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
351    ShellError::Generic(GenericError::new("Failed to parse XML", msg.into(), span))
352}
353
354fn make_xml_error_spanned(msg: impl Into<String>, src: String, pos: TextPos) -> ShellError {
355    let span = Span::from_row_column(pos.row as usize, pos.col as usize, &src);
356    ShellError::OutsideSpannedLabeledError {
357        src,
358        error: "Failed to parse XML".into(),
359        msg: msg.into(),
360        span,
361    }
362}
363
#[cfg(test)]
mod tests {
    use crate::{Metadata, MetadataSet, Reject};

    use super::*;

    use indexmap::{IndexMap, indexmap};
    use nu_cmd_lang::eval_pipeline_without_terminal_expression;

    /// Shorthand for a test string value.
    fn string(input: impl Into<String>) -> Value {
        Value::test_string(input)
    }

    /// Builds the expected attributes record from name/value pairs.
    fn attributes(entries: IndexMap<&str, &str>) -> Value {
        let pairs = entries
            .into_iter()
            .map(|(name, value)| (name.into(), string(value)));
        Value::test_record(pairs.collect())
    }

    /// Builds the expected content list from already-built entries.
    fn table(list: &[Value]) -> Value {
        Value::test_list(list.to_vec())
    }

    /// Expected record for an element with the given tag, attributes and children.
    fn content_tag(
        tag: impl Into<String>,
        attrs: IndexMap<&str, &str>,
        content: &[Value],
    ) -> Value {
        let entry = record! {
            COLUMN_TAG_NAME =>     string(tag),
            COLUMN_ATTRS_NAME =>   attributes(attrs),
            COLUMN_CONTENT_NAME => table(content),
        };
        Value::test_record(entry)
    }

    /// Expected record for a text entry: no tag, no attributes, string content.
    fn content_string(value: impl Into<String>) -> Value {
        let entry = record! {
            COLUMN_TAG_NAME =>     Value::test_nothing(),
            COLUMN_ATTRS_NAME =>   Value::test_nothing(),
            COLUMN_CONTENT_NAME => string(value),
        };
        Value::test_record(entry)
    }

    /// Parses `xml` with every optional behaviour switched off.
    fn parse(xml: &str) -> Result<Value, roxmltree::Error> {
        let defaults = ParsingInfo {
            span: Span::test_data(),
            keep_comments: false,
            keep_processing_instructions: false,
            allow_dtd: false,
        };
        from_xml_string_to_value(xml, &defaults)
    }

    #[test]
    fn parses_empty_element() -> Result<(), roxmltree::Error> {
        let source = "<nu></nu>";
        let expected = content_tag("nu", indexmap! {}, &[]);

        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_text() -> Result<(), roxmltree::Error> {
        let source = "<nu>La era de los tres caballeros</nu>";
        let expected = content_tag(
            "nu",
            indexmap! {},
            &[content_string("La era de los tres caballeros")],
        );

        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_elements() -> Result<(), roxmltree::Error> {
        let source = "\
<nu>
    <dev>Andrés</dev>
    <dev>JT</dev>
    <dev>Yehuda</dev>
</nu>";
        let devs = [
            content_tag("dev", indexmap! {}, &[content_string("Andrés")]),
            content_tag("dev", indexmap! {}, &[content_string("JT")]),
            content_tag("dev", indexmap! {}, &[content_string("Yehuda")]),
        ];
        let expected = content_tag("nu", indexmap! {}, &devs);

        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
</nu>";
        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[]);

        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute_and_element() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
    <version>2.0</version>
</nu>";
        let inner = content_tag("version", indexmap! {}, &[content_string("2.0")]);
        let expected = content_tag("nu", indexmap! {"version" => "2.0"}, &[inner]);

        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn parses_element_with_multiple_attributes() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\" age=\"25\">
</nu>";
        let expected = content_tag("nu", indexmap! {"version" => "2.0", "age" => "25"}, &[]);

        assert_eq!(parse(source)?, expected);

        Ok(())
    }

    #[test]
    fn test_examples() -> nu_test_support::Result {
        nu_test_support::test().examples(FromXml)
    }

    /// After `from xml`, the content type set upstream is gone while the path columns remain.
    #[test]
    fn test_content_type_metadata() {
        let mut engine_state = Box::new(EngineState::new());

        // Register only the commands the pipeline below needs.
        let delta = {
            let mut working_set = StateWorkingSet::new(&engine_state);

            working_set.add_decl(Box::new(FromXml {}));
            working_set.add_decl(Box::new(Metadata {}));
            working_set.add_decl(Box::new(MetadataSet {}));
            working_set.add_decl(Box::new(Reject {}));

            working_set.render()
        };
        engine_state
            .merge_delta(delta)
            .expect("Error merging delta");

        let cmd = r#"'<?xml version="1.0" encoding="UTF-8"?>
<note>
  <remember>Event</remember>
</note>' | metadata set --content-type 'application/xml' --path-columns [name] | from xml | metadata | reject span | $in"#;

        let cwd = std::env::temp_dir();
        let result =
            eval_pipeline_without_terminal_expression(cmd, cwd.as_ref(), &mut engine_state);

        let expected = Value::test_record(record! {
            "path_columns" => Value::test_list(vec![Value::test_string("name")]),
        });
        assert_eq!(expected, result.expect("There should be a result"))
    }
}