1use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLUMN_TAG_NAME};
2use indexmap::IndexMap;
3use nu_engine::command_prelude::*;
4use nu_protocol::{
5 DEFAULT_ERROR_CONTEXT, Signals, shell_error::generic::GenericError, truncated_source_window,
6};
7
8use roxmltree::{NodeType, ParsingOptions, TextPos};
9
/// The `from xml` command: parses XML text from the pipeline into a record.
#[derive(Clone)]
pub struct FromXml;
12
13impl Command for FromXml {
14 fn name(&self) -> &str {
15 "from xml"
16 }
17
18 fn signature(&self) -> Signature {
19 Signature::build("from xml")
20 .input_output_types(vec![(Type::String, Type::record())])
21 .switch("keep-comments", "Add comment nodes to result.", None)
22 .switch(
23 "allow-dtd",
24 "Allow parsing documents with DTDs (may result in exponential entity expansion).",
25 None,
26 )
27 .switch(
28 "keep-pi",
29 "Add processing instruction nodes to result.",
30 None,
31 )
32 .category(Category::Formats)
33 }
34
35 fn description(&self) -> &str {
36 "Parse text as .xml and create record."
37 }
38
39 fn extra_description(&self) -> &str {
40 r#"Every XML entry is represented via a record with tag, attribute and content fields.
41To represent different types of entries different values are written to this fields:
421. Tag entry: `{tag: <tag name> attrs: {<attr name>: "<string value>" ...} content: [<entries>]}`
432. Comment entry: `{tag: '!' attrs: null content: "<comment string>"}`
443. Processing instruction (PI): `{tag: '?<pi name>' attrs: null content: "<pi content string>"}`
454. Text: `{tag: null attrs: null content: "<text>"}`.
46
47Unlike to xml command all null values are always present and text is never represented via plain
48string. This way content of every tag is always a table and is easier to parse"#
49 }
50
51 fn run(
52 &self,
53 engine_state: &EngineState,
54 stack: &mut Stack,
55 call: &Call,
56 input: PipelineData,
57 ) -> Result<PipelineData, ShellError> {
58 let head = call.head;
59 let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
60 let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
61 let allow_dtd = call.has_flag(engine_state, stack, "allow-dtd")?;
62 let info = ParsingInfo {
63 span: head,
64 keep_comments,
65 keep_processing_instructions,
66 allow_dtd,
67 };
68 from_xml(input, &info, engine_state.signals())
69 }
70
71 fn examples(&self) -> Vec<Example<'_>> {
72 vec![Example {
73 example: r#"'<?xml version="1.0" encoding="UTF-8"?>
74<note>
75 <remember>Event</remember>
76</note>' | from xml"#,
77 description: "Converts xml formatted string to record.",
78 result: Some(Value::test_record(record! {
79 COLUMN_TAG_NAME => Value::test_string("note"),
80 COLUMN_ATTRS_NAME => Value::test_record(Record::new()),
81 COLUMN_CONTENT_NAME => Value::test_list(vec![
82 Value::test_record(record! {
83 COLUMN_TAG_NAME => Value::test_string("remember"),
84 COLUMN_ATTRS_NAME => Value::test_record(Record::new()),
85 COLUMN_CONTENT_NAME => Value::test_list(vec![
86 Value::test_record(record! {
87 COLUMN_TAG_NAME => Value::test_nothing(),
88 COLUMN_ATTRS_NAME => Value::test_nothing(),
89 COLUMN_CONTENT_NAME => Value::test_string("Event"),
90 })],
91 ),
92 })],
93 ),
94 })),
95 }]
96 }
97}
98
/// Options threaded through every helper that converts XML nodes into values.
struct ParsingInfo {
    /// Span attached to every `Value` built during conversion.
    span: Span,
    /// When `false`, comment nodes are omitted from the result.
    keep_comments: bool,
    /// When `false`, processing-instruction nodes are omitted from the result.
    keep_processing_instructions: bool,
    /// Passed to `roxmltree::ParsingOptions::allow_dtd` when parsing.
    allow_dtd: bool,
}
105
106fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
107 let mut collected = IndexMap::new();
108 for a in attributes {
109 collected.insert(String::from(a.name()), Value::string(a.value(), info.span));
110 }
111 Value::record(collected.into_iter().collect(), info.span)
112}
113
114fn element_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Value {
115 let span = info.span;
116 let mut node = IndexMap::new();
117
118 let tag = n.tag_name().name().trim().to_string();
119 let tag = Value::string(tag, span);
120
121 let content: Vec<Value> = n
122 .children()
123 .filter_map(|node| from_node_to_value(&node, info))
124 .collect();
125 let content = Value::list(content, span);
126
127 let attributes = from_attributes_to_value(&n.attributes().collect::<Vec<_>>(), info);
128
129 node.insert(String::from(COLUMN_TAG_NAME), tag);
130 node.insert(String::from(COLUMN_ATTRS_NAME), attributes);
131 node.insert(String::from(COLUMN_CONTENT_NAME), content);
132
133 Value::record(node.into_iter().collect(), span)
134}
135
136fn text_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
137 let span = info.span;
138 let text = n.text().expect("Non-text node supplied to text_to_value");
139 let text = text.trim();
140 if text.is_empty() {
141 None
142 } else {
143 let mut node = IndexMap::new();
144 let content = Value::string(String::from(text), span);
145
146 node.insert(String::from(COLUMN_TAG_NAME), Value::nothing(span));
147 node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
148 node.insert(String::from(COLUMN_CONTENT_NAME), content);
149
150 Some(Value::record(node.into_iter().collect(), span))
151 }
152}
153
154fn comment_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
155 if info.keep_comments {
156 let span = info.span;
157 let text = n
158 .text()
159 .expect("Non-comment node supplied to comment_to_value");
160
161 let mut node = IndexMap::new();
162 let content = Value::string(String::from(text), span);
163
164 node.insert(String::from(COLUMN_TAG_NAME), Value::string("!", span));
165 node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
166 node.insert(String::from(COLUMN_CONTENT_NAME), content);
167
168 Some(Value::record(node.into_iter().collect(), span))
169 } else {
170 None
171 }
172}
173
174fn processing_instruction_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
175 if info.keep_processing_instructions {
176 let span = info.span;
177 let pi = n.pi()?;
178
179 let mut node = IndexMap::new();
180 let tag = format!("?{}", pi.target);
182 let tag = Value::string(tag, span);
183 let content = pi
184 .value
185 .map_or_else(|| Value::nothing(span), |x| Value::string(x, span));
186
187 node.insert(String::from(COLUMN_TAG_NAME), tag);
188 node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
189 node.insert(String::from(COLUMN_CONTENT_NAME), content);
190
191 Some(Value::record(node.into_iter().collect(), span))
192 } else {
193 None
194 }
195}
196
197fn from_node_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
198 match n.node_type() {
199 NodeType::Element => Some(element_to_value(n, info)),
200 NodeType::Text => text_to_value(n, info),
201 NodeType::Comment => comment_to_value(n, info),
202 NodeType::PI => processing_instruction_to_value(n, info),
203 _ => None,
204 }
205}
206
207fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value {
208 element_to_value(&d.root_element(), info)
209}
210
211fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
212 let options = ParsingOptions {
213 allow_dtd: info.allow_dtd,
214 ..Default::default()
215 };
216
217 let parsed = roxmltree::Document::parse_with_options(s, options)?;
218 Ok(from_document_to_value(&parsed, info))
219}
220
221fn from_xml(
222 input: PipelineData,
223 info: &ParsingInfo,
224 signals: &Signals,
225) -> Result<PipelineData, ShellError> {
226 let (concat_string, span, metadata) = input.collect_string_strict(info.span)?;
227
228 match from_xml_string_to_value(&concat_string, info) {
229 Ok(x) => {
230 Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
231 }
232 Err(err) => Err(process_xml_parse_error(concat_string, err, span, signals)),
233 }
234}
235
236fn process_xml_parse_error(
237 source: impl AsRef<str>,
238 err: roxmltree::Error,
239 span: Span,
240 signals: &Signals,
241) -> ShellError {
242 let source = source.as_ref();
243 match err {
244 roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_err(
245 source,
246 span,
247 signals,
248 "The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
249 pos,
250 ),
251 roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_err(
252 source,
253 span,
254 signals,
255 "Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace URI.",
256 pos,
257 ),
258 roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_err(
259 source,
260 span,
261 signals,
262 "The http://www.w3.org/2000/xmlns/ URI must not be declared.",
263 pos,
264 ),
265 roxmltree::Error::InvalidElementNamePrefix(pos) => make_xml_err(
266 source,
267 span,
268 signals,
269 "xmlns can't be used as an element prefix.",
270 pos,
271 ),
272 roxmltree::Error::DuplicatedNamespace(namespace, pos) => make_xml_err(
273 source,
274 span,
275 signals,
276 format!("Namespace {namespace} was already defined on this element."),
277 pos,
278 ),
279 roxmltree::Error::UnknownNamespace(prefix, pos) => make_xml_err(
280 source,
281 span,
282 signals,
283 format!("Unknown prefix {prefix}"),
284 pos,
285 ),
286 roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => make_xml_err(
287 source,
288 span,
289 signals,
290 format!("Unexpected close tag {actual}, expected {expected}"),
291 pos,
292 ),
293 roxmltree::Error::UnexpectedEntityCloseTag(pos) => make_xml_err(
294 source,
295 span,
296 signals,
297 "Entity value starts with a close tag.",
298 pos,
299 ),
300 roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_err(
301 source,
302 span,
303 signals,
304 format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
305 pos,
306 ),
307 roxmltree::Error::MalformedEntityReference(pos) => {
308 make_xml_err(source, span, signals, "Malformed entity reference.", pos)
309 }
310 roxmltree::Error::EntityReferenceLoop(pos) => make_xml_err(
311 source,
312 span,
313 signals,
314 "Possible entity reference loop.",
315 pos,
316 ),
317 roxmltree::Error::InvalidAttributeValue(pos) => make_xml_err(
318 source,
319 span,
320 signals,
321 "Attribute value cannot have a < character.",
322 pos,
323 ),
324 roxmltree::Error::DuplicatedAttribute(attribute, pos) => make_xml_err(
325 source,
326 span,
327 signals,
328 format!("Element has a duplicated attribute: {attribute}"),
329 pos,
330 ),
331 roxmltree::Error::NoRootNode => {
332 make_xml_error("The XML document must have at least one element.", span)
333 }
334 roxmltree::Error::UnclosedRootNode => {
335 make_xml_error("The root node was opened but never closed.", span)
336 }
337 roxmltree::Error::DtdDetected => make_xml_error(
338 "XML document with DTD detected.\nDTDs are disabled by default to prevent denial-of-service attacks (use `from xml --allow-dtd` to bypass this functionality)",
339 span,
340 ),
341 roxmltree::Error::NodesLimitReached => make_xml_error("Node limit was reached.", span),
342 roxmltree::Error::AttributesLimitReached => make_xml_error("Attribute limit reached", span),
343 roxmltree::Error::NamespacesLimitReached => make_xml_error("Namespace limit reached", span),
344 roxmltree::Error::UnexpectedDeclaration(pos) => make_xml_err(
345 source,
346 span,
347 signals,
348 "An XML document can have only one XML declaration and it must be at the start of the document.",
349 pos,
350 ),
351 roxmltree::Error::InvalidName(pos) => {
352 make_xml_err(source, span, signals, "Invalid name.", pos)
353 }
354 roxmltree::Error::NonXmlChar(_, pos) => make_xml_err(
355 source,
356 span,
357 signals,
358 "Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>",
359 pos,
360 ),
361 roxmltree::Error::InvalidChar(expected, actual, pos) => make_xml_err(
362 source,
363 span,
364 signals,
365 format!(
366 "Unexpected character {}, expected {}",
367 actual as char, expected as char
368 ),
369 pos,
370 ),
371 roxmltree::Error::InvalidChar2(expected, actual, pos) => make_xml_err(
372 source,
373 span,
374 signals,
375 format!(
376 "Unexpected character {}, expected {}",
377 actual as char, expected
378 ),
379 pos,
380 ),
381 roxmltree::Error::InvalidString(_, pos) => make_xml_err(
382 source,
383 span,
384 signals,
385 "Invalid/unexpected string in XML.",
386 pos,
387 ),
388 roxmltree::Error::InvalidExternalID(pos) => {
389 make_xml_err(source, span, signals, "Invalid ExternalID in the DTD.", pos)
390 }
391 roxmltree::Error::EntityResolver(pos, msg) => make_xml_err(
392 source,
393 span,
394 signals,
395 format!("Resolving the given entity yielded an error: {msg}."),
396 pos,
397 ),
398 roxmltree::Error::InvalidComment(pos) => make_xml_err(
399 source,
400 span,
401 signals,
402 "A comment cannot contain `--` or end with `-`.",
403 pos,
404 ),
405 roxmltree::Error::InvalidCharacterData(pos) => make_xml_err(
406 source,
407 span,
408 signals,
409 "Character Data node contains an invalid data. Currently, only `]]>` is not allowed.",
410 pos,
411 ),
412 roxmltree::Error::UnknownToken(pos) => {
413 make_xml_err(source, span, signals, "Unknown token in XML.", pos)
414 }
415 roxmltree::Error::UnexpectedEndOfStream => {
416 make_xml_error("Unexpected end of stream while parsing XML.", span)
417 }
418 }
419}
420
421fn make_xml_err(
422 source: &str,
423 span: Span,
424 signals: &Signals,
425 msg: impl Into<String>,
426 pos: TextPos,
427) -> ShellError {
428 match Span::try_from_row_column(pos.row as usize, pos.col as usize, source, &span, signals) {
429 Ok(byte_span) => {
430 let (src, label_span) =
431 truncated_source_window(source, byte_span, DEFAULT_ERROR_CONTEXT);
432 ShellError::OutsideSpannedLabeledError {
433 src,
434 error: "Failed to parse XML".into(),
435 msg: msg.into(),
436 span: label_span,
437 }
438 }
439 Err(e) => e,
440 }
441}
442
443fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
444 ShellError::Generic(GenericError::new("Failed to parse XML", msg.into(), span))
445}
446
// Unit tests for the XML-to-value conversion and for the error reporting helpers.
#[cfg(test)]
mod tests {
    use crate::Metadata;
    use crate::MetadataSet;
    use crate::Reject;
    use roxmltree::ParsingOptions;

    use super::*;

    use indexmap::IndexMap;
    use indexmap::indexmap;
    use nu_cmd_lang::eval_pipeline_without_terminal_expression;

    /// Shorthand for a test string value.
    fn string(input: impl Into<String>) -> Value {
        Value::test_string(input)
    }

    /// Builds the expected `attrs` record from name/value pairs, keeping their order.
    fn attributes(entries: IndexMap<&str, &str>) -> Value {
        Value::test_record(
            entries
                .into_iter()
                .map(|(k, v)| (k.into(), string(v)))
                .collect(),
        )
    }

    /// Builds the expected `content` list from a slice of entries.
    fn table(list: &[Value]) -> Value {
        Value::list(list.to_vec(), Span::test_data())
    }

    /// Builds the expected record for an element: tag name, attributes and child entries.
    fn content_tag(
        tag: impl Into<String>,
        attrs: IndexMap<&str, &str>,
        content: &[Value],
    ) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME => string(tag),
            COLUMN_ATTRS_NAME => attributes(attrs),
            COLUMN_CONTENT_NAME => table(content),
        })
    }

    /// Builds the expected record for a text entry: null tag, null attrs, string content.
    fn content_string(value: impl Into<String>) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME => Value::nothing(Span::test_data()),
            COLUMN_ATTRS_NAME => Value::nothing(Span::test_data()),
            COLUMN_CONTENT_NAME => string(value),
        })
    }

    /// Parses `xml` with every option switched off (no comments, no PIs, no DTD).
    fn parse(xml: &str) -> Result<Value, roxmltree::Error> {
        let info = ParsingInfo {
            span: Span::test_data(),
            keep_comments: false,
            keep_processing_instructions: false,
            allow_dtd: false,
        };
        from_xml_string_to_value(xml, &info)
    }

    /// An element without children yields empty attrs and empty content.
    #[test]
    fn parses_empty_element() -> Result<(), roxmltree::Error> {
        let source = "<nu></nu>";

        assert_eq!(parse(source)?, content_tag("nu", indexmap! {}, &[]));

        Ok(())
    }

    /// Text inside an element becomes a single text entry in its content.
    #[test]
    fn parses_element_with_text() -> Result<(), roxmltree::Error> {
        let source = "<nu>La era de los tres caballeros</nu>";

        assert_eq!(
            parse(source)?,
            content_tag(
                "nu",
                indexmap! {},
                &[content_string("La era de los tres caballeros")]
            )
        );

        Ok(())
    }

    /// Child elements are converted in document order; the whitespace between them
    /// produces no entries.
    #[test]
    fn parses_element_with_elements() -> Result<(), roxmltree::Error> {
        let source = "\
<nu>
 <dev>Andrés</dev>
 <dev>JT</dev>
 <dev>Yehuda</dev>
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag(
                "nu",
                indexmap! {},
                &[
                    content_tag("dev", indexmap! {}, &[content_string("Andrés")]),
                    content_tag("dev", indexmap! {}, &[content_string("JT")]),
                    content_tag("dev", indexmap! {}, &[content_string("Yehuda")])
                ]
            )
        );

        Ok(())
    }

    /// A single attribute shows up in the attrs record; whitespace-only content is dropped.
    #[test]
    fn parses_element_with_attribute() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag("nu", indexmap! {"version" => "2.0"}, &[])
        );

        Ok(())
    }

    /// An attribute and a child element sharing the same name are kept apart.
    #[test]
    fn parses_element_with_attribute_and_element() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
 <version>2.0</version>
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag(
                "nu",
                indexmap! {"version" => "2.0"},
                &[content_tag(
                    "version",
                    indexmap! {},
                    &[content_string("2.0")]
                )]
            )
        );

        Ok(())
    }

    /// Several attributes are all reported, in the order they were written.
    #[test]
    fn parses_element_with_multiple_attributes() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\" age=\"25\">
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag("nu", indexmap! {"version" => "2.0", "age" => "25"}, &[])
        );

        Ok(())
    }

    /// Runs the examples declared on the command through the shared test support.
    #[test]
    fn test_examples() -> nu_test_support::Result {
        nu_test_support::test().examples(FromXml)
    }

    /// After `from xml`, the metadata (with `span` rejected) is expected to equal
    /// `{path_columns: [name]}` — the content type set upstream is no longer present.
    #[test]
    fn test_content_type_metadata() {
        let mut engine_state = Box::new(EngineState::new());
        let delta = {
            let mut working_set = StateWorkingSet::new(&engine_state);

            // Register only the commands the pipeline below uses.
            working_set.add_decl(Box::new(FromXml {}));
            working_set.add_decl(Box::new(Metadata {}));
            working_set.add_decl(Box::new(MetadataSet {}));
            working_set.add_decl(Box::new(Reject {}));

            working_set.render()
        };

        engine_state
            .merge_delta(delta)
            .expect("Error merging delta");

        let cmd = r#"'<?xml version="1.0" encoding="UTF-8"?>
<note>
 <remember>Event</remember>
</note>' | metadata set --content-type 'application/xml' --path-columns [name] | from xml | metadata | reject span | $in"#;
        let result = eval_pipeline_without_terminal_expression(
            cmd,
            std::env::temp_dir().as_ref(),
            &mut engine_state,
        );
        assert_eq!(
            Value::test_record(
                record!("path_columns" => Value::test_list(vec![Value::test_string("name")]))
            ),
            result.expect("There should be a result")
        )
    }

    /// Feeds a large malformed document through `process_xml_parse_error` and checks that
    /// any source snippet attached to the error stays under 20,000 bytes.
    #[test]
    fn xml_error_source_is_bounded() {
        // 5000 well-formed items followed by an unterminated `<bad` tag.
        let mut input = String::from("<root>");
        for _ in 0..5000 {
            input.push_str("<item>value</item>");
        }
        input.push_str("<bad"); let signals = Signals::empty();
        let parse_result = roxmltree::Document::parse_with_options(
            &input,
            ParsingOptions {
                allow_dtd: true,
                ..Default::default()
            },
        );
        assert!(parse_result.is_err(), "should fail to parse");

        let err = process_xml_parse_error(
            &input,
            parse_result.unwrap_err(),
            Span::test_data(),
            &signals,
        );
        match &err {
            ShellError::OutsideSpannedLabeledError { src, .. } => {
                assert!(
                    src.len() < 20_000,
                    "error source should be bounded, got {} bytes",
                    src.len()
                );
            }
            // A `Generic` error carries no source snippet, so it is accepted without a size check.
            ShellError::Generic(_) => (), other => panic!("expected OutsideSpannedLabeledError or Generic, got {other:?}"),
        }
    }

    /// A small well-formed document with an XML declaration still parses successfully.
    #[test]
    fn xml_parse_success_not_affected() {
        let result = from_xml_string_to_value(
            r#"<?xml version="1.0"?><root><item>value</item></root>"#,
            &ParsingInfo {
                span: Span::test_data(),
                keep_comments: false,
                keep_processing_instructions: false,
                allow_dtd: false,
            },
        );
        assert!(result.is_ok(), "valid XML should still parse");
    }
}