1use crate::formats::nu_xml_format::{COLUMN_ATTRS_NAME, COLUMN_CONTENT_NAME, COLUMN_TAG_NAME};
2use indexmap::IndexMap;
3use nu_engine::command_prelude::*;
4use nu_protocol::shell_error::generic::GenericError;
5
6use roxmltree::{NodeType, ParsingOptions, TextPos};
7
/// The `from xml` command: parses XML text into a nested record.
#[derive(Clone)]
pub struct FromXml;
10
11impl Command for FromXml {
12 fn name(&self) -> &str {
13 "from xml"
14 }
15
16 fn signature(&self) -> Signature {
17 Signature::build("from xml")
18 .input_output_types(vec![(Type::String, Type::record())])
19 .switch("keep-comments", "Add comment nodes to result.", None)
20 .switch(
21 "allow-dtd",
22 "Allow parsing documents with DTDs (may result in exponential entity expansion).",
23 None,
24 )
25 .switch(
26 "keep-pi",
27 "Add processing instruction nodes to result.",
28 None,
29 )
30 .category(Category::Formats)
31 }
32
33 fn description(&self) -> &str {
34 "Parse text as .xml and create record."
35 }
36
37 fn extra_description(&self) -> &str {
38 r#"Every XML entry is represented via a record with tag, attribute and content fields.
39To represent different types of entries different values are written to this fields:
401. Tag entry: `{tag: <tag name> attrs: {<attr name>: "<string value>" ...} content: [<entries>]}`
412. Comment entry: `{tag: '!' attrs: null content: "<comment string>"}`
423. Processing instruction (PI): `{tag: '?<pi name>' attrs: null content: "<pi content string>"}`
434. Text: `{tag: null attrs: null content: "<text>"}`.
44
45Unlike to xml command all null values are always present and text is never represented via plain
46string. This way content of every tag is always a table and is easier to parse"#
47 }
48
49 fn run(
50 &self,
51 engine_state: &EngineState,
52 stack: &mut Stack,
53 call: &Call,
54 input: PipelineData,
55 ) -> Result<PipelineData, ShellError> {
56 let head = call.head;
57 let keep_comments = call.has_flag(engine_state, stack, "keep-comments")?;
58 let keep_processing_instructions = call.has_flag(engine_state, stack, "keep-pi")?;
59 let allow_dtd = call.has_flag(engine_state, stack, "allow-dtd")?;
60 let info = ParsingInfo {
61 span: head,
62 keep_comments,
63 keep_processing_instructions,
64 allow_dtd,
65 };
66 from_xml(input, &info)
67 }
68
69 fn examples(&self) -> Vec<Example<'_>> {
70 vec![Example {
71 example: r#"'<?xml version="1.0" encoding="UTF-8"?>
72<note>
73 <remember>Event</remember>
74</note>' | from xml"#,
75 description: "Converts xml formatted string to record.",
76 result: Some(Value::test_record(record! {
77 COLUMN_TAG_NAME => Value::test_string("note"),
78 COLUMN_ATTRS_NAME => Value::test_record(Record::new()),
79 COLUMN_CONTENT_NAME => Value::test_list(vec![
80 Value::test_record(record! {
81 COLUMN_TAG_NAME => Value::test_string("remember"),
82 COLUMN_ATTRS_NAME => Value::test_record(Record::new()),
83 COLUMN_CONTENT_NAME => Value::test_list(vec![
84 Value::test_record(record! {
85 COLUMN_TAG_NAME => Value::test_nothing(),
86 COLUMN_ATTRS_NAME => Value::test_nothing(),
87 COLUMN_CONTENT_NAME => Value::test_string("Event"),
88 })],
89 ),
90 })],
91 ),
92 })),
93 }]
94 }
95}
96
/// Options shared by every conversion helper while turning a parsed document into values.
struct ParsingInfo {
    /// Span attached to every value produced by the conversion helpers.
    span: Span,
    /// When `false`, comment nodes are skipped instead of being emitted.
    keep_comments: bool,
    /// When `false`, processing-instruction nodes are skipped instead of being emitted.
    keep_processing_instructions: bool,
    /// Forwarded to `ParsingOptions::allow_dtd` when the document is parsed.
    allow_dtd: bool,
}
103
104fn from_attributes_to_value(attributes: &[roxmltree::Attribute], info: &ParsingInfo) -> Value {
105 let mut collected = IndexMap::new();
106 for a in attributes {
107 collected.insert(String::from(a.name()), Value::string(a.value(), info.span));
108 }
109 Value::record(collected.into_iter().collect(), info.span)
110}
111
112fn element_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Value {
113 let span = info.span;
114 let mut node = IndexMap::new();
115
116 let tag = n.tag_name().name().trim().to_string();
117 let tag = Value::string(tag, span);
118
119 let content: Vec<Value> = n
120 .children()
121 .filter_map(|node| from_node_to_value(&node, info))
122 .collect();
123 let content = Value::list(content, span);
124
125 let attributes = from_attributes_to_value(&n.attributes().collect::<Vec<_>>(), info);
126
127 node.insert(String::from(COLUMN_TAG_NAME), tag);
128 node.insert(String::from(COLUMN_ATTRS_NAME), attributes);
129 node.insert(String::from(COLUMN_CONTENT_NAME), content);
130
131 Value::record(node.into_iter().collect(), span)
132}
133
134fn text_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
135 let span = info.span;
136 let text = n.text().expect("Non-text node supplied to text_to_value");
137 let text = text.trim();
138 if text.is_empty() {
139 None
140 } else {
141 let mut node = IndexMap::new();
142 let content = Value::string(String::from(text), span);
143
144 node.insert(String::from(COLUMN_TAG_NAME), Value::nothing(span));
145 node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
146 node.insert(String::from(COLUMN_CONTENT_NAME), content);
147
148 Some(Value::record(node.into_iter().collect(), span))
149 }
150}
151
152fn comment_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
153 if info.keep_comments {
154 let span = info.span;
155 let text = n
156 .text()
157 .expect("Non-comment node supplied to comment_to_value");
158
159 let mut node = IndexMap::new();
160 let content = Value::string(String::from(text), span);
161
162 node.insert(String::from(COLUMN_TAG_NAME), Value::string("!", span));
163 node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
164 node.insert(String::from(COLUMN_CONTENT_NAME), content);
165
166 Some(Value::record(node.into_iter().collect(), span))
167 } else {
168 None
169 }
170}
171
172fn processing_instruction_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
173 if info.keep_processing_instructions {
174 let span = info.span;
175 let pi = n.pi()?;
176
177 let mut node = IndexMap::new();
178 let tag = format!("?{}", pi.target);
180 let tag = Value::string(tag, span);
181 let content = pi
182 .value
183 .map_or_else(|| Value::nothing(span), |x| Value::string(x, span));
184
185 node.insert(String::from(COLUMN_TAG_NAME), tag);
186 node.insert(String::from(COLUMN_ATTRS_NAME), Value::nothing(span));
187 node.insert(String::from(COLUMN_CONTENT_NAME), content);
188
189 Some(Value::record(node.into_iter().collect(), span))
190 } else {
191 None
192 }
193}
194
195fn from_node_to_value(n: &roxmltree::Node, info: &ParsingInfo) -> Option<Value> {
196 match n.node_type() {
197 NodeType::Element => Some(element_to_value(n, info)),
198 NodeType::Text => text_to_value(n, info),
199 NodeType::Comment => comment_to_value(n, info),
200 NodeType::PI => processing_instruction_to_value(n, info),
201 _ => None,
202 }
203}
204
205fn from_document_to_value(d: &roxmltree::Document, info: &ParsingInfo) -> Value {
206 element_to_value(&d.root_element(), info)
207}
208
209fn from_xml_string_to_value(s: &str, info: &ParsingInfo) -> Result<Value, roxmltree::Error> {
210 let options = ParsingOptions {
211 allow_dtd: info.allow_dtd,
212 ..Default::default()
213 };
214
215 let parsed = roxmltree::Document::parse_with_options(s, options)?;
216 Ok(from_document_to_value(&parsed, info))
217}
218
219fn from_xml(input: PipelineData, info: &ParsingInfo) -> Result<PipelineData, ShellError> {
220 let (concat_string, span, metadata) = input.collect_string_strict(info.span)?;
221
222 match from_xml_string_to_value(&concat_string, info) {
223 Ok(x) => {
224 Ok(x.into_pipeline_data_with_metadata(metadata.map(|md| md.with_content_type(None))))
225 }
226 Err(err) => Err(process_xml_parse_error(concat_string, err, span)),
227 }
228}
229
230fn process_xml_parse_error(source: String, err: roxmltree::Error, span: Span) -> ShellError {
231 match err {
232 roxmltree::Error::InvalidXmlPrefixUri(pos) => make_xml_error_spanned(
233 "The `xmlns:xml` attribute must have an <http://www.w3.org/XML/1998/namespace> URI.",
234 source,
235 pos,
236 ),
237 roxmltree::Error::UnexpectedXmlUri(pos) => make_xml_error_spanned(
238 "Only the xmlns:xml attribute can have the http://www.w3.org/XML/1998/namespace URI.",
239 source,
240 pos,
241 ),
242 roxmltree::Error::UnexpectedXmlnsUri(pos) => make_xml_error_spanned(
243 "The http://www.w3.org/2000/xmlns/ URI must not be declared.",
244 source,
245 pos,
246 ),
247 roxmltree::Error::InvalidElementNamePrefix(pos) => {
248 make_xml_error_spanned("xmlns can't be used as an element prefix.", source, pos)
249 }
250 roxmltree::Error::DuplicatedNamespace(namespace, pos) => make_xml_error_spanned(
251 format!("Namespace {namespace} was already defined on this element."),
252 source,
253 pos,
254 ),
255 roxmltree::Error::UnknownNamespace(prefix, pos) => {
256 make_xml_error_spanned(format!("Unknown prefix {prefix}"), source, pos)
257 }
258 roxmltree::Error::UnexpectedCloseTag(expected, actual, pos) => make_xml_error_spanned(
259 format!("Unexpected close tag {actual}, expected {expected}"),
260 source,
261 pos,
262 ),
263 roxmltree::Error::UnexpectedEntityCloseTag(pos) => {
264 make_xml_error_spanned("Entity value starts with a close tag.", source, pos)
265 }
266 roxmltree::Error::UnknownEntityReference(entity, pos) => make_xml_error_spanned(
267 format!("Reference to unknown entity {entity} (was not defined in the DTD)"),
268 source,
269 pos,
270 ),
271 roxmltree::Error::MalformedEntityReference(pos) => {
272 make_xml_error_spanned("Malformed entity reference.", source, pos)
273 }
274 roxmltree::Error::EntityReferenceLoop(pos) => {
275 make_xml_error_spanned("Possible entity reference loop.", source, pos)
276 }
277 roxmltree::Error::InvalidAttributeValue(pos) => {
278 make_xml_error_spanned("Attribute value cannot have a < character.", source, pos)
279 }
280 roxmltree::Error::DuplicatedAttribute(attribute, pos) => make_xml_error_spanned(
281 format!("Element has a duplicated attribute: {attribute}"),
282 source,
283 pos,
284 ),
285 roxmltree::Error::NoRootNode => {
286 make_xml_error("The XML document must have at least one element.", span)
287 }
288 roxmltree::Error::UnclosedRootNode => {
289 make_xml_error("The root node was opened but never closed.", span)
290 }
291 roxmltree::Error::DtdDetected => make_xml_error(
292 "XML document with DTD detected.\nDTDs are disabled by default to prevent denial-of-service attacks (use `from xml --allow-dtd` to bypass this functionality)",
293 span,
294 ),
295 roxmltree::Error::NodesLimitReached => make_xml_error("Node limit was reached.", span),
296 roxmltree::Error::AttributesLimitReached => make_xml_error("Attribute limit reached", span),
297 roxmltree::Error::NamespacesLimitReached => make_xml_error("Namespace limit reached", span),
298 roxmltree::Error::UnexpectedDeclaration(pos) => make_xml_error_spanned(
299 "An XML document can have only one XML declaration and it must be at the start of the document.",
300 source,
301 pos,
302 ),
303 roxmltree::Error::InvalidName(pos) => make_xml_error_spanned("Invalid name.", source, pos),
304 roxmltree::Error::NonXmlChar(_, pos) => make_xml_error_spanned(
305 "Non-XML character found. Valid characters are: <https://www.w3.org/TR/xml/#char32>",
306 source,
307 pos,
308 ),
309 roxmltree::Error::InvalidChar(expected, actual, pos) => make_xml_error_spanned(
310 format!(
311 "Unexpected character {}, expected {}",
312 actual as char, expected as char
313 ),
314 source,
315 pos,
316 ),
317 roxmltree::Error::InvalidChar2(expected, actual, pos) => make_xml_error_spanned(
318 format!(
319 "Unexpected character {}, expected {}",
320 actual as char, expected
321 ),
322 source,
323 pos,
324 ),
325 roxmltree::Error::InvalidString(_, pos) => {
326 make_xml_error_spanned("Invalid/unexpected string in XML.", source, pos)
327 }
328 roxmltree::Error::InvalidExternalID(pos) => {
329 make_xml_error_spanned("Invalid ExternalID in the DTD.", source, pos)
330 }
331 roxmltree::Error::InvalidComment(pos) => make_xml_error_spanned(
332 "A comment cannot contain `--` or end with `-`.",
333 source,
334 pos,
335 ),
336 roxmltree::Error::InvalidCharacterData(pos) => make_xml_error_spanned(
337 "Character Data node contains an invalid data. Currently, only `]]>` is not allowed.",
338 source,
339 pos,
340 ),
341 roxmltree::Error::UnknownToken(pos) => {
342 make_xml_error_spanned("Unknown token in XML.", source, pos)
343 }
344 roxmltree::Error::UnexpectedEndOfStream => {
345 make_xml_error("Unexpected end of stream while parsing XML.", span)
346 }
347 }
348}
349
350fn make_xml_error(msg: impl Into<String>, span: Span) -> ShellError {
351 ShellError::Generic(GenericError::new("Failed to parse XML", msg.into(), span))
352}
353
354fn make_xml_error_spanned(msg: impl Into<String>, src: String, pos: TextPos) -> ShellError {
355 let span = Span::from_row_column(pos.row as usize, pos.col as usize, &src);
356 ShellError::OutsideSpannedLabeledError {
357 src,
358 error: "Failed to parse XML".into(),
359 msg: msg.into(),
360 span,
361 }
362}
363
#[cfg(test)]
mod tests {
    use crate::Metadata;
    use crate::MetadataSet;
    use crate::Reject;

    use super::*;

    use indexmap::IndexMap;
    use indexmap::indexmap;
    use nu_cmd_lang::eval_pipeline_without_terminal_expression;

    /// Test string value.
    fn string(input: impl Into<String>) -> Value {
        Value::test_string(input)
    }

    /// Test null value.
    fn nothing() -> Value {
        Value::nothing(Span::test_data())
    }

    /// Expected attributes record built from name/value pairs.
    fn attributes(entries: IndexMap<&str, &str>) -> Value {
        Value::test_record(
            entries
                .into_iter()
                .map(|(k, v)| (k.into(), string(v)))
                .collect(),
        )
    }

    /// Expected content list.
    fn table(list: &[Value]) -> Value {
        Value::list(list.to_vec(), Span::test_data())
    }

    /// Expected record for an element entry.
    fn content_tag(
        tag: impl Into<String>,
        attrs: IndexMap<&str, &str>,
        content: &[Value],
    ) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME => string(tag),
            COLUMN_ATTRS_NAME => attributes(attrs),
            COLUMN_CONTENT_NAME => table(content),
        })
    }

    /// Expected record for a text entry.
    fn content_string(value: impl Into<String>) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME => nothing(),
            COLUMN_ATTRS_NAME => nothing(),
            COLUMN_CONTENT_NAME => string(value),
        })
    }

    /// Expected record for a comment entry.
    fn content_comment(value: impl Into<String>) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME => string("!"),
            COLUMN_ATTRS_NAME => nothing(),
            COLUMN_CONTENT_NAME => string(value),
        })
    }

    /// Expected record for a processing-instruction entry that has a value.
    fn content_pi(target: &str, value: impl Into<String>) -> Value {
        Value::test_record(record! {
            COLUMN_TAG_NAME => string(format!("?{target}")),
            COLUMN_ATTRS_NAME => nothing(),
            COLUMN_CONTENT_NAME => string(value),
        })
    }

    /// Parses `xml` with the given comment / processing-instruction switches (DTDs disallowed).
    fn parse_with(
        xml: &str,
        keep_comments: bool,
        keep_processing_instructions: bool,
    ) -> Result<Value, roxmltree::Error> {
        let info = ParsingInfo {
            span: Span::test_data(),
            keep_comments,
            keep_processing_instructions,
            allow_dtd: false,
        };
        from_xml_string_to_value(xml, &info)
    }

    /// Parses `xml` with every switch turned off.
    fn parse(xml: &str) -> Result<Value, roxmltree::Error> {
        parse_with(xml, false, false)
    }

    #[test]
    fn parses_empty_element() -> Result<(), roxmltree::Error> {
        let source = "<nu></nu>";

        assert_eq!(parse(source)?, content_tag("nu", indexmap! {}, &[]));

        Ok(())
    }

    #[test]
    fn parses_element_with_text() -> Result<(), roxmltree::Error> {
        let source = "<nu>La era de los tres caballeros</nu>";

        assert_eq!(
            parse(source)?,
            content_tag(
                "nu",
                indexmap! {},
                &[content_string("La era de los tres caballeros")]
            )
        );

        Ok(())
    }

    #[test]
    fn parses_element_with_elements() -> Result<(), roxmltree::Error> {
        let source = "\
<nu>
 <dev>Andrés</dev>
 <dev>JT</dev>
 <dev>Yehuda</dev>
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag(
                "nu",
                indexmap! {},
                &[
                    content_tag("dev", indexmap! {}, &[content_string("Andrés")]),
                    content_tag("dev", indexmap! {}, &[content_string("JT")]),
                    content_tag("dev", indexmap! {}, &[content_string("Yehuda")])
                ]
            )
        );

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag("nu", indexmap! {"version" => "2.0"}, &[])
        );

        Ok(())
    }

    #[test]
    fn parses_element_with_attribute_and_element() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\">
 <version>2.0</version>
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag(
                "nu",
                indexmap! {"version" => "2.0"},
                &[content_tag(
                    "version",
                    indexmap! {},
                    &[content_string("2.0")]
                )]
            )
        );

        Ok(())
    }

    #[test]
    fn parses_element_with_multiple_attributes() -> Result<(), roxmltree::Error> {
        let source = "\
<nu version=\"2.0\" age=\"25\">
</nu>";

        assert_eq!(
            parse(source)?,
            content_tag("nu", indexmap! {"version" => "2.0", "age" => "25"}, &[])
        );

        Ok(())
    }

    #[test]
    fn drops_comments_by_default() -> Result<(), roxmltree::Error> {
        let source = "<nu><!-- hidden --></nu>";

        assert_eq!(parse(source)?, content_tag("nu", indexmap! {}, &[]));

        Ok(())
    }

    #[test]
    fn keeps_comments_when_requested() -> Result<(), roxmltree::Error> {
        let source = "<nu><!-- hidden --></nu>";

        // Comment text is not trimmed by `comment_to_value`, so the padding is preserved.
        assert_eq!(
            parse_with(source, true, false)?,
            content_tag("nu", indexmap! {}, &[content_comment(" hidden ")])
        );

        Ok(())
    }

    #[test]
    fn drops_processing_instructions_by_default() -> Result<(), roxmltree::Error> {
        let source = "<nu><?target value?></nu>";

        assert_eq!(parse(source)?, content_tag("nu", indexmap! {}, &[]));

        Ok(())
    }

    #[test]
    fn keeps_processing_instructions_when_requested() -> Result<(), roxmltree::Error> {
        let source = "<nu><?target value?></nu>";

        assert_eq!(
            parse_with(source, false, true)?,
            content_tag("nu", indexmap! {}, &[content_pi("target", "value")])
        );

        Ok(())
    }

    #[test]
    fn test_examples() -> nu_test_support::Result {
        nu_test_support::test().examples(FromXml)
    }

    #[test]
    fn test_content_type_metadata() {
        let mut engine_state = Box::new(EngineState::new());
        let delta = {
            let mut working_set = StateWorkingSet::new(&engine_state);

            working_set.add_decl(Box::new(FromXml {}));
            working_set.add_decl(Box::new(Metadata {}));
            working_set.add_decl(Box::new(MetadataSet {}));
            working_set.add_decl(Box::new(Reject {}));

            working_set.render()
        };

        engine_state
            .merge_delta(delta)
            .expect("Error merging delta");

        let cmd = r#"'<?xml version="1.0" encoding="UTF-8"?>
<note>
 <remember>Event</remember>
</note>' | metadata set --content-type 'application/xml' --path-columns [name] | from xml | metadata | reject span | $in"#;
        let result = eval_pipeline_without_terminal_expression(
            cmd,
            std::env::temp_dir().as_ref(),
            &mut engine_state,
        );
        // Only `path_columns` is expected to remain: `from xml` resets the content type and the
        // pipeline rejects `span`.
        assert_eq!(
            Value::test_record(
                record!("path_columns" => Value::test_list(vec![Value::test_string("name")]))
            ),
            result.expect("There should be a result")
        )
    }
}