xmltojson/
lib.rs

1/*
2 * This library helps convert an XML String into a serde_json::Value which can be
3 * used to generate JSON
4 */
5
6#[cfg(test)]
7#[macro_use]
8extern crate serde_json;
9
10use log::*;
11use quick_xml::escape::resolve_predefined_entity;
12use quick_xml::events::Event;
13use quick_xml::Reader;
14use serde_json::{to_value, Map, Value};
15use std::io::BufRead;
16use std::mem::take;
17
/// Error type carried by the `Result` that [`to_json`] returns.
///
/// The type has no fields, so it conveys no detail beyond "the conversion failed".
/// It implements [`std::fmt::Display`] and [`std::error::Error`] so that callers can
/// propagate it with `?` into `Box<dyn std::error::Error>` and similar containers.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct Error {}

impl std::fmt::Display for Error {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str("failed to convert XML to JSON")
    }
}

impl std::error::Error for Error {}
20
21trait AttrMap {
22    fn insert_text(&mut self, value: &Value) -> Option<Value>;
23    fn insert_text_node(&mut self, value: Value);
24}
25
26impl AttrMap for Map<String, Value> {
27    fn insert_text(&mut self, value: &Value) -> Option<Value> {
28        if !self.is_empty() {
29            if value.is_string() {
30                self.insert_text_node(value.clone());
31            }
32            if let Ok(attrs) = to_value(take(self)) {
33                return Some(attrs);
34            }
35        }
36        None
37    }
38
39    fn insert_text_node(&mut self, value: Value) {
40        self.insert("#text".to_string(), value);
41    }
42}
43
44struct NodeValues {
45    node: Map<String, Value>,
46    nodes: Vec<Map<String, Value>>,
47    nodes_are_map: Vec<bool>,
48    values: Vec<Value>,
49}
50
51impl NodeValues {
52    fn new() -> Self {
53        Self {
54            values: Vec::new(),
55            node: Map::new(),
56            nodes: Vec::new(),
57            nodes_are_map: Vec::new(),
58        }
59    }
60
61    fn insert(&mut self, key: String, value: Value) {
62        self.node.insert(key, value);
63    }
64
65    fn insert_cdata(&mut self, value: &str) {
66        let key = "#cdata".to_string();
67        let new_value = match self.node.get(&key) {
68            Some(existing) => {
69                let mut old_value = existing.as_str().unwrap().to_string();
70                old_value.push_str(value);
71                old_value
72            }
73            None => value.to_string(),
74        };
75        self.node.insert(key, Value::String(new_value));
76    }
77
78    fn insert_text(&mut self, text: &str) {
79        if self.node.is_empty() {
80            // if directly preceded by another string, append to it
81            if let Some(value) = self.values.pop() {
82                let mut value_text = value.as_str().unwrap_or_default().to_string();
83                value_text.push_str(text);
84                self.values.push(Value::String(value_text));
85                return;
86            }
87        } else {
88            // don't insert whitespace between nodes
89            if text.trim().is_empty() {
90                return;
91            }
92
93            self.nodes.push(take(&mut self.node));
94            self.nodes_are_map.push(true);
95        }
96
97        self.values.push(Value::String(text.to_string()));
98        self.nodes_are_map.push(false);
99    }
100
101    fn remove_entry(&mut self, key: &String) -> Option<Value> {
102        if self.node.contains_key(key) {
103            debug!("Node contains `{}` already, need to convert to array", key);
104            if let Some((_, existing)) = self.node.remove_entry(key) {
105                return Some(existing);
106            }
107        }
108        None
109    }
110
111    fn get_value(&mut self) -> Value {
112        debug!("values to return: {:?}", self.values);
113        if !self.node.is_empty() {
114            self.nodes.push(take(&mut self.node));
115            self.nodes_are_map.push(true);
116        }
117
118        if !self.nodes.is_empty() {
119            // If we had collected some non-whitespace text along the way, that
120            // needs to be inserted so we don't lose it
121
122            if self.nodes.len() == 1 && self.values.len() <= 1 {
123                if self.values.len() == 1 {
124                    let value = self.values.remove(0);
125                    let text = value.as_str().unwrap_or_default().trim();
126                    if !text.is_empty() {
127                        self.nodes[0].insert_text_node(Value::String(text.to_string()));
128                    }
129                }
130                debug!("returning node instead: {:?}", self.nodes[0]);
131                return to_value(&self.nodes[0]).expect("Failed to #to_value() a node!");
132            }
133            for (index, node_is_map) in self.nodes_are_map.iter().enumerate() {
134                if *node_is_map {
135                    self.values
136                        .insert(index, Value::Object(self.nodes.remove(0)));
137                }
138            }
139        }
140
141        // trim any values left, removing empty strings
142        self.values = self
143            .values
144            .clone()
145            .into_iter()
146            .filter_map(|value| {
147                if value.is_string() {
148                    let trimmed = value.as_str().unwrap_or_default().trim();
149                    if trimmed.is_empty() {
150                        return None;
151                    }
152                    return Some(Value::String(trimmed.to_string()));
153                }
154                Some(value)
155            })
156            .collect();
157
158        match self.values.len() {
159            0 => Value::Null,
160            1 => self.values.pop().unwrap(),
161            _ => Value::Array(take(&mut self.values)),
162        }
163    }
164}
165
166pub fn read<R: BufRead>(reader: &mut Reader<R>, depth: u64) -> Value {
167    let mut buf = Vec::new();
168    let mut nodes = NodeValues::new();
169    debug!("Parsing at depth: {}", depth);
170
171    loop {
172        match reader.read_event_into(&mut buf) {
173            Ok(Event::Start(ref e)) => {
174                if let Ok(name) = String::from_utf8(e.name().into_inner().to_vec()) {
175                    let mut child = read(reader, depth + 1);
176                    let mut attrs = Map::new();
177                    debug!("{} children: {:?}", name, child);
178
179                    let _ = e
180                        .attributes()
181                        .map(|a| {
182                            if let Ok(attr) = a {
183                                let key = String::from_utf8(attr.key.into_inner().to_vec());
184                                let value = String::from_utf8(attr.value.to_vec());
185
186                                // Only bother adding the attribute if both key and value are valid utf8
187                                if let (Ok(key), Ok(value)) = (key, value) {
188                                    let key = format!("@{}", key);
189                                    let value = Value::String(value);
190
191                                    // If the child is already an object, that's where the insert
192                                    // should happen
193                                    if child.is_object() {
194                                        child.as_object_mut().unwrap().insert(key, value);
195                                    } else {
196                                        attrs.insert(key, value);
197                                    }
198                                }
199                            }
200                        })
201                        .collect::<Vec<_>>();
202
203                    if let Some(mut existing) = nodes.remove_entry(&name) {
204                        let mut entries: Vec<Value> = vec![];
205
206                        if existing.is_array() {
207                            let existing = existing.as_array_mut().unwrap();
208                            while !existing.is_empty() {
209                                entries.push(existing.remove(0));
210                            }
211                        } else {
212                            entries.push(existing);
213                        }
214
215                        /*
216                         * nodes with attributes need to be handled special
217                         */
218                        if let Some(attrs) = attrs.insert_text(&child) {
219                            entries.push(attrs);
220                        } else {
221                            entries.push(child);
222                        }
223
224                        nodes.insert(name, Value::Array(entries));
225                    /*
226                     * nodes with attributes need to be handled special
227                     */
228                    } else if let Some(attrs) = attrs.insert_text(&child) {
229                        nodes.insert(name, attrs);
230                    } else {
231                        nodes.insert(name, child);
232                    }
233                }
234            }
235            Ok(Event::Text(ref e)) => {
236                if let Ok(decoded) = e.decode() {
237                    nodes.insert_text(&decoded);
238                }
239            }
240            Ok(Event::CData(ref e)) => {
241                if let Ok(decoded) = e.decode() {
242                    nodes.insert_cdata(&decoded);
243                }
244            }
245            Ok(Event::GeneralRef(ref e)) => {
246                if let Ok(Some(ch)) = e.resolve_char_ref() {
247                    nodes.insert_text(&ch.to_string());
248                } else if let Ok(decoded) = e.decode() {
249                    if let Some(entity) = resolve_predefined_entity(&decoded) {
250                        nodes.insert_text(entity);
251                    }
252                }
253            }
254            Ok(Event::End(ref _e)) => break,
255            Ok(Event::Eof) => break,
256            _ => (),
257        }
258    }
259    nodes.get_value()
260}
261
262/**
263 * to_json() will take an input string and attempt to convert it into a form
264 * of JSON
265 */
266pub fn to_json(xml: &str) -> Result<Value, Error> {
267    let mut reader = Reader::from_str(xml);
268    let config = reader.config_mut();
269    config.expand_empty_elements = true;
270    // when trimming at the config level, we'd loose spaces between escaped entities
271
272    Ok(read(&mut reader, 0))
273}
274
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `right` is `Ok` and that its value equals `left`.
    fn json_eq(left: Value, right: Result<Value, Error>) {
        assert!(right.is_ok());
        let actual = right.unwrap();
        assert_eq!(left, actual);
    }

    #[test]
    fn single_node() {
        let expected = json!({ "e": null });
        let actual = to_json("<e></e>");
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_text() {
        let expected = json!({ "e": "foo" });
        let actual = to_json("<e>foo</e>");
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_attr() {
        let expected = json!({ "e": { "@name": "value" } });
        let actual = to_json(r#"<e name="value"></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_attr_and_text() {
        let expected = json!({ "e": { "@name": "value", "#text": "text" } });
        let actual = to_json(r#"<e name="value">text</e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_children() {
        let expected = json!({ "e": { "a": "text1", "b": "text2" } });
        let actual = to_json(r#"<e> <a>text1</a> <b>text2</b> </e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_multiple_identical_children() {
        let expected = json!({ "e": { "a": ["text", "text"] } });
        let actual = to_json(r#"<e><a>text</a><a>text</a></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_n_identical_children() {
        let expected = json!({ "e": { "a": ["text1", "text2", "text3"] } });
        let actual = to_json(r#"<e><a>text1</a><a>text2</a><a>text3</a></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_text_and_child() {
        let expected = json!({ "e": { "#text": "lol", "a": "text" } });
        let actual = to_json(r#"<e> lol <a>text</a></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_just_text() {
        let expected = json!({ "a": "hello" });
        let actual = to_json(r#"<a>hello</a>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_attrs_and_text() {
        let expected = json!({ "a": { "@x": "y", "#text": "hello" } });
        let actual = to_json(r#"<a x="y">hello</a>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn nested_nodes_with_attrs() {
        let expected = json!({
            "a": {
                "@id": "a",
                "b": { "@id": "b", "#text": "hey!" }
            }
        });
        let actual = to_json(r#"<a id="a"><b id="b">hey!</b></a>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_nested_text() {
        let expected = json!({ "a": ["x", { "c": null }, "y"] });
        let actual = to_json(r#"<a>x<c/>y</a>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_empty_attrs() {
        let expected = json!({ "x": { "@u": "" } });
        let actual = to_json(r#"<x u=""/>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn some_basic_html() {
        let expected = json!({
            "html": {
                "head": {
                    "title": "Xml/Json",
                    "meta": { "@name": "x", "@content": "y" }
                },
                "body": null
            }
        });
        let actual = to_json(
            r#"<html><head><title>Xml/Json</title><meta name="x" content="y"/></head><body/></html>"#,
        );
        json_eq(expected, actual);
    }

    #[test]
    fn more_complex_html() {
        let expected = json!({
            "ol": {
                "@class": "xoxo",
                "li": [
                    {
                        "#text": "Subject 1",
                        "ol": { "li": ["subpoint a", "subpoint b"] }
                    },
                    {
                        "span": "Subject 2",
                        "ol": {
                            "@compact": "compact",
                            "li": ["subpoint c", "subpoint d"]
                        }
                    }
                ]
            }
        });
        let actual = to_json(
            r#"<ol class="xoxo"><li>Subject 1     <ol><li>subpoint a</li><li>subpoint b</li></ol></li><li><span>Subject 2</span><ol compact="compact"><li>subpoint c</li><li>subpoint d</li></ol></li></ol>"#,
        );
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_cdata() {
        let expected = json!({ "e": { "#cdata": " .. some data .. " } });
        let actual = to_json(r#"<e><![CDATA[ .. some data .. ]]></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_cdata_and_siblings() {
        let expected = json!({
            "e": {
                "a": null,
                "#cdata": " .. some data .. ",
                "b": null
            }
        });
        let actual = to_json(r#"<e><a/><![CDATA[ .. some data .. ]]><b/></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_cdata_inside_text() {
        let expected = json!({
            "e": ["some text", { "#cdata": " .. some data .. " }, "more text"]
        });
        let actual = to_json(r#"<e>  some text  <![CDATA[ .. some data .. ]]>  more text</e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_child_cdata_and_text() {
        let expected = json!({
            "e": {
                "#text": "some text",
                "#cdata": " .. some data .. ",
                "a": null
            }
        });
        let actual = to_json(r#"<e>  some text  <![CDATA[ .. some data .. ]]><a/></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_duplicate_cdata() {
        let expected = json!({
            "e": { "#cdata": " .. some data ..  .. more data .. " }
        });
        let actual =
            to_json(r#"<e><![CDATA[ .. some data .. ]]><![CDATA[ .. more data .. ]]></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_empty() {
        let expected = json!(null);
        let actual = to_json("");
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_duplicate_text() {
        let expected = json!({ "e": { "a": ["x", "y"] } });
        let actual = to_json("<e><a>x</a><a>y</a></e>");
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_duplicate_attrs_and_text() {
        let expected = json!({
            "e": {
                "a": [
                    { "#text": "x", "@u": "x" },
                    { "#text": "y", "@u": "y" }
                ]
            }
        });
        let actual = to_json(r#"<e><a u="x">x</a><a u="y">y</a></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_text_and_siblings() {
        let expected = json!({ "e": ["x", { "a": { "@u": "y" } }, "z"] });
        let actual = to_json(r#"<e>x <a u="y"/> z</e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_text_and_siblings_mixed() {
        let expected = json!({ "e": ["a", { "x": "b" }, "c", { "x": "d" }] });
        let actual = to_json(r#"<e>a <x>b</x> c <x>d</x></e>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_cdata_only() {
        let expected = json!({ "#cdata": " .. some data .. " });
        let actual = to_json(r#"<![CDATA[ .. some data .. ]]>"#);
        json_eq(expected, actual);
    }

    #[test]
    fn node_with_entities() {
        let expected = json!({ "pets": "A cat & a dog" });
        let actual = to_json(r#"<pets>A cat &amp; a dog</pets>"#);
        json_eq(expected, actual);
    }
}
603        );
604    }
605}