quickxml_to_serde/
lib.rs

1#![allow(clippy::items_after_test_module)]
2#![allow(clippy::single_match)]
3#![allow(clippy::single_char_pattern)]
4#![allow(clippy::needless_borrow)]
5#![allow(clippy::ptr_arg)]
6//! # quickxml_to_serde
7//! Fast and flexible conversion from XML to JSON using [quick-xml](https://github.com/tafia/quick-xml)
8//! and [serde](https://github.com/serde-rs/json). Inspired by [node2object](https://github.com/vorot93/node2object).
9//!
10//! This crate converts XML elements, attributes and text nodes directly into corresponding JSON structures.
11//! Some common usage scenarios would be converting XML into JSON for loading into No-SQL databases
12//! or sending it to the front end application.
13//!
14//! Because of the richness and flexibility of XML some conversion behavior is configurable:
15//! - attribute name prefixes
16//! - naming of text nodes
17//! - number format conversion
18//!
19//! ## Usage example
20//! ```
21//! extern crate quickxml_to_serde;
22//! use quickxml_to_serde::{xml_string_to_json, Config, NullValue};
23//!
24//! fn main() {
25//!    let xml = r#"<a attr1="1"><b><c attr2="001">some text</c></b></a>"#;
26//!    let conf = Config::new_with_defaults();
27//!    let json = xml_string_to_json(xml.to_owned(), &conf);
28//!    println!("{}", json.expect("Malformed XML").to_string());
29//!
30//!    let conf = Config::new_with_custom_values(true, "", "txt", NullValue::Null);
31//!    let json = xml_string_to_json(xml.to_owned(), &conf);
32//!    println!("{}", json.expect("Malformed XML").to_string());
33//! }
34//! ```
35//! * **Output with the default config:** `{"a":{"@attr1":1,"b":{"c":{"#text":"some text","@attr2":1}}}}`
36//! * **Output with a custom config:** `{"a":{"attr1":1,"b":{"c":{"attr2":"001","txt":"some text"}}}}`
37//!
38//! ## Additional features
39//! Use `quickxml_to_serde = { version = "0.4", features = ["json_types"] }` to enable support for enforcing JSON types
40//! for some XML nodes using xPath-like notations. Example for enforcing attribute `attr2` from the snippet above
41//! as JSON String regardless of its contents:
42//! ```
43//! use quickxml_to_serde::{Config, JsonArray, JsonType};
44//!
45//! #[cfg(feature = "json_types")]
46//! let conf = Config::new_with_defaults()
47//!            .add_json_type_override("/a/b/c/@attr2", JsonArray::Infer(JsonType::AlwaysString));
48//! ```
49//!
50//! ## Detailed documentation
51//! See [README](https://github.com/AlecTroemel/quickxml_to_serde) in the source repo for more examples, limitations and detailed behavior description.
52//!
53//! ## Testing your XML files
54//!
//! If you want to see how your XML files are converted into JSON, place them into the `./test_xml_files` directory
//! and run `cargo test`. They will be converted into JSON and saved in the same directory.
57
58extern crate minidom;
59extern crate serde_json;
60
61#[cfg(feature = "regex_path")]
62extern crate regex;
63
64use minidom::{Element, Error};
65use serde_json::{Map, Number, Value};
66#[cfg(feature = "json_types")]
67use std::collections::HashMap;
68use std::str::FromStr;
69
70#[cfg(feature = "regex_path")]
71use regex::Regex;
72
73#[cfg(test)]
74mod tests;
75
/// Defines how empty elements like `<x />` should be handled.
///
/// * `Ignore` - exclude the element from the JSON output
/// * `Null` - emit `"x":null`
/// * `EmptyObject` - emit `"x":{}`
///
/// `EmptyObject` is the default option and is how empty elements were handled prior to v.0.4.
/// Using `Ignore` on an XML document with an empty root element falls back to the `Null` option.
/// E.g. both `<a><x/></a>` and `<a/>` are converted into `{"a":null}`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum NullValue {
    /// Leave the empty element out of the JSON output.
    Ignore,
    /// Represent the empty element as JSON `null`.
    Null,
    /// Represent the empty element as an empty JSON object `{}`.
    EmptyObject,
}
87
88/// Defines how the values of this Node should be converted into a JSON array with the underlying types.
89/// * `Infer` - the nodes are converted into a JSON array only if there are multiple identical elements.
90/// E.g. `<a><b>1</b></a>` becomes a map `{"a": {"b": 1 }}` and `<a><b>1</b><b>2</b><b>3</b></a>` becomes
91/// an array `{"a": {"b": [1, 2, 3] }}`
92/// * `Always` - the nodes are converted into a JSON array regardless of how many there are.
93/// E.g. `<a><b>1</b></a>` becomes an array with a single value `{"a": {"b": [1] }}` and
94/// `<a><b>1</b><b>2</b><b>3</b></a>` also becomes an array `{"a": {"b": [1, 2, 3] }}`
95#[derive(Debug)]
96pub enum JsonArray {
97    /// Convert the nodes into a JSON array even if there is only one element
98    Always(JsonType),
99    /// Convert the nodes into a JSON array only if there are multiple identical elements
100    Infer(JsonType),
101}
102
/// The path argument accepted by `Config.add_json_type_override`.
///
/// It describes how an XML path is matched when deciding whether a JSON type override applies.
/// Having a dedicated enum lets that single function accept several kinds of matching rules.
#[derive(Debug)]
pub enum PathMatcher {
    /// An absolute, literal path that begins with a slash (`/`), e.g. `/a/b/c/@d`.
    /// Values built from a `&str` through `From` get the leading slash added when it is missing.
    Absolute(String),
    /// A regular expression tested against the XML path, e.g. `(\w/)*c$`.
    /// Built from a `regex::Regex` through `From`.
    #[cfg(feature = "regex_path")]
    Regex(Regex),
}
116
117// For retro-compatibility and for syntax's sake, a string may be coerced into an absolute path.
118impl From<&str> for PathMatcher {
119    fn from(value: &str) -> Self {
120        let path_with_leading_slash = if value.starts_with("/") {
121            value.into()
122        } else {
123            ["/", value].concat()
124        };
125
126        PathMatcher::Absolute(path_with_leading_slash)
127    }
128}
129
// A compiled `Regex` is accepted as a regex path matcher.
#[cfg(feature = "regex_path")]
impl From<Regex> for PathMatcher {
    /// Wraps the regex as-is; no normalisation is applied to the pattern.
    fn from(value: Regex) -> Self {
        Self::Regex(value)
    }
}
137
/// Selects the JSON data type used for a value so that the output stays consistent.
///
/// The same kind of XML node may hold values such as `1234`, `001234` or `AB1234`, and a single
/// value is not enough to pick the right type reliably. Use this enum to tell the converter
/// which data type to apply instead of guessing.
#[derive(Debug, PartialEq, Clone)]
pub enum JsonType {
    /// Skip type inference and always emit a JSON string.
    /// E.g. `<a>1234</a>` becomes `{"a":"1234"}` and `<a>true</a>` becomes `{"a":"true"}`.
    AlwaysString,
    /// Emit JSON bool `true` for values listed in this member and `false` for everything else.
    /// The value is compared after surrounding whitespace is trimmed.
    /// E.g. `Bool(vec!["True", "true", "TRUE"])` turns any of these three values into `true`.
    Bool(Vec<&'static str>),
    /// Infer the type from the single value being converted.
    /// The result is not guaranteed to be consistent across multiple nodes.
    /// E.g. with `leading_zero_as_string` disabled `<a>1234</a>` and `<a>001234</a>` both become
    /// `{"a":1234}`, and `<a>true</a>` becomes `{"a":true}`.
    /// Make sure your values comply with JSON data types (case, range, format) to get the expected result.
    Infer,
}
157
158/// Tells the converter how to perform certain conversions.
159/// See docs for individual fields for more info.
160#[derive(Debug)]
161pub struct Config {
162    /// Numeric values starting with 0 will be treated as strings.
163    /// E.g. convert `<agent>007</agent>` into `"agent":"007"` or `"agent":7`
164    /// Defaults to `false`.
165    pub leading_zero_as_string: bool,
166    /// Prefix XML attribute names with this value to distinguish them from XML elements.
167    /// E.g. set it to `@` for `<x a="Hello!" />` to become `{"x": {"@a":"Hello!"}}`
168    /// or set it to a blank string for `{"x": {"a":"Hello!"}}`
169    /// Defaults to `@`.
170    pub xml_attr_prefix: String,
171    /// A property name for XML text nodes.
172    /// E.g. set it to `text` for `<x a="Hello!">Goodbye!</x>` to become `{"x": {"@a":"Hello!", "text":"Goodbye!"}}`
173    /// XML nodes with text only and no attributes or no child elements are converted into JSON properties with the
174    /// name of the element. E.g. `<x>Goodbye!</x>` becomes `{"x":"Goodbye!"}`
175    /// Defaults to `#text`
176    pub xml_text_node_prop_name: String,
177    /// Defines how empty elements like `<x />` should be handled.
178    pub empty_element_handling: NullValue,
179    /// A map of XML paths with their JsonArray overrides. They take precedence over the document-wide `json_type`
180    /// property. The path syntax is based on xPath: literal element names and attribute names prefixed with `@`.
181    /// The path must start with a leading `/`. It is a bit of an inconvenience to remember about it, but it saves
182    /// an extra `if`-check in the code to improve the performance.
183    /// # Example
184    /// - **XML**: `<a><b c="123">007</b></a>`
185    /// - path for `c`: `/a/b/@c`
186    /// - path for `b` text node (007): `/a/b`
187    #[cfg(feature = "json_types")]
188    pub json_type_overrides: HashMap<String, JsonArray>,
189    /// A list of pairs of regex and JsonArray overrides. They take precedence over both the document-wide `json_type`
190    /// property and the `json_type_overrides` property. The path syntax is based on xPath just like `json_type_overrides`.
191    #[cfg(feature = "regex_path")]
192    pub json_regex_type_overrides: Vec<(Regex, JsonArray)>,
193}
194
195impl Config {
196    /// Numbers with leading zero will be treated as numbers.
197    /// Prefix XML Attribute names with `@`
198    /// Name XML text nodes `#text` for XML Elements with other children
199    pub fn new_with_defaults() -> Self {
200        Config {
201            leading_zero_as_string: false,
202            xml_attr_prefix: "@".to_owned(),
203            xml_text_node_prop_name: "#text".to_owned(),
204            empty_element_handling: NullValue::EmptyObject,
205            #[cfg(feature = "json_types")]
206            json_type_overrides: HashMap::new(),
207            #[cfg(feature = "regex_path")]
208            json_regex_type_overrides: Vec::new(),
209        }
210    }
211
212    /// Create a Config object with non-default values. See the `Config` struct docs for more info.
213    pub fn new_with_custom_values(
214        leading_zero_as_string: bool,
215        xml_attr_prefix: &str,
216        xml_text_node_prop_name: &str,
217        empty_element_handling: NullValue,
218    ) -> Self {
219        Config {
220            leading_zero_as_string,
221            xml_attr_prefix: xml_attr_prefix.to_owned(),
222            xml_text_node_prop_name: xml_text_node_prop_name.to_owned(),
223            empty_element_handling,
224            #[cfg(feature = "json_types")]
225            json_type_overrides: HashMap::new(),
226            #[cfg(feature = "regex_path")]
227            json_regex_type_overrides: Vec::new(),
228        }
229    }
230
231    /// Adds a single JSON Type override rule to the current config.
232    /// # Example
233    /// - **XML**: `<a><b c="123">007</b></a>`
234    /// - path for `c`: `/a/b/@c`
235    /// - path for `b` text node (007): `/a/b`
236    /// - regex path for any `element` node: `(\w/)*element$` [requires `regex_path` feature]
237    #[cfg(feature = "json_types")]
238    pub fn add_json_type_override<P>(self, path: P, json_type: JsonArray) -> Self
239    where
240        P: Into<PathMatcher>
241    {
242        let mut conf = self;
243
244        match path.into() {
245            PathMatcher::Absolute(path) => {
246                conf.json_type_overrides.insert(path, json_type);
247            }
248            #[cfg(feature = "regex_path")]
249            PathMatcher::Regex(regex) => {
250                conf.json_regex_type_overrides.push((
251                    regex,
252                    json_type
253                ));
254            }
255        }
256
257        conf
258    }
259}
260
261impl Default for Config {
262    fn default() -> Self {
263        Config::new_with_defaults()
264    }
265}
266
267/// Returns the text as one of `serde::Value` types: int, float, bool or string.
268fn parse_text(text: &str, leading_zero_as_string: bool, json_type: &JsonType) -> Value {
269    let text = text.trim();
270
271    // enforce JSON String data type regardless of the underlying type
272    if json_type == &JsonType::AlwaysString {
273        return Value::String(text.into());
274    }
275
276    // enforce JSON Bool data type
277    #[cfg(feature = "json_types")]
278    if let JsonType::Bool(true_values) = json_type {
279        if true_values.contains(&text) {
280            // any values matching the `true` list are bool/true
281            return Value::Bool(true);
282        } else {
283            // anything else is false
284            return Value::Bool(false);
285        }
286    }
287
288    // ints
289    if let Ok(v) = text.parse::<u64>() {
290        // don't parse octal numbers and those with leading 0
291        // `text` value "0" will always be converted into number 0, "0000" may be converted
292        // into 0 or "0000" depending on `leading_zero_as_string`
293        if leading_zero_as_string && text.starts_with("0") && (v != 0 || text.len() > 1) {
294            return Value::String(text.into());
295        }
296        return Value::Number(Number::from(v));
297    }
298
299    // floats
300    if let Ok(v) = text.parse::<f64>() {
301        if text.starts_with("0") && !text.starts_with("0.") {
302            return Value::String(text.into());
303        }
304        if let Some(val) = Number::from_f64(v) {
305            return Value::Number(val);
306        }
307    }
308
309    // booleans
310    if let Ok(v) = text.parse::<bool>() {
311        return Value::Bool(v);
312    }
313
314    Value::String(text.into())
315}
316
317/// Converts an XML Element into a JSON property
318fn convert_node(el: &Element, config: &Config, path: &String) -> Option<Value> {
319    // add the current node to the path
320    #[cfg(feature = "json_types")]
321    let path = [path, "/", el.name()].concat();
322
323    // get the json_type for this node
324    let (_, json_type_value) = get_json_type(config, &path);
325
326    // is it an element with text?
327    if el.text().trim() != "" {
328        // process node's attributes, if present
329        if el.attrs().count() > 0 {
330            Some(Value::Object(
331                el.attrs()
332                    .map(|(k, v)| {
333                        // add the current node to the path
334                        #[cfg(feature = "json_types")]
335                        let path = [path.clone(), "/@".to_owned(), k.to_owned()].concat();
336                        // get the json_type for this node
337                        #[cfg(feature = "json_types")]
338                        let (_, json_type_value) = get_json_type(config, &path);
339                        (
340                            [config.xml_attr_prefix.clone(), k.to_owned()].concat(),
341                            parse_text(&v, config.leading_zero_as_string, &json_type_value),
342                        )
343                    })
344                    .chain(vec![(
345                        config.xml_text_node_prop_name.clone(),
346                        parse_text(
347                            &el.text()[..],
348                            config.leading_zero_as_string,
349                            &json_type_value,
350                        ),
351                    )])
352                    .collect(),
353            ))
354        } else {
355            Some(parse_text(
356                &el.text()[..],
357                config.leading_zero_as_string,
358                &json_type_value,
359            ))
360        }
361    } else {
362        // this element has no text, but may have other child nodes
363        let mut data = Map::new();
364
365        for (k, v) in el.attrs() {
366            // add the current node to the path
367            #[cfg(feature = "json_types")]
368            let path = [path.clone(), "/@".to_owned(), k.to_owned()].concat();
369            // get the json_type for this node
370            #[cfg(feature = "json_types")]
371            let (_, json_type_value) = get_json_type(config, &path);
372            data.insert(
373                [config.xml_attr_prefix.clone(), k.to_owned()].concat(),
374                parse_text(&v, config.leading_zero_as_string, &json_type_value),
375            );
376        }
377
378        // process child element recursively
379        for child in el.children() {
380            match convert_node(child, config, &path) {
381                Some(val) => {
382                    let name = &child.name().to_string();
383
384                    #[cfg(feature = "json_types")]
385                    let path = [path.clone(), "/".to_owned(), name.clone()].concat();
386                    let (json_type_array, _) = get_json_type(config, &path);
387                    // does it have to be an array?
388                    if json_type_array || data.contains_key(name) {
389                        // was this property converted to an array earlier?
390                        if data.get(name).unwrap_or(&Value::Null).is_array() {
391                            // add the new value to an existing array
392                            data.get_mut(name)
393                                .unwrap()
394                                .as_array_mut()
395                                .unwrap()
396                                .push(val);
397                        } else {
398                            // convert the property to an array with the existing and the new values
399                            let new_val = match data.remove(name) {
400                                None => vec![val],
401                                Some(temp) => vec![temp, val],
402                            };
403                            data.insert(name.clone(), Value::Array(new_val));
404                        }
405                    } else {
406                        // this is the first time this property is encountered and it doesn't
407                        // have to be an array, so add it as-is
408                        data.insert(name.clone(), val);
409                    }
410                }
411                _ => (),
412            }
413        }
414
415        // return the JSON object if it's not empty
416        if !data.is_empty() {
417            return Some(Value::Object(data));
418        }
419
420        // empty objects are treated according to config rules set by the caller
421        match config.empty_element_handling {
422            NullValue::Null => Some(Value::Null),
423            NullValue::EmptyObject => Some(Value::Object(data)),
424            NullValue::Ignore => None,
425        }
426    }
427}
428
429fn xml_to_map(e: &Element, config: &Config) -> Value {
430    let mut data = Map::new();
431    data.insert(
432        e.name().to_string(),
433        convert_node(&e, &config, &String::new()).unwrap_or(Value::Null),
434    );
435    Value::Object(data)
436}
437
438/// Converts the given XML string into `serde::Value` using settings from `Config` struct.
439pub fn xml_str_to_json(xml: &str, config: &Config) -> Result<Value, Error> {
440    let root = Element::from_str(xml)?;
441    Ok(xml_to_map(&root, config))
442}
443
444/// Converts the given XML string into `serde::Value` using settings from `Config` struct.
445pub fn xml_string_to_json(xml: String, config: &Config) -> Result<Value, Error> {
446    xml_str_to_json(xml.as_str(), config)
447}
448
/// Looks up `path` among the absolute-path overrides in `config.json_type_overrides`.
///
/// Returns `(force_array, json_type)`: `force_array` is `true` for a `JsonArray::Always` rule
/// and `false` for a `JsonArray::Infer` rule. When no rule is stored for `path` the result is
/// `(false, &JsonType::Infer)`.
#[cfg(feature = "json_types")]
#[inline]
fn get_json_type_with_absolute_path<'conf>(config: &'conf Config, path: &String) -> (bool, &'conf JsonType) {
    match config.json_type_overrides.get(path) {
        Some(JsonArray::Always(json_type)) => (true, json_type),
        Some(JsonArray::Infer(json_type)) => (false, json_type),
        None => (false, &JsonType::Infer),
    }
}
464
/// Variant used when `json_types` is enabled and `regex_path` is not:
/// only the absolute-path overrides exist, so the lookup is delegated to them.
#[cfg(all(feature = "json_types", not(feature = "regex_path")))]
#[inline]
fn get_json_type<'conf>(config: &'conf Config, path: &String) -> (bool, &'conf JsonType) {
    get_json_type_with_absolute_path(config, path)
}
472
/// Variant used when both `json_types` and `regex_path` are enabled.
///
/// Returns `(force_array, json_type)` for `path`. The regex overrides are tried first, in the
/// order they are stored, and the first matching regex decides the result. If none matches,
/// the absolute-path overrides are consulted, which yield `(false, &JsonType::Infer)` when the
/// path has no rule there either.
#[cfg(all(feature = "json_types", feature = "regex_path"))]
#[inline]
fn get_json_type<'conf>(config: &'conf Config, path: &String) -> (bool, &'conf JsonType) {
    config
        .json_regex_type_overrides
        .iter()
        .find(|(pattern, _)| pattern.is_match(path))
        .map(|(_, rule)| match rule {
            JsonArray::Always(json_type) => (true, json_type),
            JsonArray::Infer(json_type) => (false, json_type),
        })
        .unwrap_or_else(|| get_json_type_with_absolute_path(config, path))
}
491
492/// Always returns `(false, JsonArray::Infer(JsonType::Infer)` if `json_types` feature is not enabled.
493#[cfg(not(feature = "json_types"))]
494#[inline]
495fn get_json_type<'conf>(_config: &'conf Config, _path: &String) -> (bool, &'conf JsonType) {
496    (false, &JsonType::Infer)
497}