biodivine_xml_doc/
document.rs

1use crate::element::{Element, ElementData};
2use crate::error::{Error, Result};
3use crate::parser::{DocumentParser, ReadOptions};
4use quick_xml::events::{BytesCData, BytesDecl, BytesEnd, BytesStart, BytesText, Event};
5use quick_xml::Writer;
6use std::collections::BTreeMap;
7use std::fs::File;
8use std::io::{Read, Write};
9use std::iter::FromIterator;
10use std::path::Path;
11use std::str::FromStr;
12
13/// Represents an XML node.
14#[derive(Debug)]
15pub enum Node {
16    /// XML Element
17    Element(Element),
18    /// XML Character Data ([specification](https://www.w3.org/TR/xml/#syntax))
19    Text(String),
20    /// Comments ([specification](https://www.w3.org/TR/xml/#sec-comments))
21    Comment(String),
22    /// CDATA ([specification](https://www.w3.org/TR/xml/#sec-cdata-sect))
23    CData(String),
24    /// Processing Instruction ([specification](https://www.w3.org/TR/xml/#sec-pi))
25    PI(String),
26    /// Document Type Declaration ([specification](https://www.w3.org/TR/xml/#sec-prolog-dtd))
27    DocType(String),
28}
29
30impl Node {
31    /// Useful to use inside `filter_map`.
32    ///
33    /// ```rust
34    /// use biodivine_xml_doc::{Document, Element};
35    ///
36    /// let mut doc = Document::parse_str(r#"<?xml version="1.0" encoding="UTF-8"?>
37    /// <config>
38    ///     Random Text
39    ///     <max>1</max>
40    /// </config>
41    /// "#).unwrap();
42    ///
43    /// let elems: Vec<Element> = doc
44    ///     .root_element()
45    ///     .unwrap()
46    ///     .children(&doc)
47    ///     .iter()
48    ///     .filter_map(|n| n.as_element())
49    ///     .collect();
50    /// ```
51    pub fn as_element(&self) -> Option<Element> {
52        match self {
53            Self::Element(elem) => Some(*elem),
54            _ => None,
55        }
56    }
57
58    pub(crate) fn build_text_content<'a>(&self, doc: &'a Document, buf: &'a mut String) {
59        match self {
60            Node::Element(elem) => elem.build_text_content(doc, buf),
61            Node::Text(text) => buf.push_str(text),
62            Node::CData(text) => buf.push_str(text),
63            Node::PI(text) => buf.push_str(text),
64            _ => {}
65        }
66    }
67
68    /// Returns content if node is `Text`, `CData`, or `PI`.
69    /// If node is `Element`, return [Element::text_content()]
70    ///
71    /// Implementation of [Node.textContent](https://developer.mozilla.org/en-US/docs/Web/API/Node/textContent)
72    pub fn text_content(&self, doc: &Document) -> String {
73        let mut buf = String::new();
74        self.build_text_content(doc, &mut buf);
75        buf
76    }
77}
78
79/// Represents a XML document or a document fragment.
80///
81/// To build a document from scratch, use [`Document::new`].
82///
83/// To read and modify an existing document, use [parse_*](`Document#parsing`) methods.
84///
85/// To write the document, use [write_*](`Document#writing`) methods.
86///
87/// # Examples
88/// ```rust
89/// use biodivine_xml_doc::Document;
90///
91/// let mut doc = Document::parse_str(r#"<?xml version="1.0" encoding="UTF-8"?>
92/// <package>
93///     <metadata>
94///         <author>Lewis Carol</author>
95///     </metadata>
96/// </package>
97/// "#).unwrap();
98/// let author_elem = doc
99///   .root_element()
100///   .unwrap()
101///   .find(&doc, "metadata")
102///   .unwrap()
103///   .find(&doc, "author")
104///   .unwrap();
105/// author_elem.set_text_content(&mut doc, "Lewis Carroll");
106/// let xml = doc.write_str();
107/// ```
108///
109
110#[derive(Debug)]
111pub struct Document {
112    pub(crate) counter: usize, // == self.store.len()
113    pub(crate) store: Vec<ElementData>,
114    container: Element,
115
116    pub(crate) version: String,
117    pub(crate) standalone: bool,
118}
119
120impl Default for Document {
121    fn default() -> Self {
122        Document::new()
123    }
124}
125
126impl Document {
127    /// Create a blank new xml document.
128    pub fn new() -> Document {
129        let (container, container_data) = Element::container();
130        Document {
131            counter: 1, // because container is id 0
132            store: vec![container_data],
133            container,
134            version: String::from("1.0"),
135            standalone: false,
136        }
137    }
138
139    /// Get 'container' element of Document.
140    ///
141    /// The document uses an invisible 'container' element
142    /// which it uses to manage its root nodes.
143    ///
144    /// Its parent is None, and trying to change its parent will
145    /// return [`Error::ContainerCannotMove`].
146    ///
147    /// For the container element, only its `children` is relevant.
148    /// Other attributes are not used.
149    pub fn container(&self) -> Element {
150        self.container
151    }
152
153    /// Returns `true` if document doesn't have any nodes.
154    /// Returns `false` if you added a node or parsed an xml.
155    ///
156    /// You can only call `parse_*()` if document is empty.
157    pub fn is_empty(&self) -> bool {
158        self.store.len() == 1
159    }
160
161    /// Get root nodes of document.
162    pub fn root_nodes(&self) -> &Vec<Node> {
163        self.container.children(self)
164    }
165
166    /// Get first root node that is an element.
167    pub fn root_element(&self) -> Option<Element> {
168        self.container.child_elements(self).first().copied()
169    }
170
171    /// Push a node to end of root nodes.
172    /// If doc has no [`Element`], pushing a [`Node::Element`] is
173    /// equivalent to setting it as root element.
174    pub fn push_root_node(&mut self, node: Node) -> Result<()> {
175        let elem = self.container;
176        elem.push_child(self, node)
177    }
178}
179
180/// &nbsp;
181/// # Parsing
182///
183/// Below are methods for parsing xml.
184/// Parsing from string, file, and reader is supported.
185///
186/// Call `parse_*_with_opts` with custom [`ReadOptions`] to change parser behaviour.
187/// Otherwise, [`ReadOptions::default()`] is used.
188///
189/// # Errors
190/// - [`Error::CannotDecode`]: Could not decode XML. XML declaration may have invalid encoding value.
191/// - [`Error::MalformedXML`]: Could not read XML.
192/// - [`Error::Io`]: IO Error
193impl Document {
194    pub fn parse_str(str: &str) -> Result<Document> {
195        DocumentParser::parse_reader(str.as_bytes(), ReadOptions::default())
196    }
197    pub fn parse_str_with_opts(str: &str, opts: ReadOptions) -> Result<Document> {
198        DocumentParser::parse_reader(str.as_bytes(), opts)
199    }
200
201    pub fn parse_file<P: AsRef<Path>>(path: P) -> Result<Document> {
202        let file = File::open(path)?;
203        DocumentParser::parse_reader(file, ReadOptions::default())
204    }
205    pub fn parse_file_with_opts<P: AsRef<Path>>(path: P, opts: ReadOptions) -> Result<Document> {
206        let file = File::open(path)?;
207        DocumentParser::parse_reader(file, opts)
208    }
209
210    pub fn parse_reader<R: Read>(reader: R) -> Result<Document> {
211        DocumentParser::parse_reader(reader, ReadOptions::default())
212    }
213    pub fn parse_reader_with_opts<R: Read>(reader: R, opts: ReadOptions) -> Result<Document> {
214        DocumentParser::parse_reader(reader, opts)
215    }
216}
217
/// Options that control how a document is serialized.
///
/// Use [`WriteOptions::default()`] for the standard settings and override
/// individual fields as needed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct WriteOptions {
    /// Byte character to indent with. (default: `b' '`)
    pub indent_char: u8,
    /// How many `indent_char` should be used for one level of indent. (default: 2)
    pub indent_size: usize,
    /// Whether the XML declaration should be written at the top. (default: `true`)
    pub write_decl: bool,
}
227
228impl Default for WriteOptions {
229    fn default() -> Self {
230        WriteOptions {
231            indent_char: b' ',
232            indent_size: 2,
233            write_decl: true,
234        }
235    }
236}
237
238/// &nbsp;
239/// # Writing
240///
241/// Below are methods for writing xml.
242/// The XML will be written in UTF-8.
243impl Document {
244    pub fn write_file<P: AsRef<Path>>(&self, path: P) -> Result<()> {
245        self.write_file_with_opts(path, WriteOptions::default())
246    }
247    pub fn write_file_with_opts<P: AsRef<Path>>(&self, path: P, opts: WriteOptions) -> Result<()> {
248        let mut file = File::create(path)?;
249        self.write_with_opts(&mut file, opts)
250    }
251
252    pub fn write_str(&self) -> Result<String> {
253        self.write_str_with_opts(WriteOptions::default())
254    }
255    pub fn write_str_with_opts(&self, opts: WriteOptions) -> Result<String> {
256        let mut buf: Vec<u8> = Vec::with_capacity(200);
257        self.write_with_opts(&mut buf, opts)?;
258        Ok(String::from_utf8(buf)?)
259    }
260
261    pub fn write(&self, writer: &mut impl Write) -> Result<()> {
262        self.write_with_opts(writer, WriteOptions::default())
263    }
264    pub fn write_with_opts(&self, writer: &mut impl Write, opts: WriteOptions) -> Result<()> {
265        let container = self.container();
266        let mut writer = Writer::new_with_indent(writer, opts.indent_char, opts.indent_size);
267        if opts.write_decl {
268            self.write_decl(&mut writer)?;
269        }
270        self.write_nodes(&mut writer, container.children(self))?;
271        writer.write_event(Event::Eof)?;
272        Ok(())
273    }
274
275    fn write_decl(&self, writer: &mut Writer<impl Write>) -> Result<()> {
276        let standalone = match self.standalone {
277            true => Some("yes"),
278            false => None,
279        };
280        writer.write_event(Event::Decl(BytesDecl::new(
281            self.version.as_str(),
282            Some("UTF-8"),
283            standalone,
284        )))?;
285        Ok(())
286    }
287
288    fn write_nodes(&self, writer: &mut Writer<impl Write>, nodes: &[Node]) -> Result<()> {
289        for node in nodes {
290            match node {
291                Node::Element(eid) => self.write_element(writer, *eid)?,
292                Node::Text(text) => writer.write_event(Event::Text(BytesText::new(text)))?,
293                Node::DocType(text) => writer.write_event(Event::DocType(BytesText::new(text)))?,
294                // Comment, CData, and PI content is not escaped.
295                Node::Comment(text) => {
296                    writer.write_event(Event::Comment(BytesText::from_escaped(text)))?
297                }
298                Node::CData(text) => writer.write_event(Event::CData(BytesCData::new(text)))?,
299                Node::PI(text) => writer.write_event(Event::PI(BytesText::from_escaped(text)))?,
300            };
301        }
302        Ok(())
303    }
304
305    fn write_element(&self, writer: &mut Writer<impl Write>, element: Element) -> Result<()> {
306        let name_str = element.full_name(self);
307        let mut start = BytesStart::new(name_str);
308        // The copy in BTreeMap ensures that we have a deterministic iteration order.
309        let attributes = BTreeMap::from_iter(element.attributes(self).iter());
310        for (key, val) in attributes {
311            start.push_attribute((key.as_str(), val.as_str()));
312        }
313        let namespaces = BTreeMap::from_iter(element.namespace_decls(self).iter());
314        for (prefix, val) in namespaces {
315            let attr_name = if prefix.is_empty() {
316                "xmlns".to_string()
317            } else {
318                format!("xmlns:{}", prefix)
319            };
320            start.push_attribute((attr_name.as_str(), val.as_str()));
321        }
322        if element.has_children(self) {
323            writer.write_event(Event::Start(start))?;
324            self.write_nodes(writer, element.children(self))?;
325            writer.write_event(Event::End(BytesEnd::new(name_str)))?;
326        } else {
327            writer.write_event(Event::Empty(start))?;
328        }
329        Ok(())
330    }
331}
332
333impl FromStr for Document {
334    type Err = Error;
335
336    fn from_str(s: &str) -> Result<Document> {
337        Document::parse_str(s)
338    }
339}
340
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that parsing `xml` with `opts` is rejected with `Error::CannotDecode`.
    fn assert_cannot_decode(xml: &str, opts: &ReadOptions) {
        assert!(matches!(
            Document::parse_str_with_opts(xml, opts.clone()),
            Err(Error::CannotDecode)
        ));
    }

    #[test]
    fn test_add_element() {
        let source = r#"<?xml version="1.0" encoding="UTF-8"?>
        <basic>
            Text
            <c />
        </basic>
        "#;
        let mut doc: Document = source.parse().unwrap();
        let root_nodes = doc.container().children(&doc);
        let basic = root_nodes.first().unwrap().as_element().unwrap();

        // Append a fresh <p> element to <basic>.
        let p = Element::new(&mut doc, "p");
        basic.push_child(&mut doc, Node::Element(p)).unwrap();

        // The new element must know its parent and be the last child of it.
        assert_eq!(p.parent(&doc).unwrap(), basic);
        let last_child = basic.children(&doc).last().unwrap().as_element().unwrap();
        assert_eq!(p, last_child);
    }

    #[test]
    fn test_enforce_encoding() {
        // Without an encoding requirement this document parses fine, but it is declared
        // as US-ASCII, so it must be rejected whenever UTF-8 is explicitly required.
        let ascii_xml = "<?xml version=\"1.0\" encoding=\"US-ASCII\"?><test></test>";
        assert!(Document::parse_str(ascii_xml).is_ok());

        let mut opts = ReadOptions::default();
        opts.enforce_encoding = true;
        // Enforcement is on but no encoding was given, so this always fails.
        assert_cannot_decode(ascii_xml, &opts);

        // Requesting the declared encoding makes it work.
        opts.encoding = Some("US-ASCII".to_string());
        let doc = Document::parse_str_with_opts(ascii_xml, opts.clone()).unwrap();
        assert_eq!(doc.root_element().unwrap().name(&doc), "test");

        // Requesting any other encoding fails again.
        opts.encoding = Some("UTF-8".to_string());
        assert_cannot_decode(ascii_xml, &opts);

        // Repeat the scenario with a UTF-8 document, since UTF gets special treatment
        // in the library logic.
        let utf8_xml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><test></test>";
        assert!(Document::parse_str(utf8_xml).is_ok());

        let mut opts = ReadOptions::default();
        opts.enforce_encoding = true;
        assert_cannot_decode(utf8_xml, &opts);

        opts.encoding = Some("US-ASCII".to_string());
        assert_cannot_decode(utf8_xml, &opts);

        opts.encoding = Some("UTF-8".to_string());
        let doc = Document::parse_str_with_opts(utf8_xml, opts.clone()).unwrap();
        assert_eq!(doc.root_element().unwrap().name(&doc), "test");
    }
}