Skip to main content

xml_canonicalization/
lib.rs

1#![deny(unsafe_code)]
2#![warn(clippy::pedantic)]
3#![warn(missing_docs)]
4#![doc = include_str!("../README.md")]
5
6use std::{
7    fs::File,
8    io::{BufRead, BufReader, BufWriter, Write},
9    path::Path,
10    sync::Arc,
11};
12
13use namespace::{DepthSensitiveMap, Namespace};
14use quick_xml::{
15    events::{BytesPI, BytesStart, BytesText, Event},
16    Reader, Writer,
17};
18use regex::Regex;
19
20mod grammars;
21mod namespace;
22
23/// [`Canonicalizer`]s take XML and return the canonicalised form of that XML.
24pub struct Canonicalizer<R, W> {
25    reader: Reader<R>,
26    writer: Option<Writer<W>>,
27}
28
29impl<R, W> Canonicalizer<R, W> {
30    /// Initialize a new [`Canonicalizer`] that reads from the provided
31    /// `reader`.
32    #[must_use]
33    pub fn read_from_reader(reader: R) -> Self {
34        tracing::trace!("Canonicalizer initialising with reader");
35        Self {
36            reader: Reader::from_reader(reader),
37            writer: None,
38        }
39    }
40}
41
42impl<'a, W> Canonicalizer<&'a [u8], W> {
43    /// Initialize a new [`Canonicalizer`] that reads from the provided
44    /// `str`.
45    #[must_use]
46    pub fn read_from_str(str: &'a str) -> Self {
47        tracing::trace!("Canonicalizer initialising from &str");
48        Self {
49            reader: Reader::from_str(str),
50            writer: None,
51        }
52    }
53}
54
55impl<W> Canonicalizer<BufReader<File>, W> {
56    /// Initialize a new [`Canonicalizer`] that reads from the provided
57    /// `str`.
58    ///
59    /// # Errors
60    ///
61    /// Returns an I/O error if the file couldn't be opened and/or read.
62    pub fn read_from_file<P: AsRef<Path>>(path: P) -> Result<Self, Arc<std::io::Error>> {
63        tracing::trace!("Canonicalizer initialising from file");
64        Ok(Self {
65            reader: Reader::from_file(path).map_err(|e| match e {
66                quick_xml::Error::Io(e) => e,
67                _ => unreachable!("no other errors are raised by quick_xml at this point"),
68            })?,
69            writer: None,
70        })
71    }
72}
73
74impl<R, W: Write> Canonicalizer<R, W> {
75    /// Set the writer for this Canonicalizer to a writer.
76    #[must_use]
77    pub fn write_to_writer(mut self, writer: W) -> Self {
78        self.writer = Some(Writer::new(writer));
79        self
80    }
81}
82
83impl<R> Canonicalizer<R, BufWriter<File>> {
84    /// Set the writer for this Canonicalizer to a file.
85    ///
86    /// # Errors
87    ///
88    /// Returns an I/O error if the file couldn't be opened and/or read.
89    pub fn write_to_file<P: AsRef<Path>>(mut self, path: P) -> Result<Self, std::io::Error> {
90        self.writer = Some(Writer::new(BufWriter::new(File::create(path)?)));
91        Ok(self)
92    }
93}
94
95impl<R: BufRead, W: Write> Canonicalizer<R, W> {
96    /// Start canonicalizing the document to the writer.
97    ///
98    /// # Errors
99    ///
100    /// Returns an XML error if the XML parsed is invalid.
101    ///
102    /// # Panics
103    ///
104    /// This will panic is a writer has not been initialised with
105    /// `write_to_string`, `write_to_file`, or `write_to_writer`.
106    #[allow(clippy::too_many_lines)]
107    pub fn canonicalize(mut self, retain_comments: bool) -> Result<(), quick_xml::Error> {
108        tracing::debug!("Canonicalisation starting…");
109
110        // Set reader config
111        self.reader.config_mut().allow_unmatched_ends = false;
112        self.reader.config_mut().check_end_names = true;
113        self.reader.config_mut().expand_empty_elements = true;
114
115        let mut registered_namespaces = DepthSensitiveMap::new();
116        // Add a default blank namespace.
117        registered_namespaces.insert_at_depth(0, "_", Namespace { url: String::new() });
118        let mut writer = self
119            .writer
120            .expect("trying to canonicalize without a writer initialised");
121        let mut buf = vec![];
122        let mut depth = 0;
123        let mut hit_pi_rule = false;
124        let mut text_buf = String::new();
125        let whitespace_duplicate_regex = Regex::new(r"\n\n*").unwrap();
126        let pi_tidyup_regex = Regex::new(r"^(\S+)(?:\s*( .*)|\s*$)").unwrap();
127        loop {
128            let e = self.reader.read_event_into(&mut buf);
129            tracing::trace!("Event: {e:?}");
130            match e {
131                // The XML declaration and document type declaration (DTD) are removed.
132                Ok(Event::Decl(_) | Event::DocType(_)) => (),
133                // Empty should be unreachable as we've instructed the reader to always expand
134                Ok(Event::Empty(_)) => unreachable!(),
135                Ok(Event::PI(p)) => {
136                    hit_pi_rule = true;
137                    let p = p.into_inner();
138                    let p = String::from_utf8_lossy(&p).into_owned();
139                    let p = pi_tidyup_regex.replace_all(&p, "$1$2").into_owned();
140                    if !text_buf.is_empty() {
141                        writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
142                        text_buf.clear();
143                    }
144                    writer.write_event(Event::PI(BytesPI::new(p)))?;
145                }
146
147                // Remove all comments if needed
148                Ok(Event::Comment(c)) => {
149                    if retain_comments {
150                        if !text_buf.is_empty() {
151                            writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
152                            text_buf.clear();
153                        }
154                        writer.write_event(Event::Comment(c))?;
155                    }
156                }
157
158                Ok(Event::Text(t)) => {
159                    let text = t.into_inner();
160                    let text = String::from_utf8_lossy(&text);
161                    // Normalise whitespace
162                    let text = text.replace("\r\n", "\n");
163                    text_buf.push_str(
164                        &grammars::character_refs::canonicalize_character_references(
165                            &text,
166                            &grammars::character_refs::Situation::Content,
167                        )
168                        .unwrap(),
169                    );
170                    // Remove whitespace outside of the root tag
171                    if depth == 0 {
172                        if hit_pi_rule {
173                            // Remove duplicates only
174                            text_buf = whitespace_duplicate_regex
175                                .replace_all(&text_buf, "\n")
176                                .into_owned();
177                        } else {
178                            text_buf = text_buf.replace('\n', "");
179                        }
180                    }
181                }
182                Ok(Event::CData(c)) => {
183                    let c = c.into_inner();
184                    let c = String::from_utf8_lossy(&c);
185                    let c = c
186                        .replace('&', "&amp;")
187                        .replace('<', "&lt;")
188                        .replace('>', "&gt;");
189                    text_buf.push_str(&c);
190                }
191                Ok(Event::GeneralRef(b)) => {
192                    let b = b.into_inner();
193                    let b = String::from_utf8_lossy(&b);
194                    text_buf.push_str(
195                        &grammars::character_refs::canonicalize_character_reference(&b).unwrap(),
196                    );
197                }
198
199                Ok(Event::Start(s)) => {
200                    depth += 1;
201                    if !text_buf.is_empty() {
202                        writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
203                        text_buf.clear();
204                    }
205                    let s = String::from_utf8_lossy(&s);
206                    let s = grammars::start::canonicalize_start_tag(
207                        &s,
208                        depth,
209                        &mut registered_namespaces,
210                    )
211                    .unwrap();
212                    writer.write_event(Event::Start(BytesStart::new(s.trim())))?;
213                }
214                Ok(Event::End(e)) => {
215                    // Drop all known namespaces at base depth greater than new current
216                    registered_namespaces.remove_depth(depth);
217
218                    depth -= 1;
219
220                    if !text_buf.is_empty() {
221                        writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
222                        text_buf.clear();
223                    }
224                    writer.write_event(Event::End(e))?;
225                }
226
227                Ok(Event::Eof) => break,
228                Err(e) => return Err(e),
229            }
230        }
231
232        Ok(())
233    }
234}