xml_canonicalization/
lib.rs

1#![deny(unsafe_code)]
2#![warn(clippy::pedantic)]
3#![warn(missing_docs)]
4#![doc = include_str!("../README.md")]
5
6use std::{
7    fs::File,
8    io::{BufRead, BufReader, BufWriter, Write},
9    path::Path,
10    sync::Arc,
11};
12
13use namespace::{DepthSensitiveMap, Namespace};
14use quick_xml::{
15    events::{BytesPI, BytesStart, BytesText, Event},
16    Reader, Writer,
17};
18use regex::Regex;
19
20mod grammars;
21mod namespace;
22
/// [`Canonicalizer`]s take XML and return the canonicalised form of that XML.
///
/// A value is created with one of the `read_from_*` constructors, which leave
/// the output unset, and is then given an output by one of the `write_to_*`
/// methods.
pub struct Canonicalizer<R, W> {
    // Source of XML events, wrapping the caller-supplied input `R`.
    reader: Reader<R>,
    // Destination for the output; `None` until a `write_to_*` method sets it.
    writer: Option<Writer<W>>,
}
28
29impl<R, W> Canonicalizer<R, W> {
30    /// Initialize a new [`Canonicalizer`] that reads from the provided
31    /// `reader`.
32    #[must_use]
33    pub fn read_from_reader(reader: R) -> Self {
34        tracing::trace!("Canonicalizer initialising with reader");
35        Self {
36            reader: Reader::from_reader(reader),
37            writer: None,
38        }
39    }
40}
41
42impl<'a, W> Canonicalizer<&'a [u8], W> {
43    /// Initialize a new [`Canonicalizer`] that reads from the provided
44    /// `str`.
45    #[must_use]
46    pub fn read_from_str(str: &'a str) -> Self {
47        tracing::trace!("Canonicalizer initialising from &str");
48        Self {
49            reader: Reader::from_str(str),
50            writer: None,
51        }
52    }
53}
54
55impl<W> Canonicalizer<BufReader<File>, W> {
56    /// Initialize a new [`Canonicalizer`] that reads from the provided
57    /// `str`.
58    ///
59    /// # Errors
60    ///
61    /// Returns an I/O error if the file couldn't be opened and/or read.
62    pub fn read_from_file<P: AsRef<Path>>(path: P) -> Result<Self, Arc<std::io::Error>> {
63        tracing::trace!("Canonicalizer initialising from file");
64        Ok(Self {
65            reader: Reader::from_file(path).map_err(|e| match e {
66                quick_xml::Error::Io(e) => e,
67                _ => unreachable!("no other errors are raised by quick_xml at this point"),
68            })?,
69            writer: None,
70        })
71    }
72}
73
74impl<R, W: Write> Canonicalizer<R, W> {
75    /// Set the writer for this Canonicalizer to a writer.
76    #[must_use]
77    pub fn write_to_writer(mut self, writer: W) -> Self {
78        self.writer = Some(Writer::new(writer));
79        self
80    }
81}
82
83impl<R> Canonicalizer<R, BufWriter<File>> {
84    /// Set the writer for this Canonicalizer to a file.
85    ///
86    /// # Errors
87    ///
88    /// Returns an I/O error if the file couldn't be opened and/or read.
89    pub fn write_to_file<P: AsRef<Path>>(mut self, path: P) -> Result<Self, std::io::Error> {
90        self.writer = Some(Writer::new(BufWriter::new(File::create(path)?)));
91        Ok(self)
92    }
93}
94
95impl<R: BufRead, W: Write> Canonicalizer<R, W> {
96    /// Start canonicalizing the document to the writer.
97    ///
98    /// # Errors
99    ///
100    /// Returns an XML error if the XML parsed is invalid.
101    ///
102    /// # Panics
103    ///
104    /// This will panic is a writer has not been initialised with
105    /// `write_to_string`, `write_to_file`, or `write_to_writer`.
106    pub fn canonicalize(mut self, retain_comments: bool) -> Result<(), quick_xml::Error> {
107        tracing::debug!("Canonicalisation starting…");
108
109        // Set reader config
110        self.reader.config_mut().allow_unmatched_ends = false;
111        self.reader.config_mut().check_end_names = true;
112        self.reader.config_mut().expand_empty_elements = true;
113
114        let mut registered_namespaces = DepthSensitiveMap::new();
115        // Add a default blank namespace.
116        registered_namespaces.insert_at_depth(0, "_", Namespace { url: String::new() });
117        let mut writer = self
118            .writer
119            .expect("trying to canonicalize without a writer initialised");
120        let mut buf = vec![];
121        let mut depth = 0;
122        let mut hit_pi_rule = false;
123        let mut text_buf = String::new();
124        let whitespace_duplicate_regex = Regex::new(r"\n\n*").unwrap();
125        let pi_tidyup_regex = Regex::new(r"^(\S+)(?:\s*( .*)|\s*$)").unwrap();
126        loop {
127            let e = self.reader.read_event_into(&mut buf);
128            tracing::trace!("Event: {e:?}");
129            match e {
130                // The XML declaration and document type declaration (DTD) are removed.
131                Ok(Event::Decl(_) | Event::DocType(_)) => (),
132                // Empty should be unreachable as we've instructed the reader to always expand
133                Ok(Event::Empty(_)) => unreachable!(),
134                Ok(Event::PI(p)) => {
135                    hit_pi_rule = true;
136                    let p = p.into_inner();
137                    let p = String::from_utf8_lossy(&p).into_owned();
138                    let p = pi_tidyup_regex.replace_all(&p, "$1$2").into_owned();
139                    if !text_buf.is_empty() {
140                        writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
141                        text_buf.clear();
142                    }
143                    writer.write_event(Event::PI(BytesPI::new(p)))?;
144                }
145
146                // Remove all comments if needed
147                Ok(Event::Comment(c)) => {
148                    if retain_comments {
149                        if !text_buf.is_empty() {
150                            writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
151                            text_buf.clear();
152                        }
153                        writer.write_event(Event::Comment(c))?;
154                    }
155                }
156
157                Ok(Event::Text(t)) => {
158                    let text = t.into_inner();
159                    let text = String::from_utf8_lossy(&text);
160                    // Normalise whitespace
161                    let text = text.replace("\r\n", "\n");
162                    text_buf.push_str(
163                        &grammars::character_refs::canonicalize_character_references(
164                            &text,
165                            &grammars::character_refs::Situation::Content,
166                        )
167                        .unwrap(),
168                    );
169                    // Remove whitespace outside of the root tag
170                    if depth == 0 {
171                        if hit_pi_rule {
172                            // Remove duplicates only
173                            text_buf = whitespace_duplicate_regex
174                                .replace_all(&text_buf, "\n")
175                                .into_owned();
176                        } else {
177                            text_buf = text_buf.replace('\n', "");
178                        }
179                    }
180                }
181                Ok(Event::CData(c)) => {
182                    let c = c.into_inner();
183                    let c = String::from_utf8_lossy(&c);
184                    let c = c
185                        .replace('&', "&amp;")
186                        .replace('<', "&lt;")
187                        .replace('>', "&gt;");
188                    text_buf.push_str(&c);
189                }
190
191                Ok(Event::Start(s)) => {
192                    depth += 1;
193                    if !text_buf.is_empty() {
194                        writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
195                        text_buf.clear();
196                    }
197                    let s = String::from_utf8_lossy(&s);
198                    let s = grammars::start::canonicalize_start_tag(
199                        &s,
200                        depth,
201                        &mut registered_namespaces,
202                    )
203                    .unwrap();
204                    writer.write_event(Event::Start(BytesStart::new(s.trim())))?;
205                }
206                Ok(Event::End(e)) => {
207                    // Drop all known namespaces at base depth greater than new current
208                    registered_namespaces.remove_depth(depth);
209
210                    depth -= 1;
211
212                    if !text_buf.is_empty() {
213                        writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
214                        text_buf.clear();
215                    }
216                    writer.write_event(Event::End(e))?;
217                }
218
219                Ok(Event::Eof) => break,
220                Err(e) => return Err(e),
221            }
222        }
223
224        Ok(())
225    }
226}