xml_canonicalization/
lib.rs1#![deny(unsafe_code)]
2#![warn(clippy::pedantic)]
3#![warn(missing_docs)]
4#![doc = include_str!("../README.md")]
5
6use std::{
7 fs::File,
8 io::{BufRead, BufReader, BufWriter, Write},
9 path::Path,
10 sync::Arc,
11};
12
13use namespace::{DepthSensitiveMap, Namespace};
14use quick_xml::{
15 events::{BytesPI, BytesStart, BytesText, Event},
16 Reader, Writer,
17};
18use regex::Regex;
19
20mod grammars;
21mod namespace;
22
23pub struct Canonicalizer<R, W> {
25 reader: Reader<R>,
26 writer: Option<Writer<W>>,
27}
28
29impl<R, W> Canonicalizer<R, W> {
30 #[must_use]
33 pub fn read_from_reader(reader: R) -> Self {
34 tracing::trace!("Canonicalizer initialising with reader");
35 Self {
36 reader: Reader::from_reader(reader),
37 writer: None,
38 }
39 }
40}
41
42impl<'a, W> Canonicalizer<&'a [u8], W> {
43 #[must_use]
46 pub fn read_from_str(str: &'a str) -> Self {
47 tracing::trace!("Canonicalizer initialising from &str");
48 Self {
49 reader: Reader::from_str(str),
50 writer: None,
51 }
52 }
53}
54
55impl<W> Canonicalizer<BufReader<File>, W> {
56 pub fn read_from_file<P: AsRef<Path>>(path: P) -> Result<Self, Arc<std::io::Error>> {
63 tracing::trace!("Canonicalizer initialising from file");
64 Ok(Self {
65 reader: Reader::from_file(path).map_err(|e| match e {
66 quick_xml::Error::Io(e) => e,
67 _ => unreachable!("no other errors are raised by quick_xml at this point"),
68 })?,
69 writer: None,
70 })
71 }
72}
73
74impl<R, W: Write> Canonicalizer<R, W> {
75 #[must_use]
77 pub fn write_to_writer(mut self, writer: W) -> Self {
78 self.writer = Some(Writer::new(writer));
79 self
80 }
81}
82
83impl<R> Canonicalizer<R, BufWriter<File>> {
84 pub fn write_to_file<P: AsRef<Path>>(mut self, path: P) -> Result<Self, std::io::Error> {
90 self.writer = Some(Writer::new(BufWriter::new(File::create(path)?)));
91 Ok(self)
92 }
93}
94
95impl<R: BufRead, W: Write> Canonicalizer<R, W> {
96 #[allow(clippy::too_many_lines)]
107 pub fn canonicalize(mut self, retain_comments: bool) -> Result<(), quick_xml::Error> {
108 tracing::debug!("Canonicalisation starting…");
109
110 self.reader.config_mut().allow_unmatched_ends = false;
112 self.reader.config_mut().check_end_names = true;
113 self.reader.config_mut().expand_empty_elements = true;
114
115 let mut registered_namespaces = DepthSensitiveMap::new();
116 registered_namespaces.insert_at_depth(0, "_", Namespace { url: String::new() });
118 let mut writer = self
119 .writer
120 .expect("trying to canonicalize without a writer initialised");
121 let mut buf = vec![];
122 let mut depth = 0;
123 let mut hit_pi_rule = false;
124 let mut text_buf = String::new();
125 let whitespace_duplicate_regex = Regex::new(r"\n\n*").unwrap();
126 let pi_tidyup_regex = Regex::new(r"^(\S+)(?:\s*( .*)|\s*$)").unwrap();
127 loop {
128 let e = self.reader.read_event_into(&mut buf);
129 tracing::trace!("Event: {e:?}");
130 match e {
131 Ok(Event::Decl(_) | Event::DocType(_)) => (),
133 Ok(Event::Empty(_)) => unreachable!(),
135 Ok(Event::PI(p)) => {
136 hit_pi_rule = true;
137 let p = p.into_inner();
138 let p = String::from_utf8_lossy(&p).into_owned();
139 let p = pi_tidyup_regex.replace_all(&p, "$1$2").into_owned();
140 if !text_buf.is_empty() {
141 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
142 text_buf.clear();
143 }
144 writer.write_event(Event::PI(BytesPI::new(p)))?;
145 }
146
147 Ok(Event::Comment(c)) => {
149 if retain_comments {
150 if !text_buf.is_empty() {
151 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
152 text_buf.clear();
153 }
154 writer.write_event(Event::Comment(c))?;
155 }
156 }
157
158 Ok(Event::Text(t)) => {
159 let text = t.into_inner();
160 let text = String::from_utf8_lossy(&text);
161 let text = text.replace("\r\n", "\n");
163 text_buf.push_str(
164 &grammars::character_refs::canonicalize_character_references(
165 &text,
166 &grammars::character_refs::Situation::Content,
167 )
168 .unwrap(),
169 );
170 if depth == 0 {
172 if hit_pi_rule {
173 text_buf = whitespace_duplicate_regex
175 .replace_all(&text_buf, "\n")
176 .into_owned();
177 } else {
178 text_buf = text_buf.replace('\n', "");
179 }
180 }
181 }
182 Ok(Event::CData(c)) => {
183 let c = c.into_inner();
184 let c = String::from_utf8_lossy(&c);
185 let c = c
186 .replace('&', "&")
187 .replace('<', "<")
188 .replace('>', ">");
189 text_buf.push_str(&c);
190 }
191 Ok(Event::GeneralRef(b)) => {
192 let b = b.into_inner();
193 let b = String::from_utf8_lossy(&b);
194 text_buf.push_str(
195 &grammars::character_refs::canonicalize_character_reference(&b).unwrap(),
196 );
197 }
198
199 Ok(Event::Start(s)) => {
200 depth += 1;
201 if !text_buf.is_empty() {
202 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
203 text_buf.clear();
204 }
205 let s = String::from_utf8_lossy(&s);
206 let s = grammars::start::canonicalize_start_tag(
207 &s,
208 depth,
209 &mut registered_namespaces,
210 )
211 .unwrap();
212 writer.write_event(Event::Start(BytesStart::new(s.trim())))?;
213 }
214 Ok(Event::End(e)) => {
215 registered_namespaces.remove_depth(depth);
217
218 depth -= 1;
219
220 if !text_buf.is_empty() {
221 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
222 text_buf.clear();
223 }
224 writer.write_event(Event::End(e))?;
225 }
226
227 Ok(Event::Eof) => break,
228 Err(e) => return Err(e),
229 }
230 }
231
232 Ok(())
233 }
234}