xml_canonicalization/
lib.rs1#![deny(unsafe_code)]
2#![warn(clippy::pedantic)]
3#![warn(missing_docs)]
4#![doc = include_str!("../README.md")]
5
6use std::{
7 fs::File,
8 io::{BufRead, BufReader, BufWriter, Write},
9 path::Path,
10 sync::Arc,
11};
12
13use namespace::{DepthSensitiveMap, Namespace};
14use quick_xml::{
15 events::{BytesPI, BytesStart, BytesText, Event},
16 Reader, Writer,
17};
18use regex::Regex;
19
20mod grammars;
21mod namespace;
22
/// Streaming XML canonicalizer.
///
/// Pairs a [`quick_xml::Reader`] over the input `R` with an optional
/// [`quick_xml::Writer`] over the output `W`. The constructors in this file
/// create the value with no writer attached; a writer has to be supplied
/// through one of the `write_to_*` methods before canonicalizing.
pub struct Canonicalizer<R, W> {
    /// Source of XML events.
    reader: Reader<R>,
    /// Destination for the canonical output; `None` until a writer is attached.
    writer: Option<Writer<W>>,
}
28
29impl<R, W> Canonicalizer<R, W> {
30 #[must_use]
33 pub fn read_from_reader(reader: R) -> Self {
34 tracing::trace!("Canonicalizer initialising with reader");
35 Self {
36 reader: Reader::from_reader(reader),
37 writer: None,
38 }
39 }
40}
41
42impl<'a, W> Canonicalizer<&'a [u8], W> {
43 #[must_use]
46 pub fn read_from_str(str: &'a str) -> Self {
47 tracing::trace!("Canonicalizer initialising from &str");
48 Self {
49 reader: Reader::from_str(str),
50 writer: None,
51 }
52 }
53}
54
55impl<W> Canonicalizer<BufReader<File>, W> {
56 pub fn read_from_file<P: AsRef<Path>>(path: P) -> Result<Self, Arc<std::io::Error>> {
63 tracing::trace!("Canonicalizer initialising from file");
64 Ok(Self {
65 reader: Reader::from_file(path).map_err(|e| match e {
66 quick_xml::Error::Io(e) => e,
67 _ => unreachable!("no other errors are raised by quick_xml at this point"),
68 })?,
69 writer: None,
70 })
71 }
72}
73
74impl<R, W: Write> Canonicalizer<R, W> {
75 #[must_use]
77 pub fn write_to_writer(mut self, writer: W) -> Self {
78 self.writer = Some(Writer::new(writer));
79 self
80 }
81}
82
83impl<R> Canonicalizer<R, BufWriter<File>> {
84 pub fn write_to_file<P: AsRef<Path>>(mut self, path: P) -> Result<Self, std::io::Error> {
90 self.writer = Some(Writer::new(BufWriter::new(File::create(path)?)));
91 Ok(self)
92 }
93}
94
95impl<R: BufRead, W: Write> Canonicalizer<R, W> {
96 pub fn canonicalize(mut self, retain_comments: bool) -> Result<(), quick_xml::Error> {
107 tracing::debug!("Canonicalisation starting…");
108
109 self.reader.config_mut().allow_unmatched_ends = false;
111 self.reader.config_mut().check_end_names = true;
112 self.reader.config_mut().expand_empty_elements = true;
113
114 let mut registered_namespaces = DepthSensitiveMap::new();
115 registered_namespaces.insert_at_depth(0, "_", Namespace { url: String::new() });
117 let mut writer = self
118 .writer
119 .expect("trying to canonicalize without a writer initialised");
120 let mut buf = vec![];
121 let mut depth = 0;
122 let mut hit_pi_rule = false;
123 let mut text_buf = String::new();
124 let whitespace_duplicate_regex = Regex::new(r"\n\n*").unwrap();
125 let pi_tidyup_regex = Regex::new(r"^(\S+)(?:\s*( .*)|\s*$)").unwrap();
126 loop {
127 let e = self.reader.read_event_into(&mut buf);
128 tracing::trace!("Event: {e:?}");
129 match e {
130 Ok(Event::Decl(_) | Event::DocType(_)) => (),
132 Ok(Event::Empty(_)) => unreachable!(),
134 Ok(Event::PI(p)) => {
135 hit_pi_rule = true;
136 let p = p.into_inner();
137 let p = String::from_utf8_lossy(&p).into_owned();
138 let p = pi_tidyup_regex.replace_all(&p, "$1$2").into_owned();
139 if !text_buf.is_empty() {
140 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
141 text_buf.clear();
142 }
143 writer.write_event(Event::PI(BytesPI::new(p)))?;
144 }
145
146 Ok(Event::Comment(c)) => {
148 if retain_comments {
149 if !text_buf.is_empty() {
150 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
151 text_buf.clear();
152 }
153 writer.write_event(Event::Comment(c))?;
154 }
155 }
156
157 Ok(Event::Text(t)) => {
158 let text = t.into_inner();
159 let text = String::from_utf8_lossy(&text);
160 let text = text.replace("\r\n", "\n");
162 text_buf.push_str(
163 &grammars::character_refs::canonicalize_character_references(
164 &text,
165 &grammars::character_refs::Situation::Content,
166 )
167 .unwrap(),
168 );
169 if depth == 0 {
171 if hit_pi_rule {
172 text_buf = whitespace_duplicate_regex
174 .replace_all(&text_buf, "\n")
175 .into_owned();
176 } else {
177 text_buf = text_buf.replace('\n', "");
178 }
179 }
180 }
181 Ok(Event::CData(c)) => {
182 let c = c.into_inner();
183 let c = String::from_utf8_lossy(&c);
184 let c = c
185 .replace('&', "&")
186 .replace('<', "<")
187 .replace('>', ">");
188 text_buf.push_str(&c);
189 }
190
191 Ok(Event::Start(s)) => {
192 depth += 1;
193 if !text_buf.is_empty() {
194 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
195 text_buf.clear();
196 }
197 let s = String::from_utf8_lossy(&s);
198 let s = grammars::start::canonicalize_start_tag(
199 &s,
200 depth,
201 &mut registered_namespaces,
202 )
203 .unwrap();
204 writer.write_event(Event::Start(BytesStart::new(s.trim())))?;
205 }
206 Ok(Event::End(e)) => {
207 registered_namespaces.remove_depth(depth);
209
210 depth -= 1;
211
212 if !text_buf.is_empty() {
213 writer.write_event(Event::Text(BytesText::from_escaped(&text_buf)))?;
214 text_buf.clear();
215 }
216 writer.write_event(Event::End(e))?;
217 }
218
219 Ok(Event::Eof) => break,
220 Err(e) => return Err(e),
221 }
222 }
223
224 Ok(())
225 }
226}