xml_no_std/reader/config.rs
1//! Contains parser configuration structure.
2extern crate alloc;
3
4use alloc::collections::BTreeMap;
5use alloc::string::String;
6
7use crate::reader::EventReader;
8use crate::util::Encoding;
9
/// Default upper bound on the length of the string that custom entities may expand to.
///
/// One of the limits that defend against the "billion laughs" attack. It is the
/// default value of `ParserConfig2::max_entity_expansion_length`.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default upper bound on how many times entities may expand into other entities.
///
/// Also part of the "billion laughs" defence. It is the default value of
/// `ParserConfig2::max_entity_expansion_depth`.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
13
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// The options collected here influence how the parser turns its input into
/// events.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParserConfig {
    /// Whether whitespace in textual events should be removed. Default is false.
    ///
    /// When true, standalone whitespace is dropped entirely (no `Whitespace`
    /// events are emitted), and leading and trailing whitespace is stripped
    /// from `Characters` events. A `Characters` event that ends up empty after
    /// trimming is omitted from the output stream as well; that can only happen
    /// when `whitespace_to_characters` or `cdata_to_characters` is set.
    ///
    /// CDATA events are left untouched unless `cdata_to_characters` is also
    /// set, in which case the CDATA content is trimmed too.
    pub trim_whitespace: bool,

    /// Whether whitespace should be converted to characters.
    /// Default is false.
    ///
    /// When true, `Characters` events with the same content are emitted in
    /// place of `Whitespace` events. Combined with `trim_whitespace`, those
    /// events are trimmed down to nothing and therefore not emitted at all.
    pub whitespace_to_characters: bool,

    /// Whether CDATA should be converted to characters.
    /// Default is false.
    ///
    /// When true, `Characters` events with the same content are emitted in
    /// place of `CData` events. Combined with `trim_whitespace`, those events
    /// are trimmed, and a CDATA section holding only whitespace produces no
    /// event at all.
    pub cdata_to_characters: bool,

    /// Whether comments should be omitted. Default is true.
    ///
    /// When true, no `Comment` events are emitted.
    pub ignore_comments: bool,

    /// Whether sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// When true, a run of consecutive `Characters` events is collapsed into
    /// one event whose data is the concatenation of the individual pieces.
    ///
    /// Consecutive `Characters` events can only occur when either
    /// `cdata_to_characters` or `ignore_comments` is set; otherwise some other
    /// event always sits between two character events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// Out of the box the parser knows the entities defined by the XML spec.
    /// This map lets you register additional ones, which is handy for entities
    /// that are not available through DTD definitions (especially since DTD
    /// parsing is not supported at the moment).
    pub extra_entities: BTreeMap<String, String>,

    /// Whether the parser should ignore the end of stream. Default is false.
    ///
    /// Normally the parser reports an error on a premature end of stream and
    /// finishes cleanly on an expected one. Set this option to true when the
    /// input arrives progressively and you want to keep reading: the parser
    /// will then let you call `next()` even after a supposed end of stream.
    ///
    /// Support for this is incomplete; for instance, the parser fails when the
    /// premature end of stream falls inside PCDATA. Use at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether non-unicode entity references get replaced with the replacement character.
    /// Default is false.
    ///
    /// When true, a decimal or hexadecimal character reference whose value
    /// cannot be converted from a `u32` to a `char` with
    /// [`char::from_u32`](https://doc.rust-lang.org/core/char/fn.from_u32.html)
    /// is turned into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether whitespace at the root level of the document is ignored. Default is true.
    ///
    /// By default, whitespace that is not enclosed in at least one element is
    /// ignored. Set this to false to have root-level whitespace events emitted.
    ///
    /// **There are more configuration options – see methods below**
    pub ignore_root_level_whitespace: bool,
}
101
102impl ParserConfig {
103 /// Returns a new config with default values.
104 ///
105 /// You can tweak default values using builder-like pattern:
106 ///
107 /// ```rust
108 /// use xml_no_std::reader::ParserConfig;
109 ///
110 /// let config = ParserConfig::new()
111 /// .trim_whitespace(true)
112 /// .ignore_comments(true)
113 /// .coalesce_characters(false);
114 /// ```
115 #[must_use]
116 #[inline]
117 pub fn new() -> ParserConfig {
118 ParserConfig {
119 trim_whitespace: false,
120 whitespace_to_characters: false,
121 cdata_to_characters: false,
122 ignore_comments: true,
123 coalesce_characters: true,
124 extra_entities: BTreeMap::new(),
125 ignore_end_of_stream: false,
126 replace_unknown_entity_references: false,
127 ignore_root_level_whitespace: true,
128 }
129 }
130
131 /// Creates an XML reader with this configuration.
132 ///
133 /// This is a convenience method for configuring and creating a reader at the same time:
134 ///
135 /// ```rust
136 /// use xml_no_std::reader::ParserConfig;
137 ///
138 /// let mut source: &[u8] = b"...";
139 ///
140 /// let reader = ParserConfig::new()
141 /// .trim_whitespace(true)
142 /// .ignore_comments(true)
143 /// .coalesce_characters(false)
144 /// .create_reader(source.into_iter());
145 /// ```
146 ///
147 /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
148 /// this configuration object.
149 #[inline]
150 pub fn create_reader<'a, S: Iterator<Item = &'a u8>>(self, source: S) -> EventReader<'a, S> {
151 EventReader::new_with_config(source, self)
152 }
153
154 /// Adds a new entity mapping and returns an updated config object.
155 ///
156 /// This is a convenience method for adding external entities mappings to the XML parser.
157 /// An example:
158 ///
159 /// ```rust
160 /// use xml_no_std::reader::ParserConfig;
161 ///
162 /// let mut source: &[u8] = b"...";
163 ///
164 /// let reader = ParserConfig::new()
165 /// .add_entity("nbsp", " ")
166 /// .add_entity("copy", "©")
167 /// .add_entity("reg", "®")
168 /// .create_reader(source.into_iter());
169 /// ```
170 #[must_use]
171 pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig {
172 self.extra_entities.insert(entity.into(), value.into());
173 self
174 }
175}
176
177impl Default for ParserConfig {
178 #[inline]
179 fn default() -> ParserConfig {
180 ParserConfig::new()
181 }
182}
183
// Generates one setter method on `ParserConfig` per field listed below. The setters
// are used builder-style, e.g. `ParserConfig::new().trim_whitespace(true)`.
// NOTE(review): `gen_setters!` is not defined in this part of the source; `val`
// presumably means "store the argument in the same-named field and return the
// config" — confirm against the macro definition.
gen_setters! { ParserConfig,
    trim_whitespace: val bool,
    whitespace_to_characters: val bool,
    cdata_to_characters: val bool,
    ignore_comments: val bool,
    coalesce_characters: val bool,
    ignore_end_of_stream: val bool,
    replace_unknown_entity_references: val bool,
    ignore_root_level_whitespace: val bool
}
194
195/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
196#[derive(Clone, PartialEq, Eq, Debug)]
197#[non_exhaustive]
198pub struct ParserConfig2 {
199 pub(crate) c: ParserConfig,
200
201 /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
202 pub override_encoding: Option<Encoding>,
203
204 /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
205 /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
206 pub ignore_invalid_encoding_declarations: bool,
207
208 /// Documents with multiple root elements are ill-formed
209 pub allow_multiple_root_elements: bool,
210
211 /// Abort if custom entities create a string longer than this
212 pub max_entity_expansion_length: usize,
213 /// Entities can expand into other entities this many times (be careful about exponential cost!)
214 pub max_entity_expansion_depth: u8,
215
216 /// Maximum length of tag name or attribute name
217 pub max_name_length: usize,
218
219 /// Max number of attributes per element
220 pub max_attributes: usize,
221
222 /// Max number of bytes in each attribute
223 pub max_attribute_length: usize,
224
225 /// Maximum length of strings reprsenting characters, comments, and processing instructions
226 pub max_data_length: usize,
227}
228
229impl Default for ParserConfig2 {
230 fn default() -> Self {
231 ParserConfig2 {
232 c: ParserConfig::default(),
233 override_encoding: None,
234 ignore_invalid_encoding_declarations: false,
235 allow_multiple_root_elements: true,
236 max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
237 max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
238 max_attributes: 1<<16,
239 max_attribute_length: 1<<30,
240 max_data_length: 1<<30,
241 max_name_length: 1<<18,
242 }
243 }
244}
245
246impl ParserConfig2 {
247 /// Create extended configuration struct
248 #[inline]
249 #[must_use]
250 pub fn new() -> Self {
251 Self::default()
252 }
253
254 /// Read character encoding from `Content-Type` header.
255 /// Set this when parsing XML documents fetched over HTTP.
256 ///
257 /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
258 #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
259 let charset = mime_type.split_once(';')
260 .and_then(|(_, args)| args.split_once("charset"))
261 .and_then(|(_, args)| args.split_once('='));
262 if let Some((_, charset)) = charset {
263 let name = charset.trim().trim_matches('"');
264 if let Ok(enc) = name.parse() {
265 self.override_encoding = Some(enc);
266 }
267 }
268 self
269 }
270
271 /// Creates an XML reader with this configuration.
272 ///
273 /// This is a convenience method for configuring and creating a reader at the same time:
274 ///
275 /// ```rust
276 /// use xml_no_std::reader::ParserConfig;
277 ///
278 /// let mut source: &[u8] = b"...";
279 ///
280 /// let reader = ParserConfig::new()
281 /// .trim_whitespace(true)
282 /// .ignore_comments(true)
283 /// .coalesce_characters(false)
284 /// .create_reader(source.into_iter());
285 /// ```
286 ///
287 /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
288 /// this configuration object.
289 #[inline]
290 pub fn create_reader<'a, S: Iterator<Item = &'a u8>>(self, source: S) -> EventReader<'a, S> {
291 EventReader::new_with_config(source, self)
292 }
293}
294
295impl From<ParserConfig> for ParserConfig2 {
296 #[inline]
297 fn from(c: ParserConfig) -> Self {
298 Self {
299 c,
300 ..Default::default()
301 }
302 }
303}
304
// Generates one setter method on `ParserConfig2` per field listed below. The setters
// chain builder-style, e.g. `ParserConfig2::new().max_entity_expansion_depth(3)`.
// NOTE(review): `gen_setters!` is not defined in this part of the source; `val`
// presumably means "store the argument in the same-named field and return the
// config" — confirm against the macro definition.
gen_setters! { ParserConfig2,
    /// Set if you got one in the HTTP header
    override_encoding: val Option<Encoding>,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: val bool,
    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: val usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: val u8,
    /// Max number of attributes per element
    max_attributes: val usize,
    /// Maximum length of tag name or attribute name
    max_name_length: val usize,
    /// Max number of bytes in each attribute
    max_attribute_length: val usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: val usize,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: val bool
}
325
// Generates methods on `ParserConfig` named after the `ParserConfig2` fields (and its
// `content_type` method) listed below.
// NOTE(review): `gen_setters!` is not defined in this part of the source; `c2`
// presumably converts the basic config into a `ParserConfig2` and forwards to the
// same-named method there — confirm against the macro definition.
gen_setters! { ParserConfig,
    /// Set if you got one in the HTTP header (see `content_type`)
    override_encoding: c2 Option<Encoding>,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: c2 bool,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: c2 bool,

    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: c2 usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: c2 u8,
    /// Max number of attributes per element
    max_attributes: c2 usize,
    /// Maximum length of tag name or attribute name
    max_name_length: c2 usize,
    /// Max number of bytes in each attribute
    max_attribute_length: c2 usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: c2 usize,

    /// Set encoding from the MIME type. Important for HTTP compatibility.
    content_type: c2 &str
}
350
// Generates methods on `ParserConfig2` named after the basic `ParserConfig` options
// listed below.
// NOTE(review): `gen_setters!` is not defined in this part of the source; `delegate`
// presumably writes the value into the wrapped `c: ParserConfig` — confirm against
// the macro definition.
gen_setters! { ParserConfig2,
    trim_whitespace: delegate bool,
    whitespace_to_characters: delegate bool,
    cdata_to_characters: delegate bool,
    ignore_comments: delegate bool,
    coalesce_characters: delegate bool,
    ignore_end_of_stream: delegate bool,
    replace_unknown_entity_references: delegate bool,
    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
    ignore_root_level_whitespace: delegate bool
}
362
/// `content_type` must pick up the charset regardless of letter case, spacing around
/// `=`, quoting, or where it sits in the builder chain.
#[test]
fn mime_parse() {
    let ascii = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii.override_encoding, Some(Encoding::Ascii));

    let utf16 = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(utf16.override_encoding, Some(Encoding::Utf16));
}