xml/reader/config.rs
1//! Contains parser configuration structure.
2use std::collections::HashMap;
3use std::io::Read;
4
5use crate::reader::EventReader;
6use crate::util::Encoding;
7
/// Default cap on the length of the string that custom entities may expand
/// to. Part of the defence against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;

/// Default cap on how many times entities may expand into other entities.
/// Part of the defence against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
11
12/// Parser configuration structure. **There are more config methods than public fileds — see methods below**.
13///
14/// This structure contains various configuration options which affect
15/// behavior of the parser.
16#[derive(Clone, PartialEq, Eq, Debug)]
17#[non_exhaustive]
18pub struct ParserConfig {
19 /// Whether or not should whitespace in textual events be removed. Default is false.
20 ///
21 /// When true, all standalone whitespace will be removed (this means no
22 /// `Whitespace` events will be emitted), and leading and trailing whitespace
23 /// from `Character` events will be deleted. If after trimming `Characters`
24 /// event will be empty, it will also be omitted from output stream. This is
25 /// possible, however, only if `whitespace_to_characters` or
26 /// `cdata_to_characters` options are set.
27 ///
28 /// This option does not affect CDATA events, unless `cdata_to_characters`
29 /// option is also set. In that case CDATA content will also be trimmed.
30 pub trim_whitespace: bool,
31
32 /// Whether or not should whitespace be converted to characters.
33 /// Default is false.
34 ///
35 /// If true, instead of `Whitespace` events `Characters` events with the
36 /// same content will be emitted. If `trim_whitespace` is also true, these
37 /// events will be trimmed to nothing and, consequently, not emitted.
38 pub whitespace_to_characters: bool,
39
40 /// Whether or not should CDATA be converted to characters.
41 /// Default is false.
42 ///
43 /// If true, instead of `CData` events `Characters` events with the same
44 /// content will be emitted. If `trim_whitespace` is also true, these events
45 /// will be trimmed. If corresponding CDATA contained nothing but whitespace,
46 /// this event will be omitted from the stream.
47 pub cdata_to_characters: bool,
48
49 /// Whether or not should comments be omitted. Default is true.
50 ///
51 /// If true, `Comment` events will not be emitted at all.
52 pub ignore_comments: bool,
53
54 /// Whether or not should sequential `Characters` events be merged.
55 /// Default is true.
56 ///
57 /// If true, multiple sequential `Characters` events will be merged into
58 /// a single event, that is, their data will be concatenated.
59 ///
60 /// Multiple sequential `Characters` events are only possible if either
61 /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character
62 /// events will always be separated by other events.
63 pub coalesce_characters: bool,
64
65 /// A map of extra entities recognized by the parser. Default is an empty map.
66 ///
67 /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
68 /// however, it is convenient to make the parser recognize additional entities which
69 /// are also not available through the DTD definitions (especially given that at the moment
70 /// DTD parsing is not supported).
71 pub extra_entities: HashMap<String, String>,
72
73 /// Whether or not the parser should ignore the end of stream. Default is false.
74 ///
75 /// By default the parser will either error out when it encounters a premature end of
76 /// stream or complete normally if the end of stream was expected. If you want to continue
77 /// reading from a stream whose input is supplied progressively, you can set this option to true.
78 /// In this case the parser will allow you to invoke the `next()` method even if a supposed end
79 /// of stream has happened.
80 ///
81 /// Note that support for this functionality is incomplete; for example, the parser will fail if
82 /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
83 pub ignore_end_of_stream: bool,
84
85 /// Whether or not non-unicode entity references get replaced with the replacement character
86 ///
87 /// When true, any decimal or hexadecimal character reference that cannot be converted from a
88 /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
89 /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
90 pub replace_unknown_entity_references: bool,
91
92 /// Whether or not whitespace at the root level of the document is ignored. Default is true.
93 ///
94 /// By default any whitespace that is not enclosed within at least one level of elements will be
95 /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
96 pub ignore_root_level_whitespace: bool,
97
98 /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
99 pub override_encoding: Option<Encoding>,
100
101 /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
102 /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
103 pub ignore_invalid_encoding_declarations: bool,
104
105 /// Documents with multiple root elements are ill-formed
106 pub allow_multiple_root_elements: bool,
107
108 /// Abort if custom entities create a string longer than this
109 pub max_entity_expansion_length: usize,
110
111 /// Entities can expand into other entities this many times (be careful about exponential cost!)
112 pub max_entity_expansion_depth: u8,
113
114 /// Maximum length of tag name or attribute name
115 pub max_name_length: usize,
116
117 /// Max number of attributes per element
118 pub max_attributes: usize,
119
120 /// Max number of bytes in each attribute
121 pub max_attribute_length: usize,
122
123 /// Maximum length of strings reprsenting characters, comments, and processing instructions
124 pub max_data_length: usize,
125}
126
127impl ParserConfig {
128 /// Returns a new config with default values.
129 ///
130 /// You can tweak default values using builder-like pattern:
131 ///
132 /// ```rust
133 /// use xml::reader::ParserConfig;
134 ///
135 /// let config = ParserConfig::new()
136 /// .trim_whitespace(true)
137 /// .ignore_comments(true)
138 /// .coalesce_characters(false);
139 /// ```
140 #[must_use]
141 #[inline]
142 pub fn new() -> Self {
143 Self {
144 trim_whitespace: false,
145 whitespace_to_characters: false,
146 cdata_to_characters: false,
147 ignore_comments: true,
148 coalesce_characters: true,
149 extra_entities: HashMap::new(),
150 ignore_end_of_stream: false,
151 replace_unknown_entity_references: false,
152 ignore_root_level_whitespace: true,
153
154 override_encoding: None,
155 ignore_invalid_encoding_declarations: false,
156 allow_multiple_root_elements: true,
157 max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
158 max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
159 max_attributes: 1 << 16,
160 max_attribute_length: 1 << 30,
161 max_data_length: 1 << 30,
162 max_name_length: 1 << 18,
163 }
164 }
165
166 /// Creates an XML reader with this configuration. The reader should be wrapped in a `BufReader`, otherwise parsing may be very slow.
167 ///
168 /// This is a convenience method for configuring and creating a reader at the same time:
169 ///
170 /// ```rust
171 /// use xml::reader::ParserConfig;
172 ///
173 /// let mut source: &[u8] = b"...";
174 ///
175 /// let reader = ParserConfig::new()
176 /// .trim_whitespace(true)
177 /// .ignore_comments(true)
178 /// .coalesce_characters(false)
179 /// .create_reader(&mut source);
180 /// ```
181 ///
182 /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
183 /// this configuration object.
184 #[inline]
185 pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
186 EventReader::new_with_config(source, self)
187 }
188
189 /// Adds a new entity mapping and returns an updated config object.
190 ///
191 /// This is a convenience method for adding external entities mappings to the XML parser.
192 /// An example:
193 ///
194 /// ```rust
195 /// use xml::reader::ParserConfig;
196 ///
197 /// let mut source: &[u8] = b"...";
198 ///
199 /// let reader = ParserConfig::new()
200 /// .add_entity("nbsp", " ")
201 /// .add_entity("copy", "©")
202 /// .add_entity("reg", "®")
203 /// .create_reader(&mut source);
204 /// ```
205 #[must_use]
206 pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> Self {
207 self.extra_entities.insert(entity.into(), value.into());
208 self
209 }
210}
211
212gen_setters! { ParserConfig,
213 trim_whitespace: val bool,
214 whitespace_to_characters: val bool,
215 cdata_to_characters: val bool,
216 ignore_comments: val bool,
217 coalesce_characters: val bool,
218 ignore_end_of_stream: val bool,
219 replace_unknown_entity_references: val bool,
220 /// Whether or not whitespace at the root level of the document is ignored. Default is true.
221 ignore_root_level_whitespace: val bool
222}
223
224impl Default for ParserConfig {
225 fn default() -> Self {
226 Self::new()
227 }
228}
229
230impl ParserConfig {
231 /// Read character encoding from `Content-Type` header.
232 /// Set this when parsing XML documents fetched over HTTP.
233 ///
234 /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
235 #[must_use]
236 pub fn content_type(mut self, mime_type: &str) -> Self {
237 let charset = mime_type.split_once(';')
238 .and_then(|(_, args)| args.split_once("charset"))
239 .and_then(|(_, args)| args.split_once('='));
240 if let Some((_, charset)) = charset {
241 let name = charset.trim().trim_matches('"');
242 if let Ok(enc) = name.parse() {
243 self.override_encoding = Some(enc);
244 }
245 }
246 self
247 }
248}
249
250gen_setters! { ParserConfig,
251 /// Set if you got one in the HTTP header
252 override_encoding: val Option<Encoding>,
253 /// Allows invalid documents. There should be only a single root element in XML.
254 allow_multiple_root_elements: val bool,
255 /// Abort if custom entities create a string longer than this
256 max_entity_expansion_length: val usize,
257 /// Entities can expand into other entities this many times (be careful about exponential cost!)
258 max_entity_expansion_depth: val u8,
259 /// Max number of attributes per element
260 max_attributes: val usize,
261 /// Maximum length of tag name or attribute name
262 max_name_length: val usize,
263 /// Max number of bytes in each attribute
264 max_attribute_length: val usize,
265 /// Maximum length of strings reprsenting characters, comments, and processing instructions
266 max_data_length: val usize,
267 /// Allow `<?xml encoding="bogus"?>`
268 ignore_invalid_encoding_declarations: val bool
269}
270
/// `content_type` must pick the charset out of a `Content-Type` value, and the
/// result must survive being combined with other setters in either order.
#[test]
fn mime_parse() {
    // Mixed-case encoding name directly after the separator; another setter follows.
    let ascii_config = ParserConfig::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    let ascii_expected = Some(Encoding::Ascii);
    assert_eq!(ascii_config.override_encoding, ascii_expected);

    // Spaces around `=` and a quoted value; another setter comes first.
    let utf16_config = ParserConfig::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    let utf16_expected = Some(Encoding::Utf16);
    assert_eq!(utf16_config.override_encoding, utf16_expected);
}