xml_no_std/reader/
config.rs

1//! Contains parser configuration structure.
2extern crate alloc;
3
4use alloc::collections::BTreeMap;
5use alloc::string::String;
6
7use crate::reader::EventReader;
8use crate::util::Encoding;
9
/// Default cap on the length of the text produced by entity expansion.
/// One of the limits that defend against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default cap on how many times entities may expand into other entities.
/// One of the limits that defend against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
13
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// behavior of the parser.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
    /// Whether or not whitespace in textual events should be removed. Default is false.
    ///
    /// When true, all standalone whitespace will be removed (this means no
    /// `Whitespace` events will be emitted), and leading and trailing whitespace
    /// from `Characters` events will be deleted. If after trimming a `Characters`
    /// event is empty, it will also be omitted from the output stream. This is
    /// possible, however, only if the `whitespace_to_characters` or
    /// `cdata_to_characters` options are set.
    ///
    /// This option does not affect CDATA events, unless the `cdata_to_characters`
    /// option is also set. In that case CDATA content will also be trimmed.
    pub trim_whitespace: bool,

    /// Whether or not whitespace should be converted to characters.
    /// Default is false.
    ///
    /// If true, instead of `Whitespace` events `Characters` events with the
    /// same content will be emitted. If `trim_whitespace` is also true, these
    /// events will be trimmed to nothing and, consequently, not emitted.
    pub whitespace_to_characters: bool,

    /// Whether or not CDATA should be converted to characters.
    /// Default is false.
    ///
    /// If true, instead of `CData` events `Characters` events with the same
    /// content will be emitted. If `trim_whitespace` is also true, these events
    /// will be trimmed. If the corresponding CDATA contained nothing but whitespace,
    /// this event will be omitted from the stream.
    pub cdata_to_characters: bool,

    /// Whether or not comments should be omitted. Default is true.
    ///
    /// If true, `Comment` events will not be emitted at all.
    pub ignore_comments: bool,

    /// Whether or not sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// If true, multiple sequential `Characters` events will be merged into
    /// a single event, that is, their data will be concatenated.
    ///
    /// Multiple sequential `Characters` events are only possible if either
    /// `cdata_to_characters` or `ignore_comments` are set. Otherwise character
    /// events will always be separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
    /// however, it is convenient to make the parser recognize additional entities which
    /// are also not available through the DTD definitions (especially given that at the moment
    /// DTD parsing is not supported).
    pub extra_entities: BTreeMap<String, String>,

    /// Whether or not the parser should ignore the end of stream. Default is false.
    ///
    /// By default the parser will either error out when it encounters a premature end of
    /// stream or complete normally if the end of stream was expected. If you want to continue
    /// reading from a stream whose input is supplied progressively, you can set this option to true.
    /// In this case the parser will allow you to invoke the `next()` method even if a supposed end
    /// of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example, the parser will fail if
    /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether or not non-unicode entity references get replaced with the replacement character.
    /// Default is false.
    ///
    /// When true, any decimal or hexadecimal character reference that cannot be converted from a
    /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
    /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
    ///
    /// By default any whitespace that is not enclosed within at least one level of elements will be
    /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
    ///
    /// **There are more configuration options – see methods below**
    pub ignore_root_level_whitespace: bool,
}
101
102impl ParserConfig {
103    /// Returns a new config with default values.
104    ///
105    /// You can tweak default values using builder-like pattern:
106    ///
107    /// ```rust
108    /// use xml_no_std::reader::ParserConfig;
109    ///
110    /// let config = ParserConfig::new()
111    ///     .trim_whitespace(true)
112    ///     .ignore_comments(true)
113    ///     .coalesce_characters(false);
114    /// ```
115    #[must_use]
116    #[inline]
117    pub fn new() -> ParserConfig {
118        ParserConfig {
119            trim_whitespace: false,
120            whitespace_to_characters: false,
121            cdata_to_characters: false,
122            ignore_comments: true,
123            coalesce_characters: true,
124            extra_entities: BTreeMap::new(),
125            ignore_end_of_stream: false,
126            replace_unknown_entity_references: false,
127            ignore_root_level_whitespace: true,
128        }
129    }
130
131    /// Creates an XML reader with this configuration.
132    ///
133    /// This is a convenience method for configuring and creating a reader at the same time:
134    ///
135    /// ```rust
136    /// use xml_no_std::reader::ParserConfig;
137    ///
138    /// let mut source: &[u8] = b"...";
139    ///
140    /// let reader = ParserConfig::new()
141    ///     .trim_whitespace(true)
142    ///     .ignore_comments(true)
143    ///     .coalesce_characters(false)
144    ///     .create_reader(source.into_iter());
145    /// ```
146    ///
147    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
148    /// this configuration object.
149    #[inline]
150    pub fn create_reader<'a, S: Iterator<Item = &'a u8>>(self, source: S) -> EventReader<'a, S> {
151        EventReader::new_with_config(source, self)
152    }
153
154    /// Adds a new entity mapping and returns an updated config object.
155    ///
156    /// This is a convenience method for adding external entities mappings to the XML parser.
157    /// An example:
158    ///
159    /// ```rust
160    /// use xml_no_std::reader::ParserConfig;
161    ///
162    /// let mut source: &[u8] = b"...";
163    ///
164    /// let reader = ParserConfig::new()
165    ///     .add_entity("nbsp", " ")
166    ///     .add_entity("copy", "©")
167    ///     .add_entity("reg", "®")
168    ///     .create_reader(source.into_iter());
169    /// ```
170    #[must_use]
171    pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> ParserConfig {
172        self.extra_entities.insert(entity.into(), value.into());
173        self
174    }
175}
176
177impl Default for ParserConfig {
178    #[inline]
179    fn default() -> ParserConfig {
180        ParserConfig::new()
181    }
182}
183
// Setter declarations for the boolean options stored directly in `ParserConfig`.
// Each entry gives a field name, the mode keyword `val`, and the value type.
// NOTE(review): `gen_setters!` is not defined in the visible part of this file;
// presumably each entry becomes a chainable, by-value setter method named after
// the field (as used in the `ParserConfig::new()` doc example) — confirm against
// the macro definition.
gen_setters! { ParserConfig,
    trim_whitespace: val bool,
    whitespace_to_characters: val bool,
    cdata_to_characters: val bool,
    ignore_comments: val bool,
    coalesce_characters: val bool,
    ignore_end_of_stream: val bool,
    replace_unknown_entity_references: val bool,
    ignore_root_level_whitespace: val bool
}
194
/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub struct ParserConfig2 {
    /// The original `ParserConfig` options that this struct extends (crate-internal).
    pub(crate) c: ParserConfig,

    /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
    /// Default is `None`.
    pub override_encoding: Option<Encoding>,

    /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
    /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
    /// Default is false.
    pub ignore_invalid_encoding_declarations: bool,

    /// Whether documents with multiple root elements are allowed.
    /// Documents with multiple root elements are ill-formed. Default is true.
    pub allow_multiple_root_elements: bool,

    /// Abort if custom entities create a string longer than this.
    /// Default is 1,000,000.
    pub max_entity_expansion_length: usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!).
    /// Default is 10.
    pub max_entity_expansion_depth: u8,

    /// Maximum length of tag name or attribute name. Default is `1 << 18`.
    pub max_name_length: usize,

    /// Max number of attributes per element. Default is `1 << 16`.
    pub max_attributes: usize,

    /// Max number of bytes in each attribute. Default is `1 << 30`.
    pub max_attribute_length: usize,

    /// Maximum length of strings representing characters, comments, and processing instructions.
    /// Default is `1 << 30`.
    pub max_data_length: usize,
}
228
229impl Default for ParserConfig2 {
230    fn default() -> Self {
231        ParserConfig2 {
232            c: ParserConfig::default(),
233            override_encoding: None,
234            ignore_invalid_encoding_declarations: false,
235            allow_multiple_root_elements: true,
236            max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
237            max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
238            max_attributes: 1<<16,
239            max_attribute_length: 1<<30,
240            max_data_length: 1<<30,
241            max_name_length: 1<<18,
242        }
243    }
244}
245
246impl ParserConfig2 {
247    /// Create extended configuration struct
248    #[inline]
249    #[must_use]
250    pub fn new() -> Self {
251        Self::default()
252    }
253
254    /// Read character encoding from `Content-Type` header.
255    /// Set this when parsing XML documents fetched over HTTP.
256    ///
257    /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
258    #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
259        let charset = mime_type.split_once(';')
260            .and_then(|(_, args)| args.split_once("charset"))
261            .and_then(|(_, args)| args.split_once('='));
262        if let Some((_, charset)) = charset {
263            let name = charset.trim().trim_matches('"');
264            if let Ok(enc) = name.parse() {
265                self.override_encoding = Some(enc);
266            }
267        }
268        self
269    }
270
271    /// Creates an XML reader with this configuration.
272    ///
273    /// This is a convenience method for configuring and creating a reader at the same time:
274    ///
275    /// ```rust
276    /// use xml_no_std::reader::ParserConfig;
277    ///
278    /// let mut source: &[u8] = b"...";
279    ///
280    /// let reader = ParserConfig::new()
281    ///     .trim_whitespace(true)
282    ///     .ignore_comments(true)
283    ///     .coalesce_characters(false)
284    ///     .create_reader(source.into_iter());
285    /// ```
286    ///
287    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
288    /// this configuration object.
289    #[inline]
290    pub fn create_reader<'a, S: Iterator<Item = &'a u8>>(self, source: S) -> EventReader<'a, S> {
291        EventReader::new_with_config(source, self)
292    }
293}
294
295impl From<ParserConfig> for ParserConfig2 {
296    #[inline]
297    fn from(c: ParserConfig) -> Self {
298        Self {
299            c,
300            ..Default::default()
301        }
302    }
303}
304
// Setter declarations for the options stored directly in `ParserConfig2`.
// NOTE(review): `gen_setters!` is not defined in the visible part of this file;
// presumably each `val` entry becomes a chainable, by-value setter method named
// after the field, carrying the doc comment written above it (the `mime_parse`
// test chains `max_entity_expansion_length` and `max_entity_expansion_depth`
// this way) — confirm against the macro definition.
gen_setters! { ParserConfig2,
    /// Set if you got one in the HTTP header
    override_encoding: val Option<Encoding>,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: val bool,
    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: val usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: val u8,
    /// Max number of attributes per element
    max_attributes: val usize,
    /// Maximum length of tag name or attribute name
    max_name_length: val usize,
    /// Max number of bytes in each attribute
    max_attribute_length: val usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: val usize,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: val bool
}
325
// Methods on the plain `ParserConfig` for options that only exist on `ParserConfig2`.
// NOTE(review): the meaning of the `c2` mode is defined by `gen_setters!`, which is
// not visible here; presumably each method upgrades the `ParserConfig` into a
// `ParserConfig2` and then applies the `ParserConfig2` method of the same name
// (note that `content_type` is a method, not a field) — confirm against the macro
// definition.
gen_setters! { ParserConfig,
    /// Set if you got one in the HTTP header (see `content_type`)
    override_encoding: c2 Option<Encoding>,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: c2 bool,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: c2 bool,

    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: c2 usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: c2 u8,
    /// Max number of attributes per element
    max_attributes: c2 usize,
    /// Maximum length of tag name or attribute name
    max_name_length: c2 usize,
    /// Max number of bytes in each attribute
    max_attribute_length: c2 usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: c2 usize,

    /// Set encoding from the MIME type. Important for HTTP compatibility.
    content_type: c2 &str
}
350
// Methods on `ParserConfig2` for the options that live in the wrapped `ParserConfig`.
// NOTE(review): the meaning of the `delegate` mode is defined by `gen_setters!`,
// which is not visible here; presumably each method sets the field of the same
// name on the inner `c: ParserConfig` and returns the `ParserConfig2` — confirm
// against the macro definition.
gen_setters! { ParserConfig2,
    trim_whitespace: delegate bool,
    whitespace_to_characters: delegate bool,
    cdata_to_characters: delegate bool,
    ignore_comments: delegate bool,
    coalesce_characters: delegate bool,
    ignore_end_of_stream: delegate bool,
    replace_unknown_entity_references: delegate bool,
    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
    ignore_root_level_whitespace: delegate bool
}
362
#[test]
fn mime_parse() {
    // The charset name is recognized regardless of letter case, and a setter
    // chained after `content_type` keeps the detected encoding.
    let ascii = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii.override_encoding, Some(Encoding::Ascii));

    // Whitespace around `=` and double quotes around the value are tolerated,
    // also when `content_type` is the last call in the chain.
    let utf16 = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(utf16.override_encoding, Some(Encoding::Utf16));
}