1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
use crate::parser::{CommentParser, ElementParser, Parser, PiParser};
/// State of the scanner that skips over a `<!DOCTYPE ...>` declaration.
///
/// Each variant names the syntactic context the scanner is currently in, which
/// determines which bytes are treated as special while looking for the `>` that
/// closes the declaration.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum DtdParser {
    /// The part of the declaration that precedes the optional internal subset.
    ///
    /// The payload is `0` while outside of any literal (this is the initial state).
    /// While inside a `PubidLiteral` or `SystemLiteral` it is the quote byte
    /// (`'` or `"`) that opened the literal.
    ///
    /// ```text
    /// [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
    /// ```
    BeforeInternalSubset(u8),
    /// Between `[` and `]`, i.e. inside of the `intSubset` rule, but not inside
    /// any markup declaration.
    ///
    /// ```text
    /// [28a] DeclSep    ::= PEReference | S
    /// [28b] intSubset  ::= (markupdecl | DeclSep)*
    /// [29]  markupdecl ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
    /// ```
    InsideOfInternalSubset,
    /// The closing `]` of the internal subset has been seen, the final `>` has not.
    AfterInternalSubset,
    /// Inside a comment (`<!-- ... -->`) of the internal subset.
    /// The payload is the state of the comment scanner.
    InComment(CommentParser),
    /// Inside a processing instruction (`<? ... ?>`) of the internal subset.
    /// The payload is the state of the processing instruction scanner.
    InPi(PiParser),
    /// Inside an element declaration.
    ///
    /// ```text
    /// [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>'
    /// ```
    InElementDecl,
    /// Inside an ATTLIST, ENTITY or NOTATION declaration. These are the declarations
    /// that may contain quoted strings (`'...'` or `"..."`), and a `>` inside such
    /// a string must not be treated as the end of the markup.
    /// The payload is the state of the quote-aware scanner.
    ///
    /// The following productions of the XML grammar are covered by this state:
    ///
    /// ### ATTLIST
    ///
    /// ```text
    /// [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>'
    /// [53] AttDef      ::= S Name S AttType S DefaultDecl
    /// [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' | (('#FIXED' S)? AttValue)
    /// ```
    ///
    /// ### ENTITY
    ///
    /// ```text
    /// [70] EntityDecl ::= GEDecl | PEDecl
    /// [71] GEDecl     ::= '<!ENTITY' S Name S EntityDef S? '>'
    /// [72] PEDecl     ::= '<!ENTITY' S '%' S Name S PEDef S? '>'
    /// [73] EntityDef  ::= EntityValue | (ExternalID NDataDecl?)
    /// [74] PEDef      ::= EntityValue | ExternalID
    /// [75] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
    /// [76] NDataDecl  ::= S 'NDATA' S Name
    /// ```
    ///
    /// ### NOTATION
    ///
    /// ```text
    /// [82] NotationDecl ::= '<!NOTATION' S Name S (ExternalID | PublicID) S? '>'
    /// ```
    InQuoteSensitive(ElementParser),
    /// A `<` was seen inside the internal subset, but the data available so far
    /// was too short to tell which markup it starts.
    ///
    /// The payload is the number of bytes already read after that `<`.
    UndecidedMarkup(usize),
    /// The `>` that closes the DOCTYPE declaration has been found.
    Finished,
}
impl DtdParser {
    /// Length of the longest recognised markup keyword (`!NOTATION`), counted from
    /// the byte that follows `<`. Once that many bytes are available, the kind of
    /// markup can always be decided.
    const MAX_KEYWORD_LEN: usize = 9;

    /// Skip DTD contents.
    ///
    /// Scans `chunk` and advances the state machine until the `>` that closes the
    /// DOCTYPE declaration is found or `chunk` is exhausted. The state is kept in
    /// `self`, so scanning can be resumed with the next chunk.
    ///
    /// # Parameters (the same as for `reader::BangType::parse`)
    /// - `buf`: buffer with data consumed on previous iterations
    /// - `chunk`: data read on current iteration and not yet consumed from reader
    ///
    /// # Returns
    /// - `Some(i)`: the closing `>` is located at `chunk[i]`; the state becomes
    ///   [`DtdParser::Finished`]
    /// - `None`: the end of the declaration is not in `chunk`
    ///
    /// # Panics
    /// If the parser is in the [`DtdParser::UndecidedMarkup`] state holding `n` and
    /// `buf` contains fewer than `n` bytes, or `n` is greater than 9. The last `n`
    /// bytes of `buf` are expected to be the bytes seen after the pending `<`.
    pub fn feed(&mut self, buf: &[u8], chunk: &[u8]) -> Option<usize> {
        // This method assumes the DTD is well-formed.
        // Since this crate does not support parsing DTDs, the inability to read non-well-formed DTDs
        // is not particularly problematic; the only point of interest is reporting well-formed DTDs
        // to the user without errors. Malformed input is skipped on a best-effort basis,
        // but must never cause a panic.
        let mut cur = chunk;
        while !cur.is_empty() {
            match *self {
                Self::BeforeInternalSubset(0) => {
                    // Find the
                    // - start of quoted string ('...' or "...")
                    // - start of internal subset ([...])
                    // - end of DOCTYPE declaration (>)
                    let special = cur
                        .iter()
                        .position(|&b| matches!(b, b'\'' | b'"' | b'[' | b'>'));
                    let i = match special {
                        Some(i) => i,
                        None => break,
                    };
                    match cur[i] {
                        b'>' => {
                            *self = Self::Finished;
                            return Some(chunk.len() - cur.len() + i);
                        }
                        b'[' => *self = Self::InsideOfInternalSubset,
                        // The only remaining possibility is the opening quote of
                        // a SystemLiteral or PubidLiteral
                        quote => *self = Self::BeforeInternalSubset(quote),
                    }
                    cur = &cur[i + 1..]; // +1 to skip the special byte
                }
                // Inside the quoted string (this is PubidLiteral or SystemLiteral) we do not want to
                // recognize other special characters (namely [ and >). Find only the closing quote
                Self::BeforeInternalSubset(quote) => match memchr::memchr(quote, cur) {
                    Some(i) => {
                        *self = Self::BeforeInternalSubset(0);
                        cur = &cur[i + 1..]; // +1 to skip the closing quote
                    }
                    None => break,
                },
                Self::InsideOfInternalSubset => {
                    // Find the end of internal subset (]) or the start of the markup inside (<)
                    let i = match memchr::memchr2(b']', b'<', cur) {
                        Some(i) => i,
                        None => break,
                    };
                    // Everything after the found `]` or `<`
                    let rest = &cur[i + 1..];
                    if cur[i] == b']' {
                        *self = Self::AfterInternalSubset;
                        cur = rest;
                    } else if let Some(skip) = self.switch(rest) {
                        cur = &rest[skip..];
                    } else {
                        // Keep the number of already looked bytes (started from byte after `<`),
                        // try to decide after feeding the new chunk
                        *self = Self::UndecidedMarkup(rest.len());
                        break;
                    }
                }
                Self::AfterInternalSubset => match memchr::memchr(b'>', cur) {
                    Some(i) => {
                        *self = Self::Finished;
                        return Some(chunk.len() - cur.len() + i);
                    }
                    None => break,
                },
                // If comment is ended, return to the main state and resume at the offset
                // reported by the parser, otherwise keep in the current state
                Self::InComment(ref mut parser) => match parser.feed(cur) {
                    Some(i) => {
                        *self = Self::InsideOfInternalSubset;
                        cur = &cur[i..];
                    }
                    None => break,
                },
                // If processing instruction is ended, return to the main state and resume
                // at the offset reported by the parser, otherwise keep in the current state
                Self::InPi(ref mut parser) => match parser.feed(cur) {
                    Some(i) => {
                        *self = Self::InsideOfInternalSubset;
                        cur = &cur[i..];
                    }
                    None => break,
                },
                // `<!ELEMENT >` does not have places where `>` could be escaped
                // so the first occurrence ends that state
                Self::InElementDecl => match memchr::memchr(b'>', cur) {
                    Some(i) => {
                        *self = Self::InsideOfInternalSubset;
                        cur = &cur[i + 1..]; // +1 for `>`
                    }
                    None => break,
                },
                // If ATTLIST, ENTITY or NOTATION is ended, return to the main state and resume
                // at the offset reported by the parser, otherwise keep in the current state
                Self::InQuoteSensitive(ref mut parser) => match parser.feed(cur) {
                    Some(i) => {
                        *self = Self::InsideOfInternalSubset;
                        cur = &cur[i..];
                    }
                    None => break,
                },
                Self::UndecidedMarkup(skipped) => {
                    // Buffer is long enough to store the longest possible keyword `!NOTATION`
                    let mut bytes = [0u8; Self::MAX_KEYWORD_LEN];
                    // Copy the last `skipped` bytes from the previous iteration into buffer,
                    // for example, "!NOT" (skipped = 4 in that case)...
                    bytes[..skipped].copy_from_slice(&buf[buf.len() - skipped..]);
                    // ...add new bytes to the buffer from current iteration,
                    // for example, "ATION"...
                    let end = bytes.len().min(skipped + cur.len());
                    bytes[skipped..end].copy_from_slice(&cur[..end - skipped]);
                    // ...and try to match over it.
                    // For example, "!NOTATION" will return 9, and we skip 9-4=5 bytes of "ATION"
                    if let Some(skip) = self.switch(&bytes[..end]) {
                        // `skip` counts from the byte after `<`, and the first `skipped` of
                        // those bytes were consumed on previous iterations. Saturate instead
                        // of subtracting so that an unexpected `buf` tail cannot underflow.
                        cur = &cur[skip.saturating_sub(skipped)..];
                        continue;
                    }
                    // Still not enough data: all of `cur` was looked at
                    *self = Self::UndecidedMarkup(skipped + cur.len());
                    break;
                }
                Self::Finished => break,
            }
        }
        None
    }

    /// Decides which markup starts with `markup` and switches to the corresponding state.
    ///
    /// # Parameters
    /// - `markup`: bytes that follow the `<` found inside the internal subset
    ///
    /// # Returns
    /// - `Some(n)`: the markup was classified, `self` was updated and the first `n`
    ///   bytes of `markup` should be skipped
    /// - `None`: `markup` is too short to make a decision; `self` is left untouched
    #[inline]
    fn switch(&mut self, markup: &[u8]) -> Option<usize> {
        match markup {
            [b'?', ..] => {
                // <?
                *self = Self::InPi(PiParser(false));
                Some(1)
            }
            [b'!', b'-', b'-', ..] => {
                // <!--
                *self = Self::InComment(CommentParser::Seen0);
                Some(3)
            }
            [b'!', b'E', b'L', b'E', b'M', b'E', b'N', b'T', ..] => {
                // <!ELEMENT
                *self = Self::InElementDecl;
                Some(8)
            }
            [b'!', b'E', b'N', b'T', b'I', b'T', b'Y', ..] => {
                // <!ENTITY
                *self = Self::InQuoteSensitive(ElementParser::Outside);
                Some(7)
            }
            [b'!', b'A', b'T', b'T', b'L', b'I', b'S', b'T', ..] => {
                // <!ATTLIST
                *self = Self::InQuoteSensitive(ElementParser::Outside);
                Some(8)
            }
            [b'!', b'N', b'O', b'T', b'A', b'T', b'I', b'O', b'N', ..] => {
                // <!NOTATION
                *self = Self::InQuoteSensitive(ElementParser::Outside);
                Some(9)
            }
            // FIXME: to correctly report error position in DTD we need to provide
            // DTD events. For now our task just to skip (correct) DTD, so we postpone
            // error reporting and go with ending the unknown markup with `>`.
            _ => match memchr::memchr(b'>', markup) {
                // No known keyword contains `>`, so a `>` seen here always ends an unknown
                // markup, however short it is. Deciding right away keeps the result
                // independent of how the input is split into chunks.
                Some(i) => {
                    *self = Self::InsideOfInternalSubset;
                    Some(i + 1)
                }
                // <... - `markup` does not have enough data to determine markup.
                // Undecided markup bytes will be written to `buf` to be available on
                // next iteration.
                None if markup.len() < Self::MAX_KEYWORD_LEN => None,
                // Markup is not known and its end is not seen yet. The element declaration
                // state does exactly what is needed here: it skips everything up to the
                // first `>` and then returns to the internal subset.
                None => {
                    *self = Self::InElementDecl;
                    Some(markup.len())
                }
            },
        }
    }
}