1use crate::{
2 Document, DocumentBuilder, NameData,
3 attributes::AttributeData,
4 error::{ErrorKind, Result},
5 memchr::{memchr, memchr2, memchr2_count, memchr4},
6 nodes::{ElementData, NodeData, NodeId},
7 strings::{StringBuf, StringsBuilder, cmp_names, cmp_opt_names},
8 tokenizer::{Reference, Tokenizer},
9};
10
11impl<'input> Document<'input> {
12 pub fn parse(text: &'input str) -> Result<Self> {
13 let mut parser = Parser::new(text)?;
14
15 let mut tokenizer = Tokenizer::new(text);
16 tokenizer.parse(&mut parser)?;
17
18 let doc = parser.doc.build();
19
20 if !doc.root().children().any(|child| child.is_element()) {
21 return ErrorKind::MissingRootElement.into();
22 }
23
24 Ok(doc)
25 }
26}
27
/// Builds the document tree; `Document::parse` hands it to `Tokenizer::parse`,
/// and its `pub(crate)` methods take the tokenizer as an argument.
pub(crate) struct Parser<'input> {
    /// Builder collecting nodes, elements, attributes, strings and namespaces.
    doc: DocumentBuilder<'input>,
    /// Name of the element whose start tag was opened but not yet finished.
    /// Set by `open_element`, taken by `close_open_element` and
    /// `close_empty_element`.
    element: Option<CurrElement<'input>>,
    /// Node that newly appended nodes become children of.
    parent: NodeId,
    /// Nodes whose `next_subtree` link is still unset; `append_node` points
    /// all of them at the next node it creates and then empties the list.
    subtree: Vec<NodeId>,
    /// Attributes (other than namespace declarations) gathered for the
    /// current start tag until `resolve_attributes` moves them into `doc`.
    attributes: Vec<CurrAttribute<'input>>,
    /// Declared entities, kept sorted by name so they can be binary searched.
    entities: Vec<Entity<'input>>,
    /// Number of entity expansions currently nested inside each other;
    /// `open_entity` refuses to go beyond 10.
    entity_depth: u8,
    /// Number of nested expansions opened since the outermost one started;
    /// capped at 100 and reset once `entity_depth` drops back to zero.
    entity_breadth: u8,
}
38
/// Qualified name of the element whose start tag is being processed.
#[derive(Clone, Copy)]
struct CurrElement<'input> {
    /// Namespace prefix, if the name has one; resolved through
    /// `doc.namespaces.find` when the start tag is finished.
    prefix: Option<&'input str>,
    /// Local part of the element name.
    local: &'input str,
}
44
/// Attribute collected from the current start tag, with its namespace prefix
/// still unresolved.
#[derive(Clone, Copy)]
struct CurrAttribute<'input> {
    /// Namespace prefix, if the name has one.
    prefix: Option<&'input str>,
    /// Local part of the attribute name.
    local: &'input str,
    /// Identifier of the normalized value as returned by the string builder
    /// (see `normalize_attribute_value`).
    value: NodeId,
}
51
/// Entity declaration: a name and the text its references expand to.
///
/// The derived ordering compares `name` first and `value` second, following
/// the field order.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Entity<'input> {
    /// Name used in `&name;` references; lookup key in `Parser::entities`.
    name: &'input str,
    /// Replacement text, parsed again each time the entity is referenced.
    value: &'input str,
}
57
58impl<'input> Parser<'input> {
59 fn new(text: &'input str) -> Result<Self> {
60 let (nodes, attributes) = memchr2_count(b'<', b'=', text.as_bytes());
61
62 let mut doc = DocumentBuilder {
63 nodes: Vec::with_capacity(nodes),
64 elements: Vec::with_capacity(nodes / 2),
65 attributes: Vec::with_capacity(attributes),
66 strings: StringsBuilder::new(text, nodes / 2)?,
67 namespaces: Default::default(),
68 };
69
70 doc.nodes.push(NodeData {
71 element: None,
72 text: None,
73 parent: None,
74 prev_sibling: None,
75 next_subtree: None,
76 last_child: None,
77 });
78
79 let xml_uri = doc.strings.owned("http://www.w3.org/XML/1998/namespace")?;
80
81 doc.namespaces
82 .push(&mut doc.strings, 0, Some("xml"), xml_uri)?;
83
84 Ok(Self {
85 doc,
86 element: None,
87 parent: NodeId::ROOT,
88 subtree: Vec::new(),
89 attributes: Vec::new(),
90 entities: Vec::new(),
91 entity_depth: 0,
92 entity_breadth: 0,
93 })
94 }
95
96 pub(crate) fn open_element(&mut self, prefix: Option<&'input str>, local: &'input str) {
97 self.element = Some(CurrElement { prefix, local });
98 }
99
100 pub(crate) fn push_attribute(
101 &mut self,
102 tokenizer: &mut Tokenizer<'input>,
103 prefix: Option<&'input str>,
104 local: &'input str,
105 value: &'input str,
106 ) -> Result {
107 let value = self.normalize_attribute_value(tokenizer, value)?;
108
109 if cmp_opt_names(prefix, Some("xmlns")) {
110 self.doc.namespaces.push(
111 &mut self.doc.strings,
112 tokenizer.element_depth(),
113 Some(local),
114 value,
115 )?;
116 } else if prefix.is_none() && cmp_names(local, "xmlns") {
117 self.doc.namespaces.push(
118 &mut self.doc.strings,
119 tokenizer.element_depth(),
120 None,
121 value,
122 )?;
123 } else {
124 self.attributes.push(CurrAttribute {
125 prefix,
126 local,
127 value,
128 });
129 }
130
131 Ok(())
132 }
133
134 pub(crate) fn close_empty_element(&mut self, tokenizer: &Tokenizer<'input>) -> Result {
135 let (attributes_start, attributes_len) = self.resolve_attributes()?;
136
137 let Some(element) = self.element.take() else {
138 return ErrorKind::UnexpectedCloseElement.into();
139 };
140
141 let namespace = self.doc.namespaces.find(element.prefix)?;
142
143 let id = self.append_element_node(ElementData {
144 name: NameData {
145 namespace,
146 local: element.local,
147 },
148 attributes_start,
149 attributes_len,
150 })?;
151
152 self.subtree.push(id);
153
154 self.doc.namespaces.pop(tokenizer.element_depth());
155
156 Ok(())
157 }
158
159 pub(crate) fn close_element(
160 &mut self,
161 tokenizer: &mut Tokenizer<'input>,
162 prefix: Option<&'input str>,
163 local: &'input str,
164 ) -> Result {
165 self.element = None;
166
167 let parent = &self.doc.nodes[self.parent.get()];
168
169 if let Some(element) = parent.element {
170 let namespace = self.doc.namespaces.find(prefix)?;
171
172 let name = &self.doc.elements[element.get()].name;
173 let name_namespace = name.namespace;
174 let name_local = name.local;
175
176 if namespace != name_namespace || !cmp_names(local, name_local) {
177 return ErrorKind::UnexpectedCloseElement.into();
178 }
179 }
180
181 self.subtree.push(self.parent);
182
183 if let Some(ancestor) = parent.parent {
184 self.parent = ancestor;
185 } else {
186 return ErrorKind::UnexpectedCloseElement.into();
187 }
188
189 self.doc.namespaces.pop(tokenizer.element_depth());
190
191 Ok(())
192 }
193
194 pub(crate) fn close_open_element(&mut self) -> Result {
195 let (attributes_start, attributes_len) = self.resolve_attributes()?;
196
197 let Some(element) = self.element.take() else {
198 return ErrorKind::UnexpectedCloseElement.into();
199 };
200
201 let namespace = self.doc.namespaces.find(element.prefix)?;
202
203 let id = self.append_element_node(ElementData {
204 name: NameData {
205 namespace,
206 local: element.local,
207 },
208 attributes_start,
209 attributes_len,
210 })?;
211
212 self.parent = id;
213
214 Ok(())
215 }
216
217 fn append_node(&mut self, element: Option<NodeId>, text: Option<NodeId>) -> Result<NodeId> {
218 let new_id = NodeId::new(self.doc.nodes.len())?;
219
220 let prev_sibling = self.doc.nodes[self.parent.get()].last_child.replace(new_id);
221
222 self.doc.nodes.push(NodeData {
223 element,
224 text,
225 parent: Some(self.parent),
226 prev_sibling,
227 next_subtree: None,
228 last_child: None,
229 });
230
231 for &id in &self.subtree {
232 self.doc.nodes[id.get()].next_subtree = Some(new_id);
233 }
234
235 self.subtree.clear();
236
237 Ok(new_id)
238 }
239
240 fn append_element_node(&mut self, element: ElementData<'input>) -> Result<NodeId> {
241 let id = NodeId::new(self.doc.elements.len())?;
242
243 self.doc.elements.push(element);
244
245 self.append_node(Some(id), None)
246 }
247
248 fn append_text_node(&mut self, text: NodeId) -> Result<()> {
249 let id = self.append_node(None, Some(text))?;
250
251 self.subtree.push(id);
252
253 Ok(())
254 }
255
256 pub(crate) fn append_text(
257 &mut self,
258 tokenizer: &mut Tokenizer<'input>,
259 text: &'input str,
260 ) -> Result {
261 let pos = memchr2(b'&', b'\r', text.as_bytes());
262
263 if pos.is_none() {
264 let text = self.doc.strings.borrowed(text)?;
265
266 self.append_text_node(text)?;
267 return Ok(());
268 }
269
270 self.append_text_impl(tokenizer, text, pos)
271 }
272
273 #[cold]
274 #[inline(never)]
275 fn append_text_impl(
276 &mut self,
277 tokenizer: &mut Tokenizer<'input>,
278 mut text: &'input str,
279 mut pos: Option<usize>,
280 ) -> Result {
281 let mut strings = self.doc.strings.take();
282 let mut buf = StringBuf::new(&mut strings, text.len());
283
284 let mut was_cr = false;
285
286 while let Some(pos1) = pos {
287 let (before, after) = text.split_at(pos1);
288 buf.push_str(before);
289 text = after;
290
291 match text.as_bytes() {
292 [b'\r', b'\n', ..] => {
293 buf.push('\n');
294 text = &text[2..];
295 }
296 [b'\r', ..] => {
297 buf.push('\n');
298 text = &text[1..];
299 }
300 _ => {
301 text = &text[1..];
302
303 let ref_ =
304 tokenizer.with_text(&mut text, |tokenizer| tokenizer.parse_reference())?;
305
306 match ref_ {
307 Reference::Char(char_) => {
308 let is_entity = self.entity_depth != 0;
309
310 match char_ {
311 '\r' if is_entity => {
312 buf.push('\n');
313 was_cr = true;
314 }
315 '\n' if is_entity => {
316 if !was_cr {
317 buf.push('\n');
318 }
319 was_cr = false;
320 }
321 char_ => {
322 buf.push(char_);
323 was_cr = false;
324 }
325 }
326 }
327 Reference::Entity(name) => {
328 let mut value = self.find_entity(name)?;
329
330 if !buf.is_empty() {
331 let text = buf.finish()?;
332
333 self.append_text_node(text)?;
334 }
335
336 self.doc.strings = strings;
337
338 self.open_entity()?;
339 let element = self.element.take();
340
341 tokenizer
342 .with_text(&mut value, |tokenizer| tokenizer.parse_content(self))?;
343
344 self.element = element;
345 self.close_entity();
346
347 strings = self.doc.strings.take();
348 buf = StringBuf::new(&mut strings, 0);
349 }
350 }
351 }
352 }
353
354 pos = memchr2(b'&', b'\r', text.as_bytes());
355 }
356
357 buf.push_str(text);
358
359 if !buf.is_empty() {
360 let text = buf.finish()?;
361
362 self.append_text_node(text)?;
363 }
364
365 self.doc.strings = strings;
366
367 Ok(())
368 }
369
370 pub(crate) fn append_cdata(&mut self, cdata: &'input str) -> Result {
371 let pos = memchr(b'\r', cdata.as_bytes());
372
373 if pos.is_none() {
374 let text = self.doc.strings.borrowed(cdata)?;
375
376 self.append_text_node(text)?;
377 return Ok(());
378 }
379
380 self.append_cdata_impl(cdata, pos)
381 }
382
383 #[cold]
384 #[inline(never)]
385 fn append_cdata_impl(&mut self, mut cdata: &'input str, mut pos: Option<usize>) -> Result {
386 let mut buf = StringBuf::new(&mut self.doc.strings, cdata.len());
387
388 while let Some(pos1) = pos {
389 let (line, rest) = cdata.split_at(pos1);
390
391 buf.push_str(line);
392 buf.push('\n');
393
394 cdata = match rest.as_bytes().get(1) {
395 Some(&b'\n') => &rest[2..],
396 _ => &rest[1..],
397 };
398
399 pos = memchr(b'\r', cdata.as_bytes());
400 }
401
402 buf.push_str(cdata);
403
404 let text = buf.finish()?;
405
406 self.append_text_node(text)?;
407 Ok(())
408 }
409
410 fn normalize_attribute_value(
411 &mut self,
412 tokenizer: &mut Tokenizer<'input>,
413 value: &'input str,
414 ) -> Result<NodeId> {
415 let pos = memchr4(b'&', b'\t', b'\r', b'\n', value.as_bytes());
416
417 if pos.is_none() {
418 return self.doc.strings.borrowed(value);
419 }
420
421 let mut strings = self.doc.strings.take();
422 let mut buf = StringBuf::new(&mut strings, value.len());
423
424 self.normalize_attribute_value_impl(tokenizer, value, pos, &mut buf)?;
425
426 let value = buf.finish()?;
427 self.doc.strings = strings;
428
429 Ok(value)
430 }
431
432 #[cold]
433 #[inline(never)]
434 fn normalize_attribute_value_impl(
435 &mut self,
436 tokenizer: &mut Tokenizer<'input>,
437 mut value: &'input str,
438 mut pos: Option<usize>,
439 buf: &mut StringBuf<'_, 'input>,
440 ) -> Result {
441 while let Some(pos1) = pos {
442 let (before, after) = value.split_at(pos1);
443 buf.push_str(before);
444 value = after;
445
446 match value.as_bytes() {
447 [b'\r', b'\n', ..] => {
448 buf.push(' ');
449 value = &value[2..];
450 }
451 [b'\t' | b'\r' | b'\n', ..] => {
452 buf.push(' ');
453 value = &value[1..];
454 }
455 _ => {
456 value = &value[1..];
457
458 let ref_ =
459 tokenizer.with_text(&mut value, |tokenizer| tokenizer.parse_reference())?;
460
461 match ref_ {
462 Reference::Char(char_) => {
463 let is_entity = self.entity_depth != 0;
464
465 let char_ = match char_ {
466 '\t' | '\r' | '\n' if is_entity => ' ',
467 char_ => char_,
468 };
469
470 buf.push(char_);
471 }
472 Reference::Entity(name) => {
473 let value = self.find_entity(name)?;
474
475 let pos = memchr4(b'&', b'\t', b'\r', b'\n', value.as_bytes());
476
477 if pos.is_none() {
478 buf.push_str(value);
479 } else {
480 self.open_entity()?;
481
482 self.normalize_attribute_value_impl(tokenizer, value, pos, buf)?;
483
484 self.close_entity();
485 }
486 }
487 }
488 }
489 }
490
491 pos = memchr4(b'&', b'\t', b'\r', b'\n', value.as_bytes());
492 }
493
494 buf.push_str(value);
495
496 Ok(())
497 }
498
499 fn resolve_attributes(&mut self) -> Result<(u32, u16)> {
500 let start = self.doc.attributes.len();
501 let len = self.attributes.len();
502
503 if start > u32::MAX as usize || len > u16::MAX as usize {
504 return ErrorKind::TooManyAttributes.into();
505 }
506
507 for attribute in &self.attributes {
508 let namespace = if attribute.prefix.is_none() {
509 None
510 } else {
511 self.doc.namespaces.find(attribute.prefix)?
512 };
513
514 self.doc.attributes.push(AttributeData {
515 name: NameData {
516 namespace,
517 local: attribute.local,
518 },
519 value: attribute.value,
520 });
521 }
522
523 self.attributes.clear();
524
525 Ok((start as u32, len as u16))
526 }
527
528 pub(crate) fn push_entity(&mut self, name: &'input str, value: &'input str) {
529 if let Err(idx) = self
530 .entities
531 .binary_search_by_key(&name, |entity| entity.name)
532 {
533 self.entities.insert(idx, Entity { name, value });
534 }
535 }
536
537 fn find_entity(&self, name: &'input str) -> Result<&'input str> {
538 match self
539 .entities
540 .binary_search_by_key(&name, |entity| entity.name)
541 {
542 Ok(idx) => Ok(self.entities[idx].value),
543 Err(_idx) => ErrorKind::UnknownEntity(name.to_owned()).into(),
544 }
545 }
546
547 fn open_entity(&mut self) -> Result {
548 if self.entity_depth != 0 {
549 if self.entity_breadth == 100 {
550 return ErrorKind::TooManyEntityReferences.into();
551 }
552
553 self.entity_breadth += 1;
554 }
555
556 if self.entity_depth == 10 {
557 return ErrorKind::TooManyEntityReferences.into();
558 }
559
560 self.entity_depth += 1;
561
562 Ok(())
563 }
564
565 fn close_entity(&mut self) {
566 if self.entity_depth != 0 {
567 self.entity_depth -= 1;
568 }
569
570 if self.entity_depth == 0 {
571 self.entity_breadth = 0;
572 }
573 }
574}