robinson/
parser.rs

1use crate::{
2    Document, DocumentBuilder, NameData,
3    attributes::AttributeData,
4    error::{ErrorKind, Result},
5    memchr::{memchr, memchr2, memchr2_count, memchr4},
6    nodes::{ElementData, NodeData, NodeId},
7    strings::{StringBuf, StringsBuilder, cmp_names, cmp_opt_names},
8    tokenizer::{Reference, Tokenizer},
9};
10
11impl<'input> Document<'input> {
12    pub fn parse(text: &'input str) -> Result<Self> {
13        let mut parser = Parser::new(text)?;
14
15        let mut tokenizer = Tokenizer::new(text);
16        tokenizer.parse(&mut parser)?;
17
18        let doc = parser.doc.build();
19
20        if !doc.root().children().any(|child| child.is_element()) {
21            return ErrorKind::MissingRootElement.into();
22        }
23
24        Ok(doc)
25    }
26}
27
28pub(crate) struct Parser<'input> {
29    doc: DocumentBuilder<'input>,
30    element: Option<CurrElement<'input>>,
31    parent: NodeId,
32    subtree: Vec<NodeId>,
33    attributes: Vec<CurrAttribute<'input>>,
34    entities: Vec<Entity<'input>>,
35    entity_depth: u8,
36    entity_breadth: u8,
37}
38
/// Qualified name of the start tag that is currently being read.
#[derive(Clone, Copy)]
struct CurrElement<'input> {
    /// Namespace prefix as written in the input, if any.
    prefix: Option<&'input str>,
    /// Local part of the element name.
    local: &'input str,
}
44
45#[derive(Clone, Copy)]
46struct CurrAttribute<'input> {
47    prefix: Option<&'input str>,
48    local: &'input str,
49    value: NodeId,
50}
51
/// Declared entity: its name and the text it is replaced with.
///
/// The derived ordering compares `name` first and `value` second, so the
/// field order is significant.
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
struct Entity<'input> {
    /// Name used to reference the entity.
    name: &'input str,
    /// Replacement text, borrowed from the input.
    value: &'input str,
}
57
58impl<'input> Parser<'input> {
59    fn new(text: &'input str) -> Result<Self> {
60        let (nodes, attributes) = memchr2_count(b'<', b'=', text.as_bytes());
61
62        let mut doc = DocumentBuilder {
63            nodes: Vec::with_capacity(nodes),
64            elements: Vec::with_capacity(nodes / 2),
65            attributes: Vec::with_capacity(attributes),
66            strings: StringsBuilder::new(text, nodes / 2)?,
67            namespaces: Default::default(),
68        };
69
70        doc.nodes.push(NodeData {
71            element: None,
72            text: None,
73            parent: None,
74            prev_sibling: None,
75            next_subtree: None,
76            last_child: None,
77        });
78
79        let xml_uri = doc.strings.owned("http://www.w3.org/XML/1998/namespace")?;
80
81        doc.namespaces
82            .push(&mut doc.strings, 0, Some("xml"), xml_uri)?;
83
84        Ok(Self {
85            doc,
86            element: None,
87            parent: NodeId::ROOT,
88            subtree: Vec::new(),
89            attributes: Vec::new(),
90            entities: Vec::new(),
91            entity_depth: 0,
92            entity_breadth: 0,
93        })
94    }
95
96    pub(crate) fn open_element(&mut self, prefix: Option<&'input str>, local: &'input str) {
97        self.element = Some(CurrElement { prefix, local });
98    }
99
100    pub(crate) fn push_attribute(
101        &mut self,
102        tokenizer: &mut Tokenizer<'input>,
103        prefix: Option<&'input str>,
104        local: &'input str,
105        value: &'input str,
106    ) -> Result {
107        let value = self.normalize_attribute_value(tokenizer, value)?;
108
109        if cmp_opt_names(prefix, Some("xmlns")) {
110            self.doc.namespaces.push(
111                &mut self.doc.strings,
112                tokenizer.element_depth(),
113                Some(local),
114                value,
115            )?;
116        } else if prefix.is_none() && cmp_names(local, "xmlns") {
117            self.doc.namespaces.push(
118                &mut self.doc.strings,
119                tokenizer.element_depth(),
120                None,
121                value,
122            )?;
123        } else {
124            self.attributes.push(CurrAttribute {
125                prefix,
126                local,
127                value,
128            });
129        }
130
131        Ok(())
132    }
133
134    pub(crate) fn close_empty_element(&mut self, tokenizer: &Tokenizer<'input>) -> Result {
135        let (attributes_start, attributes_len) = self.resolve_attributes()?;
136
137        let Some(element) = self.element.take() else {
138            return ErrorKind::UnexpectedCloseElement.into();
139        };
140
141        let namespace = self.doc.namespaces.find(element.prefix)?;
142
143        let id = self.append_element_node(ElementData {
144            name: NameData {
145                namespace,
146                local: element.local,
147            },
148            attributes_start,
149            attributes_len,
150        })?;
151
152        self.subtree.push(id);
153
154        self.doc.namespaces.pop(tokenizer.element_depth());
155
156        Ok(())
157    }
158
159    pub(crate) fn close_element(
160        &mut self,
161        tokenizer: &mut Tokenizer<'input>,
162        prefix: Option<&'input str>,
163        local: &'input str,
164    ) -> Result {
165        self.element = None;
166
167        let parent = &self.doc.nodes[self.parent.get()];
168
169        if let Some(element) = parent.element {
170            let namespace = self.doc.namespaces.find(prefix)?;
171
172            let name = &self.doc.elements[element.get()].name;
173            let name_namespace = name.namespace;
174            let name_local = name.local;
175
176            if namespace != name_namespace || !cmp_names(local, name_local) {
177                return ErrorKind::UnexpectedCloseElement.into();
178            }
179        }
180
181        self.subtree.push(self.parent);
182
183        if let Some(ancestor) = parent.parent {
184            self.parent = ancestor;
185        } else {
186            return ErrorKind::UnexpectedCloseElement.into();
187        }
188
189        self.doc.namespaces.pop(tokenizer.element_depth());
190
191        Ok(())
192    }
193
194    pub(crate) fn close_open_element(&mut self) -> Result {
195        let (attributes_start, attributes_len) = self.resolve_attributes()?;
196
197        let Some(element) = self.element.take() else {
198            return ErrorKind::UnexpectedCloseElement.into();
199        };
200
201        let namespace = self.doc.namespaces.find(element.prefix)?;
202
203        let id = self.append_element_node(ElementData {
204            name: NameData {
205                namespace,
206                local: element.local,
207            },
208            attributes_start,
209            attributes_len,
210        })?;
211
212        self.parent = id;
213
214        Ok(())
215    }
216
217    fn append_node(&mut self, element: Option<NodeId>, text: Option<NodeId>) -> Result<NodeId> {
218        let new_id = NodeId::new(self.doc.nodes.len())?;
219
220        let prev_sibling = self.doc.nodes[self.parent.get()].last_child.replace(new_id);
221
222        self.doc.nodes.push(NodeData {
223            element,
224            text,
225            parent: Some(self.parent),
226            prev_sibling,
227            next_subtree: None,
228            last_child: None,
229        });
230
231        for &id in &self.subtree {
232            self.doc.nodes[id.get()].next_subtree = Some(new_id);
233        }
234
235        self.subtree.clear();
236
237        Ok(new_id)
238    }
239
240    fn append_element_node(&mut self, element: ElementData<'input>) -> Result<NodeId> {
241        let id = NodeId::new(self.doc.elements.len())?;
242
243        self.doc.elements.push(element);
244
245        self.append_node(Some(id), None)
246    }
247
248    fn append_text_node(&mut self, text: NodeId) -> Result<()> {
249        let id = self.append_node(None, Some(text))?;
250
251        self.subtree.push(id);
252
253        Ok(())
254    }
255
256    pub(crate) fn append_text(
257        &mut self,
258        tokenizer: &mut Tokenizer<'input>,
259        text: &'input str,
260    ) -> Result {
261        let pos = memchr2(b'&', b'\r', text.as_bytes());
262
263        if pos.is_none() {
264            let text = self.doc.strings.borrowed(text)?;
265
266            self.append_text_node(text)?;
267            return Ok(());
268        }
269
270        self.append_text_impl(tokenizer, text, pos)
271    }
272
273    #[cold]
274    #[inline(never)]
275    fn append_text_impl(
276        &mut self,
277        tokenizer: &mut Tokenizer<'input>,
278        mut text: &'input str,
279        mut pos: Option<usize>,
280    ) -> Result {
281        let mut strings = self.doc.strings.take();
282        let mut buf = StringBuf::new(&mut strings, text.len());
283
284        let mut was_cr = false;
285
286        while let Some(pos1) = pos {
287            let (before, after) = text.split_at(pos1);
288            buf.push_str(before);
289            text = after;
290
291            match text.as_bytes() {
292                [b'\r', b'\n', ..] => {
293                    buf.push('\n');
294                    text = &text[2..];
295                }
296                [b'\r', ..] => {
297                    buf.push('\n');
298                    text = &text[1..];
299                }
300                _ => {
301                    text = &text[1..];
302
303                    let ref_ =
304                        tokenizer.with_text(&mut text, |tokenizer| tokenizer.parse_reference())?;
305
306                    match ref_ {
307                        Reference::Char(char_) => {
308                            let is_entity = self.entity_depth != 0;
309
310                            match char_ {
311                                '\r' if is_entity => {
312                                    buf.push('\n');
313                                    was_cr = true;
314                                }
315                                '\n' if is_entity => {
316                                    if !was_cr {
317                                        buf.push('\n');
318                                    }
319                                    was_cr = false;
320                                }
321                                char_ => {
322                                    buf.push(char_);
323                                    was_cr = false;
324                                }
325                            }
326                        }
327                        Reference::Entity(name) => {
328                            let mut value = self.find_entity(name)?;
329
330                            if !buf.is_empty() {
331                                let text = buf.finish()?;
332
333                                self.append_text_node(text)?;
334                            }
335
336                            self.doc.strings = strings;
337
338                            self.open_entity()?;
339                            let element = self.element.take();
340
341                            tokenizer
342                                .with_text(&mut value, |tokenizer| tokenizer.parse_content(self))?;
343
344                            self.element = element;
345                            self.close_entity();
346
347                            strings = self.doc.strings.take();
348                            buf = StringBuf::new(&mut strings, 0);
349                        }
350                    }
351                }
352            }
353
354            pos = memchr2(b'&', b'\r', text.as_bytes());
355        }
356
357        buf.push_str(text);
358
359        if !buf.is_empty() {
360            let text = buf.finish()?;
361
362            self.append_text_node(text)?;
363        }
364
365        self.doc.strings = strings;
366
367        Ok(())
368    }
369
370    pub(crate) fn append_cdata(&mut self, cdata: &'input str) -> Result {
371        let pos = memchr(b'\r', cdata.as_bytes());
372
373        if pos.is_none() {
374            let text = self.doc.strings.borrowed(cdata)?;
375
376            self.append_text_node(text)?;
377            return Ok(());
378        }
379
380        self.append_cdata_impl(cdata, pos)
381    }
382
383    #[cold]
384    #[inline(never)]
385    fn append_cdata_impl(&mut self, mut cdata: &'input str, mut pos: Option<usize>) -> Result {
386        let mut buf = StringBuf::new(&mut self.doc.strings, cdata.len());
387
388        while let Some(pos1) = pos {
389            let (line, rest) = cdata.split_at(pos1);
390
391            buf.push_str(line);
392            buf.push('\n');
393
394            cdata = match rest.as_bytes().get(1) {
395                Some(&b'\n') => &rest[2..],
396                _ => &rest[1..],
397            };
398
399            pos = memchr(b'\r', cdata.as_bytes());
400        }
401
402        buf.push_str(cdata);
403
404        let text = buf.finish()?;
405
406        self.append_text_node(text)?;
407        Ok(())
408    }
409
410    fn normalize_attribute_value(
411        &mut self,
412        tokenizer: &mut Tokenizer<'input>,
413        value: &'input str,
414    ) -> Result<NodeId> {
415        let pos = memchr4(b'&', b'\t', b'\r', b'\n', value.as_bytes());
416
417        if pos.is_none() {
418            return self.doc.strings.borrowed(value);
419        }
420
421        let mut strings = self.doc.strings.take();
422        let mut buf = StringBuf::new(&mut strings, value.len());
423
424        self.normalize_attribute_value_impl(tokenizer, value, pos, &mut buf)?;
425
426        let value = buf.finish()?;
427        self.doc.strings = strings;
428
429        Ok(value)
430    }
431
432    #[cold]
433    #[inline(never)]
434    fn normalize_attribute_value_impl(
435        &mut self,
436        tokenizer: &mut Tokenizer<'input>,
437        mut value: &'input str,
438        mut pos: Option<usize>,
439        buf: &mut StringBuf<'_, 'input>,
440    ) -> Result {
441        while let Some(pos1) = pos {
442            let (before, after) = value.split_at(pos1);
443            buf.push_str(before);
444            value = after;
445
446            match value.as_bytes() {
447                [b'\r', b'\n', ..] => {
448                    buf.push(' ');
449                    value = &value[2..];
450                }
451                [b'\t' | b'\r' | b'\n', ..] => {
452                    buf.push(' ');
453                    value = &value[1..];
454                }
455                _ => {
456                    value = &value[1..];
457
458                    let ref_ =
459                        tokenizer.with_text(&mut value, |tokenizer| tokenizer.parse_reference())?;
460
461                    match ref_ {
462                        Reference::Char(char_) => {
463                            let is_entity = self.entity_depth != 0;
464
465                            let char_ = match char_ {
466                                '\t' | '\r' | '\n' if is_entity => ' ',
467                                char_ => char_,
468                            };
469
470                            buf.push(char_);
471                        }
472                        Reference::Entity(name) => {
473                            let value = self.find_entity(name)?;
474
475                            let pos = memchr4(b'&', b'\t', b'\r', b'\n', value.as_bytes());
476
477                            if pos.is_none() {
478                                buf.push_str(value);
479                            } else {
480                                self.open_entity()?;
481
482                                self.normalize_attribute_value_impl(tokenizer, value, pos, buf)?;
483
484                                self.close_entity();
485                            }
486                        }
487                    }
488                }
489            }
490
491            pos = memchr4(b'&', b'\t', b'\r', b'\n', value.as_bytes());
492        }
493
494        buf.push_str(value);
495
496        Ok(())
497    }
498
499    fn resolve_attributes(&mut self) -> Result<(u32, u16)> {
500        let start = self.doc.attributes.len();
501        let len = self.attributes.len();
502
503        if start > u32::MAX as usize || len > u16::MAX as usize {
504            return ErrorKind::TooManyAttributes.into();
505        }
506
507        for attribute in &self.attributes {
508            let namespace = if attribute.prefix.is_none() {
509                None
510            } else {
511                self.doc.namespaces.find(attribute.prefix)?
512            };
513
514            self.doc.attributes.push(AttributeData {
515                name: NameData {
516                    namespace,
517                    local: attribute.local,
518                },
519                value: attribute.value,
520            });
521        }
522
523        self.attributes.clear();
524
525        Ok((start as u32, len as u16))
526    }
527
528    pub(crate) fn push_entity(&mut self, name: &'input str, value: &'input str) {
529        if let Err(idx) = self
530            .entities
531            .binary_search_by_key(&name, |entity| entity.name)
532        {
533            self.entities.insert(idx, Entity { name, value });
534        }
535    }
536
537    fn find_entity(&self, name: &'input str) -> Result<&'input str> {
538        match self
539            .entities
540            .binary_search_by_key(&name, |entity| entity.name)
541        {
542            Ok(idx) => Ok(self.entities[idx].value),
543            Err(_idx) => ErrorKind::UnknownEntity(name.to_owned()).into(),
544        }
545    }
546
547    fn open_entity(&mut self) -> Result {
548        if self.entity_depth != 0 {
549            if self.entity_breadth == 100 {
550                return ErrorKind::TooManyEntityReferences.into();
551            }
552
553            self.entity_breadth += 1;
554        }
555
556        if self.entity_depth == 10 {
557            return ErrorKind::TooManyEntityReferences.into();
558        }
559
560        self.entity_depth += 1;
561
562        Ok(())
563    }
564
565    fn close_entity(&mut self) {
566        if self.entity_depth != 0 {
567            self.entity_depth -= 1;
568        }
569
570        if self.entity_depth == 0 {
571            self.entity_breadth = 0;
572        }
573    }
574}