yam_core/tokenizer/
iterator.rs

1use alloc::borrow::Cow;
2use alloc::vec::Vec;
3use alloc::string::String;
4use core::fmt::{Display, Formatter};
5use core::marker::PhantomData;
6
7use core::str::from_utf8_unchecked;
8
9use urlencoding::decode_binary;
10
11use crate::escaper::{escape_double_quotes, escape_plain, escape_single_quotes};
12use crate::tokenizer::iterator::Event::ErrorEvent;
13use crate::tokenizer::{Reader, Slicer};
14use crate::Lexer;
15
16use super::StrReader;
17
18///
19/// Iterator over events
20///
21/// It returns borrowed events that correspond to the
22/// It's generic over:
23/// `'a` - lifetime
24/// [R] - Reader
25/// [RB] - Reader Buffer
26/// [I] - Input Buffer (optional)
27pub struct EventIterator<'a, R, RB = &'a [u8], I = ()> {
28    /// Reader type that usually implements a [Reader] trait which takes a Buffer type [B]
29    pub(crate) reader: R,
30    pub(crate) buffer: RB,
31    /// Lexer which controls current state of parsing
32    pub(crate) state: Lexer,
33    /// Tag of current node,
34    pub(crate) tag: Option<Cow<'a, [u8]>>,
35    /// Alias of current node,
36    pub(crate) anchor: Option<Cow<'a, [u8]>>,
37    /// Helper to store the unconstrained types
38    phantom: PhantomData<(&'a I, RB)>,
39}
40
41impl<'a> From<&'a str> for EventIterator<'a, StrReader<'a>, &'a [u8]> {
42    fn from(value: &'a str) -> Self {
43        EventIterator {
44            reader: StrReader::from(value),
45            state: Lexer::default(),
46            buffer: value.as_bytes(),
47            tag: None,
48            anchor: None,
49            phantom: PhantomData,
50        }
51    }
52}
53
54impl<'a> From<&'a [u8]> for EventIterator<'a, StrReader<'a>, &'a [u8]> {
55    fn from(value: &'a [u8]) -> Self {
56        EventIterator {
57            reader: StrReader::from(value),
58            state: Lexer::default(),
59            buffer: value,
60            tag: None,
61            anchor: None,
62            phantom: PhantomData,
63        }
64    }
65}
66
/// Presentation style of a scalar carried by [`Event::Scalar`].
///
/// The [`Display`] implementation of [`Event`] renders each style with its own
/// marker in front of the scalar value.
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum ScalarType {
    /// Plain (unquoted) scalar, rendered with the `:` marker.
    Plain,
    /// Folded block scalar, rendered with the `>` marker.
    Folded,
    /// Literal block scalar, rendered with the `|` marker.
    Literal,
    /// Single quoted scalar, rendered with the `'` marker.
    SingleQuote,
    /// Double quoted scalar, rendered with the `"` marker.
    DoubleQuote,
}
75
/// Kind of directive carried by [`Event::Directive`].
#[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)]
pub enum DirectiveType {
    /// `%YAML` directive; displayed as `%YAML` followed by the directive value.
    Yaml,
    /// Tag directive; displayed as the raw directive value.
    Tag,
    /// Reserved directive; displayed as the raw directive value.
    Reserved,
}
82
83#[derive(Clone, PartialEq)]
84pub enum Event<'a> {
85    DocStart {
86        explicit: bool,
87    },
88    DocEnd {
89        explicit: bool,
90    },
91    SeqStart {
92        tag: Option<Cow<'a, [u8]>>,
93        anchor: Option<Cow<'a, [u8]>>,
94        flow: bool,
95    },
96    SeqEnd,
97    MapStart {
98        tag: Option<Cow<'a, [u8]>>,
99        anchor: Option<Cow<'a, [u8]>>,
100        flow: bool,
101    },
102    MapEnd,
103    Directive {
104        directive_type: DirectiveType,
105        value: Cow<'a, [u8]>,
106    },
107    Scalar {
108        tag: Option<Cow<'a, [u8]>>,
109        anchor: Option<Cow<'a, [u8]>>,
110        scalar_type: ScalarType,
111        value: Cow<'a, [u8]>,
112    },
113    Alias(Cow<'a, [u8]>),
114    ErrorEvent,
115}
116
117impl<'a> Display for Event<'a> {
118    fn fmt(&self, f: &mut Formatter<'_>) -> core::fmt::Result {
119        match self {
120            Event::DocStart { explicit } => {
121                let exp_str = if *explicit { " ---" } else { "" };
122                write!(f, "+DOC{exp_str}")
123            }
124            Event::DocEnd { explicit } => {
125                let exp_str = if *explicit { " ..." } else { "" };
126                write!(f, "-DOC{exp_str}")
127            }
128            Event::SeqStart { flow, tag, anchor } => {
129                write!(f, "+SEQ",)?;
130                if *flow {
131                    write!(f, " []")?;
132                }
133                if let Some(cow) = anchor {
134                    let string = unsafe { from_utf8_unchecked(cow.as_ref()) };
135                    write!(f, " &{string}")?;
136                };
137                if let Some(cow) = tag {
138                    let string = unsafe { from_utf8_unchecked(cow.as_ref()) };
139                    write!(f, " <{string}>")?;
140                };
141                Ok(())
142            }
143            Event::SeqEnd => {
144                write!(f, "-SEQ")
145            }
146            Event::MapStart { flow, tag, anchor } => {
147                write!(f, "+MAP")?;
148                if *flow {
149                    write!(f, " {{}}")?;
150                }
151                if let Some(cow) = anchor {
152                    let string = unsafe { from_utf8_unchecked(cow.as_ref()) };
153                    write!(f, " &{string}")?;
154                };
155                if let Some(cow) = tag {
156                    let string = unsafe { from_utf8_unchecked(cow.as_ref()) };
157                    write!(f, " <{string}>")?;
158                };
159                Ok(())
160            }
161            Event::MapEnd => {
162                write!(f, "-MAP")
163            }
164            Event::Directive {
165                directive_type,
166                value,
167            } => {
168                let val_str = unsafe { from_utf8_unchecked(value.as_ref()) };
169                match directive_type {
170                    DirectiveType::Yaml => write!(f, "%YAML {val_str}"),
171                    _ => write!(f, "{val_str}"),
172                }
173            }
174            Event::Scalar {
175                scalar_type,
176                value,
177                tag,
178                anchor,
179            } => {
180                let val_str = unsafe { from_utf8_unchecked(value.as_ref()) };
181                write!(f, "=VAL")?;
182
183                if let Some(cow) = anchor {
184                    let string: &str = unsafe { from_utf8_unchecked(cow.as_ref()) };
185                    write!(f, " &{string}")?;
186                };
187                if let Some(cow) = tag {
188                    let string = unsafe { from_utf8_unchecked(cow.as_ref()) };
189                    write!(f, " <{string}>")?;
190                };
191                match *scalar_type {
192                    ScalarType::Plain => write!(f, " :"),
193                    ScalarType::Folded => write!(f, " >"),
194                    ScalarType::Literal => write!(f, " |"),
195                    ScalarType::SingleQuote => write!(f, " \'"),
196                    ScalarType::DoubleQuote => write!(f, " \""),
197                }?;
198                write!(f, "{val_str}")?;
199
200                Ok(())
201            }
202            ErrorEvent => {
203                write!(f, "ERR")
204            }
205            Event::Alias(value) => {
206                let val_str = unsafe { from_utf8_unchecked(value.as_ref()) };
207                write!(f, "=ALI *{val_str}")
208            }
209        }
210    }
211}
212
213impl<'a> Slicer<'a> for &'a [u8] {
214    fn slice(&self, start: usize, end: usize) -> &'a [u8] {
215        unsafe { self.get_unchecked(start..end) }
216    }
217}
218
219impl<'a, R, RB, B> Iterator for EventIterator<'a, R, RB, B>
220where
221    R: Reader<B>,
222    RB: Slicer<'a>,
223{
224    type Item = Event<'a>;
225
226    fn next(&mut self) -> Option<Self::Item> {
227        pub use crate::tokenizer::iterator::Event::*;
228        pub use crate::tokenizer::LexerToken::*;
229
230        loop {
231            if self.state.is_empty() && !self.state.stream_end {
232                self.state.fetch_next_token(&mut self.reader);
233            }
234
235            if let Some(x) = self.state.pop_token() {
236                let token = x.into();
237                match token {
238                    SequenceStart => {
239                        return Some(SeqStart {
240                            flow: true,
241                            tag: self.tag.take(),
242                            anchor: self.anchor.take(),
243                        });
244                    }
245                    SequenceStartImplicit => {
246                        return Some(SeqStart {
247                            flow: false,
248                            tag: self.tag.take(),
249                            anchor: self.anchor.take(),
250                        });
251                    }
252                    MappingStart => {
253                        return Some(MapStart {
254                            flow: true,
255                            tag: self.tag.take(),
256                            anchor: self.anchor.take(),
257                        });
258                    }
259                    MappingStartImplicit => {
260                        return Some(MapStart {
261                            flow: false,
262                            tag: self.tag.take(),
263                            anchor: self.anchor.take(),
264                        });
265                    }
266                    DocumentStart => {
267                        return Some(DocStart { explicit: false });
268                    }
269                    DocumentStartExplicit => {
270                        return Some(DocStart { explicit: true });
271                    }
272                    SequenceEnd => {
273                        return Some(SeqEnd);
274                    }
275                    MappingEnd => {
276                        return Some(MapEnd);
277                    }
278                    DocumentEnd => {
279                        return Some(DocEnd { explicit: false });
280                    }
281                    DocumentEndExplicit => {
282                        return Some(DocEnd { explicit: true });
283                    }
284                    ErrorToken => return Some(ErrorEvent),
285                    DirectiveReserved | DirectiveTag | DirectiveYaml => {
286                        let directive_type = unsafe { token.to_yaml_directive() };
287                        return if let (Some(start), Some(end)) =
288                            (self.state.pop_token(), self.state.pop_token())
289                        {
290                            let slice = Cow::Borrowed(self.buffer.slice(start, end));
291                            Some(Directive {
292                                directive_type,
293                                value: slice,
294                            })
295                        } else {
296                            panic!("Error in processing YAML file");
297                        };
298                    }
299                    ScalarPlain | ScalarLit | ScalarFold | ScalarDoubleQuote
300                    | ScalarSingleQuote | Mark => {
301                        // Safe if only one of these six
302                        let scalar_type = unsafe { token.to_scalar() };
303                        let mut cow: Cow<'a, [u8]> = Cow::default();
304                        loop {
305                            match (self.state.peek_token(), self.state.peek_token_next()) {
306                                (Some(start), Some(end))
307                                    if start < NewLine as usize && end < NewLine as usize =>
308                                {
309                                    if cow.is_empty() {
310                                        cow = Cow::Borrowed(self.buffer.slice(start, end));
311                                    } else {
312                                        cow.to_mut().extend(self.buffer.slice(start, end));
313                                    }
314                                    self.state.pop_token();
315                                    self.state.pop_token();
316                                }
317                                (Some(newline), Some(line)) if newline == NewLine as usize => {
318                                    if line == 0 {
319                                        cow.to_mut().extend(" ".as_bytes());
320                                    } else {
321                                        cow.to_mut().extend("\n".repeat(line).as_bytes());
322                                    }
323                                    self.state.pop_token();
324                                    self.state.pop_token();
325                                }
326                                (_, _) => {
327                                    break;
328                                }
329                            }
330                        }
331                        let cow = match scalar_type {
332                            ScalarType::Plain | ScalarType::Literal | ScalarType::Folded => {
333                                escape_plain(cow)
334                            }
335                            ScalarType::DoubleQuote => escape_double_quotes(cow),
336                            ScalarType::SingleQuote => escape_single_quotes(cow),
337                        };
338                        return Some(Scalar {
339                            scalar_type,
340                            value: cow,
341                            tag: self.tag.take(),
342                            anchor: self.anchor.take(),
343                        });
344                    }
345                    AliasToken => {
346                        if let (Some(start), Some(end)) =
347                            (self.state.pop_token(), self.state.pop_token())
348                        {
349                            return Some(Alias(Cow::Borrowed(self.buffer.slice(start, end))));
350                        }
351                    }
352                    AnchorToken => {
353                        if let (Some(start), Some(end)) =
354                            (self.state.pop_token(), self.state.pop_token())
355                        {
356                            self.anchor = Some(Cow::Borrowed(self.buffer.slice(start, end)));
357                        }
358                    }
359                    TagStart => {
360                        if let (Some(start), Some(mid), Some(end)) = (
361                            self.state.pop_token(),
362                            self.state.pop_token(),
363                            self.state.pop_token(),
364                        ) {
365                            let namespace = self.buffer.slice(start, mid);
366                            let extension = if end == 0 {
367                                b""
368                            } else {
369                                self.buffer.slice(mid, end)
370                            };
371                            self.tag = if let Some(&(e1, e2)) = self.state.tags.get(namespace) {
372                                let mut tag = Vec::from(self.buffer.slice(e1, e2));
373                                tag.extend_from_slice(extension);
374                                if tag.contains(&b'%') {
375                                    tag = decode_binary(&tag).into_owned();
376                                }
377                                Some(Cow::Owned(tag))
378                            } else if namespace == b"!!" && !extension.is_empty() {
379                                let mut cow: Cow<'_, [u8]> =
380                                    Cow::Owned(b"tag:yaml.org,2002:".to_vec());
381                                cow.to_mut().extend(extension);
382                                Some(cow)
383                            } else if namespace == b"!" {
384                                let mut cow: Cow<'_, [u8]> = Cow::Owned(b"!".to_vec());
385                                cow.to_mut().extend(extension);
386                                Some(cow)
387                            } else if extension.is_empty() && end == 0 {
388                                Some(Cow::Borrowed(namespace))
389                            } else {
390                                return Some(Event::ErrorEvent);
391                            }
392                        }
393                    }
394                    NewLine | ScalarEnd => {}
395                }
396            }
397            if self.state.stream_end && self.state.is_empty() {
398                return None;
399            }
400        }
401    }
402}
403
404pub fn assert_eq_event(input: &str, events: &str) {
405    use core::fmt::Write;
406    
407    let mut line = String::with_capacity(events.as_bytes().len());
408    let scan: EventIterator<'_, StrReader, _> = EventIterator::from(input);
409    scan.for_each(|ev| {
410        line.push('\n');
411        write!(line, "{ev:}").unwrap();
412    });
413
414    assert_eq!(line, events, "Error in {input}");
415}