sise/
parser.rs

1use crate::is_atom_chr;
2use crate::is_atom_string_chr;
3
/// A single item produced by the parser.
///
/// Every variant carries, as its last field, the byte offset in the input
/// at which the item starts.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParsedItem<'a> {
    /// An atom, borrowed from the input.
    ///
    /// The `usize` is the byte offset of the atom's first byte.
    Atom(&'a str, usize),
    /// The opening `(` of a list.
    ///
    /// The `usize` is the byte offset of the `(`.
    ListStart(usize),
    /// The closing `)` of a list.
    ///
    /// The `usize` is the byte offset of the `)`.
    ListEnd(usize),
}
19
/// An error encountered while parsing.
///
/// Every variant records `pos`, a byte offset into the input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ParseError {
    /// A character that cannot start a token was found at `pos`.
    IllegalChr { pos: usize, chr: char },

    /// A character that is not allowed inside a string (text enclosed in
    /// `"`) was found at `pos`.
    IllegalChrInString { pos: usize, chr: char },

    /// A character that is not allowed inside a comment was found at `pos`.
    IllegalChrInComment { pos: usize, chr: char },

    /// The input ended before the closing `"` of a string.
    UnfinishedString { pos: usize },

    /// The input ended where another token was required.
    UnexpectedEof { pos: usize },

    /// A `)` was found where it is not allowed.
    UnexpectedRightParen { pos: usize },

    /// A token was found where end-of-file was required.
    ExpectedEof { pos: usize },
}
44
45impl core::fmt::Display for ParseError {
46    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
47        match *self {
48            ParseError::IllegalChr { pos, chr } => {
49                write!(f, "illegal character {:?} at byte {}", chr, pos)
50            }
51            ParseError::IllegalChrInString { pos, chr } => {
52                write!(f, "illegal character {:?} in string at byte {}", chr, pos)
53            }
54            ParseError::IllegalChrInComment { pos, chr } => {
55                write!(f, "illegal character {:?} in comment at byte {}", chr, pos)
56            }
57            ParseError::UnfinishedString { pos } => write!(f, "unfinished string at byte {}", pos),
58            ParseError::UnexpectedEof { pos } => {
59                write!(f, "unexpected end-of-file at byte {}", pos)
60            }
61            ParseError::UnexpectedRightParen { pos } => {
62                write!(f, "unexpected `)` at byte {}", pos)
63            }
64            ParseError::ExpectedEof { pos } => write!(f, "expected end-of-file at byte {}", pos),
65        }
66    }
67}
68
// `std::error::Error` is only implemented when the `std` feature is enabled.
// The trait's default methods are used as-is; `ParseError` wraps no source
// error.
#[cfg(feature = "std")]
impl std::error::Error for ParseError {}
71
72/// Parser that decodes a SISE file into a sequence of [`ParsedItem`].
73///
74/// # Example
75///
76/// ```
77/// let data = "(test (1 2 3))";
78/// let mut parser = sise::Parser::new(data);
79/// assert_eq!(parser.next_item().unwrap(), sise::ParsedItem::ListStart(0));
80/// assert_eq!(
81///     parser.next_item().unwrap(),
82///     sise::ParsedItem::Atom("test", 1),
83/// );
84/// assert_eq!(parser.next_item().unwrap(), sise::ParsedItem::ListStart(6));
85/// assert_eq!(parser.next_item().unwrap(), sise::ParsedItem::Atom("1", 7));
86/// assert_eq!(parser.next_item().unwrap(), sise::ParsedItem::Atom("2", 9));
87/// assert_eq!(parser.next_item().unwrap(), sise::ParsedItem::Atom("3", 11));
88/// assert_eq!(parser.next_item().unwrap(), sise::ParsedItem::ListEnd(12));
89/// assert_eq!(parser.next_item().unwrap(), sise::ParsedItem::ListEnd(13));
90/// parser.finish().unwrap();
91/// ```
92pub struct Parser<'a> {
93    lexer: Lexer<'a>,
94    state: State,
95}
96
/// Progress of a [`Parser`] through its input.
enum State {
    /// No item has been produced yet.
    Beginning,
    /// Inside the root list; `depth` counts the lists currently open
    /// *below* the root list (0 means directly inside the root list).
    Parsing { depth: usize },
    /// The root value has been fully produced; only end-of-file may follow.
    Finishing,
}
102
103impl<'a> Parser<'a> {
104    pub fn new(data: &'a str) -> Self {
105        Self {
106            lexer: Lexer::new(data),
107            state: State::Beginning,
108        }
109    }
110
111    pub fn next_item(&mut self) -> Result<ParsedItem<'a>, ParseError> {
112        match self.state {
113            State::Beginning => {
114                let (pos, token) = self.lexer.get_token()?;
115                match token {
116                    Token::Eof => Err(ParseError::UnexpectedEof { pos }),
117                    Token::LeftParen => {
118                        self.state = State::Parsing { depth: 0 };
119                        Ok(ParsedItem::ListStart(pos))
120                    }
121                    Token::RightParen => Err(ParseError::UnexpectedRightParen { pos }),
122                    Token::Atom(atom) => {
123                        self.state = State::Finishing;
124                        Ok(ParsedItem::Atom(atom, pos))
125                    }
126                }
127            }
128            State::Parsing { ref mut depth } => {
129                let (pos, token) = self.lexer.get_token()?;
130                match token {
131                    Token::Eof => Err(ParseError::UnexpectedEof { pos }),
132                    Token::LeftParen => {
133                        *depth += 1;
134                        Ok(ParsedItem::ListStart(pos))
135                    }
136                    Token::RightParen => {
137                        if *depth == 0 {
138                            self.state = State::Finishing;
139                        } else {
140                            *depth -= 1;
141                        }
142                        Ok(ParsedItem::ListEnd(pos))
143                    }
144                    Token::Atom(atom) => Ok(ParsedItem::Atom(atom, pos)),
145                }
146            }
147            State::Finishing => panic!("parsing finished"),
148        }
149    }
150
151    pub fn finish(mut self) -> Result<(), ParseError> {
152        match self.state {
153            State::Finishing => {
154                let (pos, token) = self.lexer.get_token()?;
155                match token {
156                    Token::Eof => Ok(()),
157                    _ => Err(ParseError::ExpectedEof { pos }),
158                }
159            }
160            _ => panic!("parsing not finished yet"),
161        }
162    }
163}
164
/// A lexical token produced by the lexer.
#[derive(Debug, Clone, PartialEq, Eq)]
enum Token<'a> {
    /// End of the input.
    Eof,
    /// A `(`.
    LeftParen,
    /// A `)`.
    RightParen,
    /// An atom, borrowed verbatim from the input.
    Atom(&'a str),
}
172
/// Tokenizer state: the unread tail of the input and where it starts.
struct Lexer<'a> {
    /// The part of the input that has not been consumed yet.
    rem_input: &'a str,
    /// Byte offset of `rem_input` within the original input.
    rem_offset: usize,
}
177
178impl<'a> Lexer<'a> {
179    fn new(input: &'a str) -> Self {
180        Lexer {
181            rem_input: input,
182            rem_offset: 0,
183        }
184    }
185
186    #[must_use]
187    #[inline]
188    fn eat_any_char(&mut self) -> Option<char> {
189        let mut iter = self.rem_input.chars();
190        if let Some(chr) = iter.next() {
191            let new_rem = iter.as_str();
192            self.rem_offset += self.rem_input.len() - new_rem.len();
193            self.rem_input = new_rem;
194            Some(chr)
195        } else {
196            None
197        }
198    }
199
200    #[must_use]
201    #[inline]
202    fn eat_char(&mut self, chr: char) -> bool {
203        if let Some(new_rem) = self.rem_input.strip_prefix(chr) {
204            self.rem_offset += self.rem_input.len() - new_rem.len();
205            self.rem_input = new_rem;
206            true
207        } else {
208            false
209        }
210    }
211
212    #[must_use]
213    #[inline]
214    fn eat_char_if(&mut self, pred: impl FnMut(char) -> bool) -> bool {
215        if let Some(new_rem) = self.rem_input.strip_prefix(pred) {
216            self.rem_offset += self.rem_input.len() - new_rem.len();
217            self.rem_input = new_rem;
218            true
219        } else {
220            false
221        }
222    }
223
224    fn get_token(&mut self) -> Result<(usize, Token<'a>), ParseError> {
225        loop {
226            let start_str = self.rem_input;
227            let chr_pos = self.rem_offset;
228            if self.eat_char(' ')
229                || self.eat_char('\t')
230                || self.eat_char('\n')
231                || self.eat_char('\r')
232            {
233                // skip whitespace
234            } else if self.eat_char(';') {
235                // skip comments
236                loop {
237                    let chr_pos = self.rem_offset;
238                    match self.eat_any_char() {
239                        None => return Ok((self.rem_offset, Token::Eof)),
240                        Some('\n' | '\r') => break,
241                        Some('\t' | ' '..='~') => {}
242                        Some(chr) => {
243                            return Err(ParseError::IllegalChrInComment { chr, pos: chr_pos });
244                        }
245                    }
246                }
247            } else if self.eat_char('(') {
248                return Ok((chr_pos, Token::LeftParen));
249            } else if self.eat_char(')') {
250                return Ok((chr_pos, Token::RightParen));
251            } else if let Some(chr) = self.eat_any_char() {
252                if is_atom_chr(chr) || chr == '"' {
253                    let begin_pos = chr_pos;
254                    let end_pos = self.lex_atom(chr)?;
255                    let atom = &start_str[..(end_pos - begin_pos)];
256                    return Ok((begin_pos, Token::Atom(atom)));
257                } else {
258                    // invalid character
259                    return Err(ParseError::IllegalChr { chr, pos: chr_pos });
260                }
261            } else {
262                // end-of-file
263                return Ok((self.rem_offset, Token::Eof));
264            }
265        }
266    }
267
268    fn lex_atom(&mut self, first_chr: char) -> Result<usize, ParseError> {
269        let mut in_string = first_chr == '"';
270        loop {
271            let chr_pos = self.rem_offset;
272            if in_string {
273                if self.eat_char('"') {
274                    in_string = false;
275                } else if self.eat_char('\\') {
276                    let chr_pos = self.rem_offset;
277                    if let Some(chr) = self.eat_any_char() {
278                        if chr != '"' && chr != '\\' && !is_atom_string_chr(chr) {
279                            return Err(ParseError::IllegalChrInString { chr, pos: chr_pos });
280                        }
281                    } else {
282                        return Err(ParseError::UnfinishedString { pos: chr_pos });
283                    }
284                } else if let Some(chr) = self.eat_any_char() {
285                    if !is_atom_string_chr(chr) {
286                        return Err(ParseError::IllegalChrInString { chr, pos: chr_pos });
287                    }
288                } else {
289                    return Err(ParseError::UnfinishedString { pos: chr_pos });
290                }
291            } else if self.eat_char('"') {
292                in_string = true;
293            } else if !self.eat_char_if(is_atom_chr) {
294                return Ok(chr_pos);
295            }
296        }
297    }
298}