welly_parser/
lexer.rs

1//! Welly's lexer.
2
3use super::{Tree, Stream, Context, Parse};
4
5/// Represents a line comment or a block comment.
6///
7/// A line comment begins with `//` and ends before a newline.
8/// A block comment begins with `/*` and ends with `*/`.
9/// The text of the comment can only be retrieved if you know its [`Location`].
10///
11/// [`Location`]: super::Location
12#[derive(Debug, Clone, PartialEq)]
13pub struct Comment;
14
15impl Tree for Comment {}
16
17/// Represents a Welly character literal.
18///
19/// A character literal consists of a single character or escape sequence
20/// enclosed in ASCII `'` characters.
21#[derive(Debug, Clone, PartialEq)]
22pub struct CharacterLiteral(pub char);
23
24impl Tree for CharacterLiteral {}
25
26/// Represents a Welly string literal.
27///
28/// A character literal consists of zero or more characters or escape sequences
29/// enclosed in ASCII `"` characters.
30#[derive(Debug, Clone, PartialEq)]
31pub struct StringLiteral(pub String);
32
33impl Tree for StringLiteral {}
34
/// Error: the input stopped yielding characters before a block comment was
/// closed by `*/`.
pub const UNTERMINATED_BLOCK_COMMENT: &str = "Unterminated block comment";

/// Error: a newline, or something other than a character, was found before
/// the closing `"` of a string literal.
pub const UNTERMINATED_STRING: &str = "Unterminated string";

/// Error: the opening `'` of a character literal was not followed by a
/// character (e.g. `''`, or a newline straight after the quote).
pub const MISSING_CHAR: &str = "Missing character literal";

/// Error: the character of a character literal was not followed by a
/// closing `'`.
pub const UNTERMINATED_CHAR: &str = "Unterminated character literal";

/// Error message for an unexpected escape sequence.
///
/// NOTE(review): nothing in this file produces this message; presumably it
/// is kept for external users of the constant — confirm before removing.
pub const BAD_ESCAPE: &str = "Unexpected escape sequence";

/// Error: a backslash was not followed by a recognised escape character.
pub const MISSING_SEQUENCE: &str = "Missing escape sequence";

/// Error: a `\x` or `\u` escape was not followed by the required number of
/// hexadecimal digits.
pub const MISSING_HEX: &str = "Expected a hex digit";

/// Error: the value of a hexadecimal escape is not a Unicode scalar value.
pub const INVALID: &str = "Invalid unicode scalar value";
43
/// If `c` is a hexadecimal digit, return its numeric value.
///
/// Accepts the ASCII digits `0`-`9` and the letters `a`-`f` / `A`-`F`
/// (values 10 to 15). Every other character, including non-ASCII digits,
/// yields `None`.
fn hex_digit_value(c: char) -> Option<u32> {
    // The standard library already implements exactly this table.
    c.to_digit(16)
}
53
54// ----------------------------------------------------------------------------
55
/// A [`Parse`] implementation that recognises [`Comment`]s,
/// [`CharacterLiteral`]s and [`StringLiteral`]s.
///
/// It parses a [`Stream`] that contains [`char`]s.
///
/// `Parser` has no fields, so copying or comparing it is free.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct Parser;
62
63impl Parser {
64    /// Parse a line comment, starting after the initial `//`.
65    fn parse_line_comment(
66        &self,
67        input: &mut Context<impl Stream>,
68    ) -> Result<Box<dyn Tree>, String> {
69        while input.read_if::<char>(|&c| c != '\n')?.is_some() {}
70        Ok(Box::new(Comment))
71    }
72
73    /// Parse a line comment, starting after the initial `/*`.
74    fn parse_block_comment(
75        &self,
76        input: &mut Context<impl Stream>,
77    ) -> Result<Box<dyn Tree>, String> {
78        loop {
79            if let Some(c) = input.read::<char>()? {
80                if *c == '*' {
81                    if input.read_if::<char>(|&c| c == '/')?.is_some() { break; }
82                }
83            } else {
84                // E.g. `EndOfFile`.
85                Err(UNTERMINATED_BLOCK_COMMENT)?
86            }
87        }
88        Ok(Box::new(Comment))
89    }
90
91    /// Parse `num_digits` hexadecimal digits.
92    fn parse_hex(
93        &self,
94        input: &mut Context<impl Stream>,
95        num_digits: usize,
96    ) -> Result<char, String> {
97        let mut ret: u32 = 0;
98        for i in 0..num_digits {
99            if let Some(c) = input.read::<char>()? {
100                if let Some(d) = hex_digit_value(*c) {
101                    ret |= d << (i * 4);
102                } else {
103                    // `c` is not a digit.
104                    input.unread(c);
105                    Err(MISSING_HEX)?
106                }
107            } else {
108                // E.g. `EndOfFile`.
109                Err(MISSING_HEX)?
110            }
111        }
112        char::from_u32(ret).ok_or_else(|| INVALID.into())
113    }
114
115    /// Parse a single character or an escape sequence.
116    /// - if_missing - the error message if we don't receive a character.
117    /// Returns:
118    /// - the `char` value.
119    /// - `true` if it was escaped.
120    fn parse_char(
121        &self,
122        input: &mut Context<impl Stream>,
123        if_missing: &'static str,
124    ) -> Result<(char, bool), String> {
125        if let Some(c) = input.read::<char>()? {
126            if *c == '\n' { input.unread(c); return Err(if_missing)? }
127            if *c != '\\' { return Ok((*c, false)); }
128        } else {
129            Err(if_missing)?
130        }
131        // We've read a backslash.
132        if let Some(c) = input.read::<char>()? {
133            match *c {
134                '0' => { return Ok(('\0', true)) },
135                't' => { return Ok(('\t', true)) },
136                'n' => { return Ok(('\n', true)) },
137                'r' => { return Ok(('\r', true)) },
138                '"' => { return Ok(('"', true)) },
139                '\'' => { return Ok(('\'', true)) },
140                '\\' => { return Ok(('\\', true)) },
141                'x' => { return Ok((self.parse_hex(input, 2)?, true)) },
142                'u' => { return Ok((self.parse_hex(input, 4)?, true)) },
143                _ => { input.unread(c); },
144            }
145        }
146        Err(MISSING_SEQUENCE)?
147        
148    }
149
150    /// Discards characters from `input` up to the next `end` or newline.
151    fn skip_until(
152        &self,
153        input: &mut Context<impl Stream>,
154        end: char,
155    ) -> Result<(), String> {
156        while let Some(c) = input.read::<char>()? {
157            let _ = input.pop();
158            if *c == '\n' || *c == end { break; }
159        }
160        Ok(())
161    }
162
163    /// Parse a character literal, starting after the initial `'`.
164    fn parse_character_literal(
165        &self,
166        input: &mut Context<impl Stream>,
167    ) -> Result<Box<dyn Tree>, String> {
168        let (c, is_escaped) = self.parse_char(input, MISSING_CHAR).or_else(|e| {
169            self.skip_until(input, '\'')?;
170            Err(e)
171        })?;
172        if c == '\'' && !is_escaped { Err(MISSING_CHAR)? }
173        if let Some(c2) = input.read::<char>()? {
174            if *c2 != '\'' { input.unread(c2); Err(UNTERMINATED_CHAR)? }
175        } else {
176            Err(UNTERMINATED_CHAR)?
177        }
178        Ok(Box::new(CharacterLiteral(c)))
179    }
180
181    /// Parse a string literal, starting after the initial `"`.
182    fn parse_string_literal(
183        &self,
184        input: &mut Context<impl Stream>,
185    ) -> Result<Box<dyn Tree>, String> {
186        let mut s = String::new();
187        loop {
188            let (c, is_escaped) = self.parse_char(input, UNTERMINATED_STRING).or_else(|e| {
189                self.skip_until(input, '\"')?;
190                Err(e)
191            })?;
192            if c == '"' && !is_escaped { break; }
193            s.push(c);
194        }
195        s.shrink_to_fit();
196        Ok(Box::new(StringLiteral(s)))
197    }
198}
199
200impl Parse for Parser {
201    fn parse(
202        &self,
203        input: &mut Context<impl Stream>,
204    ) -> Result<Box<dyn Tree>, String> {
205        if let Some(c) = input.read::<char>()? {
206            match *c {
207                '/' => if let Some(c2) = input.read::<char>()? {
208                    match *c2 {
209                        '/' => { return self.parse_line_comment(input); },
210                        '*' => { return self.parse_block_comment(input); },
211                        _ => { input.unread(c2); },
212                    }
213                },
214                '\'' => { return self.parse_character_literal(input); },
215                '\"' => { return self.parse_string_literal(input); },
216                _ => {},
217            }
218            Ok(c)
219        } else {
220            // E.g. end of file.
221            input.read_any()
222        }
223    }
224}
225
226// ----------------------------------------------------------------------------
227
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{EndOfFile, Characters};

    /// A line comment stops before the newline that follows it.
    #[test]
    fn line_comment() {
        let mut lexed = Parser.parse_stream(Characters::new("a // b\nc", true));
        assert_eq!(lexed.read(), 'a');
        assert_eq!(lexed.read(), ' ');
        assert_eq!(lexed.read(), Comment);
        assert_eq!(lexed.read(), '\n');
        assert_eq!(lexed.read(), 'c');
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// A line comment may be ended by the end of the file.
    #[test]
    fn line_comment_eof() {
        let mut lexed = Parser.parse_stream(Characters::new("a // b", true));
        assert_eq!(lexed.read(), 'a');
        assert_eq!(lexed.read(), ' ');
        assert_eq!(lexed.read(), Comment);
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// A block comment runs up to and including its `*/`.
    #[test]
    fn block_comment() {
        let mut lexed = Parser.parse_stream(Characters::new("a /* b */", true));
        assert_eq!(lexed.read(), 'a');
        assert_eq!(lexed.read(), ' ');
        assert_eq!(lexed.read(), Comment);
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// A block comment that is still open at the end of the file is an error.
    #[test]
    fn block_comment_eof() {
        let mut lexed = Parser.parse_stream(Characters::new("a /* b", true));
        assert_eq!(lexed.read(), 'a');
        assert_eq!(lexed.read(), ' ');
        assert_eq!(lexed.read().unwrap_err(), UNTERMINATED_BLOCK_COMMENT);
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// Escape sequences are decoded inside string and character literals.
    #[test]
    fn escapes() {
        let mut lexed = Parser.parse_stream(Characters::new(r#"f("h\"w\"!", '\n')"#, true));
        assert_eq!(lexed.read(), 'f');
        assert_eq!(lexed.read(), '(');
        assert_eq!(lexed.read(), StringLiteral("h\"w\"!".into()));
        assert_eq!(lexed.read(), ',');
        assert_eq!(lexed.read(), ' ');
        assert_eq!(lexed.read(), CharacterLiteral('\n'));
        assert_eq!(lexed.read(), ')');
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// A raw newline is not allowed inside a character literal.
    #[test]
    fn bad_newline() {
        let mut lexed = Parser.parse_stream(Characters::new("'\n'", true));
        assert_eq!(lexed.read().unwrap_err(), MISSING_CHAR);
        assert_eq!(lexed.read().unwrap_err(), MISSING_CHAR);
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// An unknown escape in a character literal discards the whole literal.
    #[test]
    fn bad_char() {
        let mut lexed = Parser.parse_stream(Characters::new(r"'\j'", true));
        assert_eq!(lexed.read().unwrap_err(), MISSING_SEQUENCE);
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// An unknown escape in a string literal discards the whole literal.
    #[test]
    fn bad_str() {
        let mut lexed = Parser.parse_stream(Characters::new(r#""a\j""#, true));
        assert_eq!(lexed.read().unwrap_err(), MISSING_SEQUENCE);
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// `''` is an error, not an empty character literal.
    #[test]
    fn empty_char() {
        let mut lexed = Parser.parse_stream(Characters::new("''", true));
        assert_eq!(lexed.read().unwrap_err(), MISSING_CHAR);
        assert_eq!(lexed.read(), EndOfFile);
    }

    /// A character literal holds one character; lexing resumes at the second.
    #[test]
    fn double_char() {
        let mut lexed = Parser.parse_stream(Characters::new("'ab'", true));
        assert_eq!(lexed.read().unwrap_err(), UNTERMINATED_CHAR);
        assert_eq!(lexed.read(), 'b');
        assert_eq!(lexed.read().unwrap_err(), MISSING_CHAR);
        assert_eq!(lexed.read(), EndOfFile);
    }
}
321}