kast_ast/
lexer.rs

1// We are finally making some progress
2use super::*;
3use std::collections::HashMap;
4
/// Result alias used throughout the lexer; the error type defaults to `ErrorMessage`.
pub type Result<T, E = ErrorMessage> = std::result::Result<T, E>;
6
/// Which quote character delimits a string literal.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum StringType {
    /// The literal is delimited by `'`.
    SingleQuoted,
    /// The literal is delimited by `"`.
    DoubleQuoted,
}
12
/// A single lexical token produced by the lexer.
///
/// Every variant except `Eof` carries `raw`, the source text the token was
/// built from.
#[allow(dead_code)]
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Token {
    /// An identifier, either plain (`foo`, `foo-bar`) or raw (`@"..."`).
    Ident {
        /// Source text of the identifier (for raw identifiers this includes the `@` and the quotes).
        raw: String,
        /// The identifier's name; for raw identifiers, the contents of the string after `@`.
        name: String,
        /// `true` when the identifier was written in the `@"..."` form.
        is_raw: bool,
    },
    /// An operator or bracket.
    Punctuation {
        /// Source text of the punctuation.
        raw: String,
    },
    /// A quoted string literal.
    String {
        /// Source text including the quotes and unprocessed escapes.
        raw: String,
        /// Text between the quotes with escape sequences resolved.
        contents: String,
        /// Which quote character was used.
        typ: StringType,
    },
    /// A numeric literal; only the source text is kept, it is not parsed here.
    Number {
        /// Source text: ASCII digits, `_`, and at most one `.`.
        raw: String,
    },
    /// A `#` line comment or a `/* ... */` comment.
    Comment {
        /// Source text including the opening marker.
        raw: String,
        /// Text following the opening `#` or `/*` marker.
        contents: String,
    },
    /// End of input.
    Eof,
}
38
39impl std::fmt::Display for Token {
40    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
41        write!(f, "{:?}", self.raw())
42    }
43}
44
45impl Token {
46    pub fn into_raw(self) -> String {
47        match self {
48            Token::Ident { raw, .. } => raw,
49            Token::Punctuation { raw } => raw,
50            Token::String { raw, .. } => raw,
51            Token::Number { raw } => raw,
52            Token::Comment { raw, .. } => raw,
53            Token::Eof => "<EOF>".to_owned(),
54        }
55    }
56    pub fn raw(&self) -> &str {
57        match self {
58            Token::Ident { raw, .. } => raw,
59            Token::Punctuation { raw } => raw,
60            Token::String { raw, .. } => raw,
61            Token::Number { raw } => raw,
62            Token::Comment { raw, .. } => raw,
63            Token::Eof => "<EOF>",
64        }
65    }
66    pub fn is_eof(&self) -> bool {
67        matches!(self, Self::Eof)
68    }
69    pub fn is_comment(&self) -> bool {
70        matches!(self, Self::Comment { .. })
71    }
72}
73
/// Character-level lexer state.
struct Lexer {
    // source: SourceFile,
    /// Character stream being lexed; the lexer relies on its `peek` / `peek2` lookahead.
    reader: peek2::Reader<char>,
    /// Id that the next call to `start_recording` will hand out.
    next_recording_id: u64,
    /// Active recordings keyed by id; every consumed character is appended to each of them.
    recordings: HashMap<u64, String>,
}
80
81impl Lexer {
82    fn next_token(&mut self) -> Result<SpannedToken, Error> {
83        match self.next_token_impl() {
84            Ok(result) => Ok(result),
85            Err(message) => {
86                let start = self.reader.position();
87                Err(message.at(Span {
88                    filename: self.reader.filename().to_owned(),
89                    start,
90                    end: match self.reader.peek() {
91                        Some('\n') => Position {
92                            index: start.index + 1,
93                            line: start.line + 1,
94                            column: 1,
95                        },
96                        Some(_) => Position {
97                            index: start.index + 1,
98                            line: start.line,
99                            column: start.column + 1,
100                        },
101                        None => start,
102                    },
103                }))
104            }
105        }
106    }
107    fn next_token_impl(&mut self) -> Result<SpannedToken> {
108        self.skip_whitespace();
109        let start = self.reader.position();
110        let token = [
111            Self::read_simple_comment,
112            Self::read_long_comment,
113            Self::read_string,
114            Self::read_ident,
115            Self::read_number,
116            Self::read_punctuation,
117        ]
118        .into_iter()
119        .find_map(|f| f(self).transpose())
120        .transpose()?;
121        let token = match token {
122            None => {
123                if let Some(c) = self.reader.peek() {
124                    return error!("Unexpected char {c:?}");
125                }
126                Token::Eof
127            }
128            Some(token) => token,
129        };
130        let end = self.reader.position();
131        Ok(SpannedToken {
132            token,
133            span: Span {
134                start,
135                end,
136                filename: self.reader.filename().to_owned(),
137            },
138        })
139    }
140    fn skip_whitespace(&mut self) {
141        while self.reader.peek().map_or(false, |c| c.is_whitespace()) {
142            self.next().unwrap();
143        }
144    }
145    fn skip_char(&mut self, expected: char) -> Result<()> {
146        match self.reader.peek() {
147            None => error!("expected {expected:?}, got EOF"),
148            Some(&actual) if actual == expected => {
149                self.next().unwrap();
150                Ok(())
151            }
152            Some(&actual) => error!("expected {expected:?}, got {actual:?}"),
153        }
154    }
155
156    fn read_while(&mut self, mut f: impl FnMut(char) -> bool) -> Result<String> {
157        let mut result = String::new();
158        while let Some(&c) = self.reader.peek() {
159            if f(c) {
160                result.push(c);
161                self.next().unwrap();
162            } else {
163                break;
164            }
165        }
166        Ok(result)
167    }
168}
169
/// Handle for an in-progress recording; wraps the key used in `Lexer::recordings`.
struct RecordingToken(u64);
171
172impl Lexer {
173    fn start_recording(&mut self) -> RecordingToken {
174        let id = self.next_recording_id;
175        self.next_recording_id += 1;
176        self.recordings.insert(id, String::new());
177        RecordingToken(id)
178    }
179    fn stop_recording(&mut self, token: RecordingToken) -> String {
180        self.recordings.remove(&token.0).unwrap()
181    }
182    fn next(&mut self) -> Option<char> {
183        let next = self.reader.next();
184        if let Some(c) = next {
185            for recording in self.recordings.values_mut() {
186                recording.push(c);
187            }
188        }
189        next
190    }
191}
192
193impl Lexer {
194    fn read_long_comment(&mut self) -> Result<Option<Token>> {
195        if self.reader.peek() != Some(&'/') {
196            return Ok(None);
197        }
198        if self.reader.peek2() != Some(&'*') {
199            return Ok(None);
200        }
201        let raw = self.start_recording();
202        self.skip_char('/')?;
203        self.skip_char('*')?;
204        let mut prev = ['?', '?']; // just some random symbol
205        Ok(Some(Token::Comment {
206            contents: self.read_while(|c| {
207                if prev == ['*', '/'] {
208                    return false;
209                }
210                let [_prev1, prev2] = prev;
211                prev = [prev2, c];
212                true
213            })?,
214            raw: self.stop_recording(raw),
215        }))
216    }
217    fn read_simple_comment(&mut self) -> Result<Option<Token>> {
218        if self.reader.peek() != Some(&'#') {
219            return Ok(None);
220        }
221        let raw = self.start_recording();
222        self.skip_char('#')?;
223        Ok(Some(Token::Comment {
224            contents: self.read_while(|c| c != '\n')?,
225            raw: self.stop_recording(raw),
226        }))
227    }
228    fn read_string(&mut self) -> Result<Option<Token>> {
229        [StringType::SingleQuoted, StringType::DoubleQuoted]
230            .into_iter()
231            .find_map(|typ| self.read_string_of(typ).transpose())
232            .transpose()
233    }
234    fn read_string_of(&mut self, typ: StringType) -> Result<Option<Token>> {
235        let quote_char = match typ {
236            StringType::SingleQuoted => '\'',
237            StringType::DoubleQuoted => '"',
238        };
239        if self.reader.peek() != Some(&quote_char) {
240            return Ok(None);
241        }
242        let raw = self.start_recording();
243        self.skip_char(quote_char)?;
244        let mut contents = String::new();
245        while let Some(&c) = self.reader.peek() {
246            if c == quote_char {
247                break;
248            }
249            self.next().unwrap();
250            if c == '\\' {
251                contents.push(match self.next() {
252                    None => return error!("Expected escaped character, got EOF"),
253                    Some('n') => '\n',
254                    Some('r') => '\r',
255                    Some('t') => '\t',
256                    Some('\\') => '\\',
257                    Some('x') => {
258                        let mut read_digit = || match self.next() {
259                            Some(c) => match c.to_digit(16) {
260                                Some(digit) => Ok(digit),
261                                None => error!("Expected a hex digit, got {c:?}"),
262                            },
263                            None => error!("Expected a hex digit, got EOF"),
264                        };
265                        let digit1 = read_digit()?;
266                        let digit2 = read_digit()?;
267                        let char_code = digit1 * 16 + digit2;
268                        char::from_u32(char_code)
269                            .ok_or(error_fmt!("{char_code:?} is not a valid char code"))?
270                    }
271                    Some(c) => c,
272                });
273            } else {
274                contents.push(c);
275            }
276        }
277        self.skip_char(quote_char)?;
278        Ok(Some(Token::String {
279            raw: self.stop_recording(raw),
280            contents,
281            typ,
282        }))
283    }
284    fn read_ident(&mut self) -> Result<Option<Token>> {
285        let peeked = match self.reader.peek() {
286            Some(&c) => c,
287            None => return Ok(None),
288        };
289        match peeked {
290            '@' => {
291                let raw = self.start_recording();
292                self.next().unwrap();
293                let Some(Token::String { contents: name, .. }) = self.read_string()? else {
294                    return error!("Expected a string token after '@' for raw identifier");
295                };
296                Ok(Some(Token::Ident {
297                    name,
298                    raw: self.stop_recording(raw),
299                    is_raw: true,
300                }))
301            }
302            c if c.is_alphabetic() || c == '_' => {
303                let mut name = String::new();
304                while let Some(&c) = self.reader.peek() {
305                    let is_good = |c: char| c.is_alphanumeric() || c == '_';
306                    if is_good(c) || c == '-' && self.reader.peek2().map_or(false, |&c| is_good(c))
307                    {
308                        name.push(c);
309                        self.next().unwrap();
310                    } else {
311                        break;
312                    }
313                }
314                Ok(Some(Token::Ident {
315                    raw: name.clone(),
316                    name,
317                    is_raw: false,
318                }))
319            }
320            _ => Ok(None),
321        }
322    }
323    fn read_number(&mut self) -> Result<Option<Token>> {
324        let peeked = match self.reader.peek() {
325            Some(&c) => c,
326            None => return Ok(None),
327        };
328        if !peeked.is_ascii_digit() {
329            return Ok(None);
330        }
331        let mut seen_dot = false;
332        let raw = self.read_while(|c| {
333            c.is_ascii_digit() || c == '.' && !std::mem::replace(&mut seen_dot, true) || c == '_'
334        })?;
335        Ok(Some(Token::Number { raw }))
336    }
337    fn read_punctuation(&mut self) -> Result<Option<Token>> {
338        let is_single_punctuation = |c: char| "(){}[]".contains(c);
339        let is_single_char_punctuation = |c: char| ";".contains(c);
340        match self.reader.peek() {
341            Some(&first) if is_punctuation(first) => {
342                if is_single_punctuation(first) {
343                    self.next().unwrap();
344                    Ok(Some(Token::Punctuation {
345                        raw: first.to_string(),
346                    }))
347                } else if is_single_char_punctuation(first) {
348                    let raw = self.read_while(|c| c == first)?;
349                    Ok(Some(Token::Punctuation { raw }))
350                } else {
351                    let raw = self.read_while(|c| {
352                        is_punctuation(c)
353                            && !is_single_punctuation(c)
354                            && !is_single_char_punctuation(c)
355                    })?;
356                    Ok(Some(Token::Punctuation { raw }))
357                }
358            }
359            _ => Ok(None),
360        }
361    }
362}
363
/// Returns `true` if `c` can be part of a punctuation token.
///
/// That is every character that is not alphanumeric, not whitespace, and not
/// one of `_`, `'`, `"` or `@`.
pub fn is_punctuation(c: char) -> bool {
    let starts_other_token = c.is_alphanumeric() || matches!(c, '_' | '\'' | '"' | '@');
    !starts_other_token && !c.is_whitespace()
}
367
/// A `Token` together with the source span it was read from.
#[derive(Debug)]
pub struct SpannedToken {
    /// The lexed token.
    pub token: Token,
    /// Filename plus start and end positions recorded around the token while lexing.
    pub span: Span,
}
373
374impl peek2::ReadableItem for SpannedToken {
375    fn advance_position(&self) -> peek2::AdvancePosition {
376        peek2::AdvancePosition::SetTo(self.span.start)
377    }
378}
379
380impl std::ops::Deref for SpannedToken {
381    type Target = Token;
382    fn deref(&self) -> &Self::Target {
383        &self.token
384    }
385}
386
387pub fn lex(source: SourceFile) -> Result<peek2::Reader<SpannedToken>, Error> {
388    let filename = source.filename.clone();
389    let mut lexer = Lexer {
390        next_recording_id: 0,
391        recordings: HashMap::new(),
392        reader: peek2::Reader::read(source),
393    };
394    let mut tokens = Vec::new();
395    loop {
396        let token = lexer.next_token()?;
397        let eof = token.token.is_eof();
398        tokens.push(token);
399        if eof {
400            break;
401        }
402    }
403    Ok(peek2::Reader::new(filename, tokens))
404}