june_lang/
scanner.rs

1use crate::token::*;
2use std::io;
3use std::iter;
4use std::result;
5use std::string;
6use thiserror::Error;
7
8#[derive(Error, Debug, Clone, PartialEq)]
9pub enum Error {
10    #[error("io: {0}")]
11    IOError(String),
12    #[error("invalid identifier: {0}")]
13    UTF8Error(#[from] string::FromUtf8Error),
14    #[error("unexpected eof")]
15    UnexpectedEOF,
16    #[error("invalid {0}")]
17    InvalidToken(String),
18    #[error("unknown token: {0}")]
19    UnknownToken(u8),
20    #[error("invalid int: {0}")]
21    IntError(String),
22}
23
24impl From<&io::Error> for Error {
25    fn from(err: &io::Error) -> Error {
26        Error::IOError(format!("{}", err))
27    }
28}
29
30type Result<T> = result::Result<T, Error>;
31
/// Tokenizer that reads its input one byte at a time from a buffered reader.
pub struct Scanner<R>
where
    R: io::BufRead,
{
    /// Input bytes wrapped so the next item can be inspected before it is
    /// consumed.
    bytes: iter::Peekable<io::Bytes<R>>,
}
35
/// Returns `true` when `b` ends an identifier or number, i.e. when it is
/// neither an ASCII letter, an ASCII digit, nor an underscore.
fn is_delim(b: u8) -> bool {
    let is_word_byte = b == b'_' || b.is_ascii_alphanumeric();
    !is_word_byte
}
39
40impl<R: io::BufRead> Scanner<R> {
41    fn peek(&mut self) -> Option<Result<u8>> {
42        let peeked = self.bytes.peek()?;
43        let result = peeked.as_ref().map_err(Error::from).map(|ch| *ch);
44        Some(result)
45    }
46
47    fn advance(&mut self) -> Result<u8> {
48        let result = self.bytes.next().ok_or(Error::UnexpectedEOF)?;
49        result.map_err(|err| (&err).into())
50    }
51
52    fn advance_while(&mut self, f: impl Fn(u8) -> bool) -> Result<String> {
53        let mut buf = Vec::new();
54        while let Some(value) = self.peek() {
55            if !f(value?) {
56                break;
57            }
58            buf.push(self.advance().unwrap());
59        }
60        Ok(String::from_utf8(buf)?)
61    }
62
63    fn eat(&mut self, got: &[u8], want: impl ToString) -> Result<()> {
64        for ch in got {
65            if self.peek() != Some(Ok(*ch)) {
66                return Err(Error::InvalidToken(want.to_string()));
67            }
68            self.advance().unwrap();
69        }
70        Ok(())
71    }
72
73    fn eat_as(&mut self, s: &[u8], tok: Token) -> Result<Token> {
74        self.eat(s, &tok)?;
75        Ok(tok)
76    }
77
78    fn str(&mut self) -> Result<Token> {
79        self.eat(b"\"", "Str")?;
80        let text = self.advance_while(|b| b != b'"')?;
81        self.eat(b"\"", "Str")?;
82        Ok(Token::Str(text))
83    }
84
85    fn int(&mut self) -> Result<Token> {
86        let text = self.advance_while(|b| !is_delim(b))?;
87        let int = text.parse::<i64>().map_err(|_| Error::IntError(text))?;
88        Ok(Token::Int(int))
89    }
90
91    fn keyword_or_ident(&mut self) -> Result<Token> {
92        let text = self.advance_while(|b| !is_delim(b))?;
93        let tok = match text.as_str() {
94            "fn" => Token::Fn,
95            "let" => Token::Let,
96            _ => Token::Ident(text),
97        };
98        Ok(tok)
99    }
100
101    fn skip_whitespace(&mut self) {
102        while matches!(self.peek(), Some(Ok(b' ' | b'\n'))) {
103            self.advance().unwrap();
104        }
105    }
106}
107
108impl<R: io::BufRead> iter::Iterator for Scanner<R> {
109    type Item = result::Result<Token, Error>;
110
111    fn next(&mut self) -> Option<Self::Item> {
112        self.skip_whitespace();
113        let result = self.peek()?.and_then(|b| match b {
114            b'+' => self.eat_as(b"+", Token::Op(Op::Plus)),
115            b'=' => self.eat_as(b"=", Token::Eq),
116            b'(' => self.eat_as(b"(", Token::Lparen),
117            b')' => self.eat_as(b")", Token::Rparen),
118            b'{' => self.eat_as(b"{", Token::Lbrace),
119            b'}' => self.eat_as(b"}", Token::Rbrace),
120            b',' => self.eat_as(b",", Token::Comma),
121            b';' => self.eat_as(b";", Token::Semi),
122            b':' => self.eat_as(b":", Token::Colon),
123            b'"' => self.str(),
124            b if b.is_ascii_digit() => self.int(),
125            b if b.is_ascii_alphabetic() => self.keyword_or_ident(),
126            b => Err(Error::UnknownToken(b)),
127        });
128        Some(result)
129    }
130}
131
132pub fn scan<R: io::BufRead>(r: R) -> Scanner<R> {
133    Scanner { bytes: r.bytes().peekable() }
134}
135
#[cfg(test)]
mod tests {
    use super::*;

    /// Scans `src` to the end, returning all tokens or the first error.
    fn scan_all(src: &[u8]) -> Result<Vec<Token>> {
        let mut toks = Vec::new();
        for tok in scan(src) {
            toks.push(tok?);
        }
        Ok(toks)
    }

    /// Whitespace-only input produces no tokens.
    #[test]
    fn test_empty() {
        let toks = scan_all(b"\n        ").unwrap();
        assert_eq!(Vec::<Token>::new(), toks);
    }

    /// A small function definition scans into the expected token stream.
    #[test]
    fn test() {
        use Token::*;

        // Shorthand for building identifier tokens in the expectation below.
        let ident = |name: &str| Ident(String::from(name));

        let input = b"
            fn foo(bar: int, baz: str) {
                println(\"hello, world\", 27);
                {
                    let foo: int = 7;
                }
            }
        ";
        let expected = vec![
            // fn foo(bar: int, baz: str) {
            Fn,
            ident("foo"),
            Lparen,
            ident("bar"),
            Colon,
            ident("int"),
            Comma,
            ident("baz"),
            Colon,
            ident("str"),
            Rparen,
            Lbrace,
            // println("hello, world", 27);
            ident("println"),
            Lparen,
            Str(String::from("hello, world")),
            Comma,
            Int(27),
            Rparen,
            Semi,
            // { let foo: int = 7; }
            Lbrace,
            Let,
            ident("foo"),
            Colon,
            ident("int"),
            Eq,
            Int(7),
            Semi,
            Rbrace,
            // }
            Rbrace,
        ];

        let toks = scan_all(input).unwrap();
        assert_eq!(expected, toks);
    }
}