tokenizer/
tokenizer.rs

1use syntax_rs::{
2    parse::{Parse, ParseStream},
3    simple_tok_spanned, spec, Span, Spanned,
4};
5use unicode_xid::UnicodeXID;
6
7#[derive(Debug)]
8struct LitInt {
9    value: i64,
10}
11
12impl Parse for LitInt {
13    fn parse(stream: &mut ParseStream) -> syntax_rs::Result<Self> {
14        fn to_u32(chars: &str) -> syntax_rs::Result<u32> {
15            if chars.len() == 0 {
16                return Err("Expected integer.");
17            }
18            let mut number = 0;
19
20            // We don't need to do .chars() here because we are only dealing with numbers.
21            for c in chars.as_bytes() {
22                let digit: u32 = *c as u32 - 0x30;
23                number *= 10;
24                number += digit;
25            }
26            Ok(number)
27        }
28
29        stream.try_parse(|stream| {
30            let negative = stream.cur().consume('-');
31            let mut value = to_u32(stream.cur().advance_while(|c| c.is_digit(10)))? as i64;
32            if negative {
33                value *= -1;
34            }
35
36            Ok(LitInt { value })
37        })
38    }
39}
40
41#[derive(Debug)]
42struct LitStr {
43    val: String,
44}
45
46impl Parse for LitStr {
47    fn parse(stream: &mut ParseStream) -> syntax_rs::Result<Self> {
48        stream.parse::<Quote>()?;
49
50        let inside = stream.try_parse(|stream| {
51            Ok(LitStr {
52                val: String::from(stream.cur().advance_while(|c| c != '\"')),
53            })
54        });
55
56        stream.parse::<Quote>()?;
57        inside
58    }
59}
60
61#[derive(Debug)]
62enum Literal {
63    Int(LitInt),
64    String(LitStr),
65}
66
67impl Parse for Literal {
68    fn parse(stream: &mut ParseStream) -> syntax_rs::Result<Self> {
69        if let Ok(lit_int) = stream.parse::<LitInt>() {
70            Ok(Literal::Int(lit_int))
71        } else if let Ok(lit_str) = stream.parse::<LitStr>() {
72            Ok(Literal::String(lit_str))
73        } else {
74            Err("Expected integer or str.")
75        }
76    }
77}
78
// Declares the `Quote` token type for the text `"`; `LitStr::parse` parses it as
// the opening and closing delimiter of a string literal.
// NOTE(review): going by the macro name the generated type presumably also
// carries a span — confirm against the syntax_rs documentation.
simple_tok_spanned!(Quote, "\"");
80
81#[derive(Debug)]
82struct Ident {
83    string: String,
84    span: Span,
85}
86
87impl Spanned for Ident {
88    fn span(&self) -> Span {
89        self.span
90    }
91
92    fn span_ref_mut(&mut self) -> &mut Span {
93        &mut self.span
94    }
95}
96
97impl Parse for Ident {
98    fn parse(stream: &mut ParseStream) -> syntax_rs::Result<Self> {
99        stream.try_parse(|stream| {
100            let snap = stream.snapshot();
101            let mut first_c = false;
102            let slice = stream.cur().advance_while(|c| {
103                if first_c {
104                    first_c = false;
105                    UnicodeXID::is_xid_start(c)
106                } else {
107                    UnicodeXID::is_xid_continue(c)
108                }
109            });
110            if slice.is_empty() {
111                Err("Expected identifier.")
112            } else {
113                Ok(Ident {
114                    string: String::from(slice),
115                    span: stream.since(snap),
116                })
117            }
118        })
119    }
120}
121
122#[derive(Debug)]
123enum Symbol {
124    KwFunction,
125    KwLet,
126    KwIf,
127    Ident(Ident),
128}
129
130impl Parse for Symbol {
131    fn parse(stream: &mut ParseStream) -> syntax_rs::Result<Self> {
132        let ident: Ident = stream
133            .parse()
134            .map_err(|_e| "Expected identifier, `function`, `let` or `if`.")?;
135
136        Ok(match ident.string.as_str() {
137            "fun" => Symbol::KwFunction,
138            "let" => Symbol::KwLet,
139            "if" => Symbol::KwIf,
140            _ => Symbol::Ident(ident),
141        })
142    }
143}
144
145#[derive(Debug)]
146enum Punctuation {
147    Plus,
148    Minus,
149    Star,
150    Slash,
151}
152
153impl Parse for Punctuation {
154    fn parse(stream: &mut ParseStream) -> syntax_rs::Result<Self> {
155        stream.try_parse(|stream| {
156            Ok(
157                match stream
158                    .cur()
159                    .advance()
160                    .ok_or("Expected `+`, `-`, `*` or `/`.")?
161                {
162                    '+' => Punctuation::Plus,
163                    '-' => Punctuation::Minus,
164                    '*' => Punctuation::Star,
165                    '/' => Punctuation::Slash,
166                    _ => return Err("Expected `+`, `-`, `*` or `/`."),
167                },
168            )
169        })
170    }
171}
172
173// NOTE: A #[derive(Parse)] and #[derive(Spanned)] will probably be added in the future.
174#[derive(Debug)]
175enum Token {
176    Punctuation(Punctuation),
177    Literal(Literal),
178    Symbol(Symbol),
179}
180
181impl Parse for Token {
182    fn parse(stream: &mut ParseStream) -> syntax_rs::Result<Self> {
183        stream.skip_all(spec::is_whitespace);
184        if let Ok(lit) = stream.parse::<Literal>() {
185            Ok(Token::Literal(lit))
186        } else if let Ok(punctuation) = stream.parse::<Punctuation>() {
187            Ok(Token::Punctuation(punctuation))
188        } else if let Ok(symbol) = stream.parse::<Symbol>() {
189            Ok(Token::Symbol(symbol))
190        } else {
191            Err("Expected punctuation, symbol or literal.")
192        }
193    }
194}
195
196const CODE: &str = "1+2+3+4+5+6+7+8+9+10";
197
198fn main() {
199    println!("{:?}", syntax_rs::exhaustive_parse::<Token>(CODE));
200}