1mod error;
2
3use std::ops::Range;
4
5use chumsky::prelude::*;
6use error::LexerError;
7use tokens::Token;
8use uuid::Uuid;
9
10pub fn scan(input: &str) -> Result<Vec<(Token, Range<usize>)>, LexerError> {
11 lexer().then_ignore(end()).parse(input).map_err(LexerError)
12}
13
14pub fn lexer() -> impl Parser<char, Vec<(Token, Range<usize>)>, Error = Simple<char>> {
15 let comment = recursive(|comment| {
16 none_of("()")
17 .repeated()
18 .at_least(1)
19 .collect::<String>()
20 .or(comment)
21 .repeated()
22 .delimited_by(just('('), just(')'))
23 .map(|s| format!("({})", s.join("")))
24 })
25 .map(Token::Comment);
26 let identifier = ident().map(Token::Ident);
27 let int = just('-')
28 .or_not()
29 .chain::<char, _, _>(text::int(10))
30 .collect::<String>()
31 .map(|s: String| Token::Int(s.parse().unwrap()));
32 let escape = just('\\').ignore_then(
33 just('\\')
34 .or(just('"'))
35 .or(just('n').to('\n'))
36 .or(just('r').to('\r'))
37 .or(just('t').to('\t')),
38 );
39 let string = just('"')
40 .ignore_then(filter(|c| *c != '\\' && *c != '"').or(escape).repeated())
41 .then_ignore(just('"'))
42 .collect::<String>()
43 .map(|s| {
44 s.replace(r#"\\"#, r#"\"#)
45 .replace(r#"\""#, "\"")
46 .replace(r#"\n"#, "\n")
47 .replace(r#"\r"#, "\r")
48 .replace(r#"\t"#, "\t")
49 })
50 .map(Token::Str);
51 let symbol = just('$')
52 .to(Token::Let)
53 .or(just('~').to(Token::In))
54 .or(just('/').to(Token::Divide))
55 .or(just('!').to(Token::Perform))
56 .or(just(':').to(Token::TypeAnnotation))
57 .or(just('%').to(Token::Trait))
58 .or(just('#').to(Token::Attribute))
59 .or(just('^').to(Token::FromHere))
60 .or(just('+').to(Token::Sum))
61 .or(just('*').to(Token::Product))
62 .or(just(',').to(Token::Comma))
63 .or(just('.').to(Token::Dot))
64 .or(just('>').to(Token::Apply))
65 .or(just('[').to(Token::ArrayBegin))
66 .or(just(']').to(Token::ArrayEnd))
67 .or(just('{').to(Token::SetBegin))
68 .or(just('}').to(Token::SetEnd))
69 .or(just('?').to(Token::Hole))
70 .or(just('_').to(Token::Infer))
71 .or(just('\\').to(Token::Lambda))
72 .or(just('&').to(Token::Reference))
73 .or(just('<').chain(just('!')).to(Token::Continue))
74 .or(just('-').chain(just('>')).to(Token::Arrow))
75 .or(just('=').chain(just('>')).to(Token::EArrow))
76 .or(just('-').to(Token::Minus));
77 let special = just('\'')
78 .ignore_then(text::ident())
79 .try_map(|ident: String, span| match ident.as_str() {
80 "module" => Ok(Token::Include),
81 "import" => Ok(Token::Import),
82 "export" => Ok(Token::Export),
83 "number" => Ok(Token::NumberType),
84 "string" => Ok(Token::StringType),
85 "alias" => Ok(Token::Alias),
86 "brand" => Ok(Token::Brands),
87 "type" => Ok(Token::Type),
88 "this" => Ok(Token::This),
89 "handle" => Ok(Token::Handle),
90 "card" => Ok(Token::Card),
91 "a" => Ok(Token::A),
92 _ => Err(Simple::custom(
93 span,
94 format!(r#"undefined special keyword: {}"#, ident),
95 )),
96 });
97 let brand = just('@').ignore_then(ident()).map(Token::Brand);
98 let uuid = just("'uuid")
99 .then_ignore(text::whitespace())
100 .ignore_then(
101 one_of("0123456789abcdefABCDEF")
102 .repeated()
103 .at_least(4)
104 .separated_by(just('-'))
105 .at_least(1),
106 )
107 .flatten()
108 .collect::<String>()
109 .map(|uuid| uuid.parse::<Uuid>())
110 .try_map(|uuid, span| match uuid {
111 Ok(uuid) => Ok(Token::Uuid(uuid)),
112 Err(_) => Err(Simple::custom(span, "invalid uuid")),
113 });
114 let token = comment
115 .or(uuid)
116 .or(int)
117 .or(string)
118 .or(symbol)
119 .or(special)
120 .or(brand)
121 .or(identifier);
122 let semicolon = just(';').to(()).map_with_span(|_, span: Range<usize>| {
123 vec![(Token::Dot, span.clone()), (Token::Comma, span)]
124 });
125 token
126 .map_with_span(|token, span| (token, span))
127 .padded()
128 .repeated()
129 .at_least(1)
130 .or(semicolon)
131 .repeated()
132 .flatten()
133}
134
135pub fn ident() -> impl Parser<char, String, Error = Simple<char>> + Clone {
136 let assert_not_whitespace = |c: char, span| {
137 if c.is_whitespace() {
138 Err(Simple::custom(span, "invalid character"))
139 } else {
140 Ok(c)
141 }
142 };
143 let non_symbol =
144 none_of(r#"%@/&$<>!#*^?\[]{}_-+=;:~,.()"'1234567890"#).try_map(assert_not_whitespace);
145 let non_symbol_2 = none_of(r#"%/&$<>!#*^?\[]{}+=;:~,.()"#).try_map(assert_not_whitespace);
147
148 non_symbol
149 .chain::<char, _, _>(non_symbol_2.repeated())
150 .collect()
151 .separated_by(text::whitespace())
152 .at_least(1)
153 .map(|ident: Vec<String>| ident.join(" "))
154}
155
#[cfg(test)]
mod tests {
    use super::*;

    // Lexes a snippet exercising every major token category (nested comments,
    // symbols, `'` keywords, `@` brands, ints, identifiers) and checks the
    // exact token sequence; spans are dropped before comparing.
    #[test]
    fn all_syntax() {
        let tokens = lexer()
            .parse(
                r#"
            (defines < 'number -> @incremented 'number >)
            $ \x -> ^ \ 'number, 'number -> @added 'number >
            1, x (1 + x): @incremented 'number > ~
            (increments a value which is padded later)
            \ 'number -> @incremented 'number > ?.
            "#,
            )
            .unwrap()
            .into_iter()
            .map(|(token, _)| token)
            .collect::<Vec<_>>();

        use Token::*;
        assert_eq!(
            tokens,
            vec![
                Comment("(defines < 'number -> @incremented 'number >)".into()),
                Let,
                Lambda,
                Ident("x".into()),
                Arrow,
                FromHere,
                Lambda,
                NumberType,
                Comma,
                NumberType,
                Arrow,
                Brand("added".into()),
                NumberType,
                Apply,
                Int(1),
                Comma,
                Ident("x".into()),
                Comment("(1 + x)".into()),
                TypeAnnotation,
                Brand("incremented".into()),
                NumberType,
                Apply,
                In,
                Comment("(increments a value which is padded later)".into()),
                Lambda,
                NumberType,
                Arrow,
                Brand("incremented".into()),
                NumberType,
                Apply,
                Hole,
                Dot,
            ]
        )
    }

    // Internal runs of whitespace (tab, newline) collapse to a single space
    // inside one identifier; the span excludes the surrounding padding.
    #[test]
    fn ident_with_spaces() {
        assert_eq!(
            lexer().parse(" the\t\nnumber of apples ").unwrap(),
            vec![(Token::Ident("the number of apples".into()), 1..23)]
        );
    }

    // Non-ASCII word heads are accepted; `-`, digits may appear after the
    // first character of a word.
    #[test]
    fn ident_utf8() {
        assert_eq!(
            lexer().parse("あ- a0").unwrap(),
            vec![(Token::Ident("あ- a0".into()), 0..5)]
        );
    }

    // A brand is `@` followed by a full (possibly multi-word) identifier.
    #[test]
    fn brand_with_spaces() {
        assert_eq!(
            lexer()
                .parse(
                    r#"
            "\\\n\""
            "#
                )
                .unwrap(),
            vec![(Token::Str("\\\n\"".into()), 13..21)]
        );
    }

    // `\\`, `\n` and `\"` escapes resolve to backslash, newline and quote.
    #[test]
    fn string_with_escape() {
        assert_eq!(
            lexer()
                .parse(
                    r#"
            "\\\n\""
            "#
                )
                .unwrap(),
            vec![(Token::Str("\\\n\"".into()), 13..21)]
        );
    }

    // `;` expands to Dot + Comma, both carrying the semicolon's span.
    #[test]
    fn semicolon_to_comma_dot() {
        assert_eq!(
            lexer().then_ignore(end()).parse("?;?").unwrap(),
            vec![
                (Token::Hole, 0..1),
                (Token::Dot, 1..2),
                (Token::Comma, 1..2),
                (Token::Hole, 2..3)
            ]
        );
    }
}