1use std::{iter::Peekable, str::CharIndices};
2
3use ordered_float::NotNan;
4
5pub type Tok<'input> = Token<&'input str>;
6pub type SpannedResult<'input, Loc> = Result<Spanned<'input, Loc>, Error>;
7pub type Spanned<'input, Loc> = (Loc, Tok<'input>, Loc);
8
9#[derive(Clone, PartialEq, Eq, Debug)]
10pub enum Token<S> {
11 LRule,
12 RRule,
13 LBracket,
14 RBracket,
15 Colon,
16 LParen,
17 RParen,
18 Dot,
19 Comma,
20 Null,
21 True,
22 False,
23
24 Sign(S),
25
26 IntegerLiteral(i64),
27 FloatLiteral(NotNan<f64>),
28 StringLiteral(String),
29 Identifier(S),
30 ExtendedIdentifier(S),
31 Invalid(char),
32}
33
34#[derive(thiserror::Error, Clone, Debug, PartialEq, Eq)]
35pub enum Error {
36 #[error("invalid literal")]
37 Literal { start: usize },
38
39 #[error("invalid numeric literal '{}'", .0)]
40 NumericLiteral(String),
41
42 #[error("invalid escape literal '{}'", .0)]
43 InvalidEscape(String),
44}
45
/// A lexer that walks an input string and yields spanned tokens through its `Iterator`
/// implementation.
///
/// `Debug` and `Clone` are derived so a lexer can be inspected in diagnostics and
/// snapshotted for lookahead; both fields already support these traits.
#[derive(Debug, Clone)]
pub struct Lexer<'input> {
    /// The complete input; token payloads are slices of this string.
    input: &'input str,
    /// Cursor over the `(byte offset, char)` pairs of `input`, with one item of lookahead.
    chars: Peekable<CharIndices<'input>>,
}
50
51impl<'input> Iterator for Lexer<'input> {
56 type Item = SpannedResult<'input, usize>;
57
58 fn next(&mut self) -> Option<Self::Item> {
59 use Token::*;
60
61 loop {
62 if let Some((start, ch)) = self.bump() {
63 let result = match ch {
64 '%' if self.test_peek(|ch| ch == '{') => {
65 self.bump();
66 Some(Ok(self.token(start, LRule)))
67 }
68 '}' => Some(Ok(self.token(start, RRule))),
69 '[' => Some(Ok(self.token(start, LBracket))),
70 ']' => Some(Ok(self.token(start, RBracket))),
71 '(' => Some(Ok(self.token(start, LParen))),
72 ')' => Some(Ok(self.token(start, RParen))),
73
74 '.' if self.test_peek(is_digit) => Some(self.numeric_literal(start)),
75 '.' => Some(Ok(self.token(start, Dot))),
76 ':' => Some(Ok(self.token(start, Colon))),
77 ',' => Some(Ok(self.token(start, Comma))),
78
79 '"' => Some(self.string_literal(start)),
80
81 '+' => Some(Ok(self.token(start, Sign("+")))),
82 '-' => Some(Ok(self.token(start, Sign("-")))),
83 ch if is_ident_start(ch) => Some(Ok(self.identifier(start))),
84 ch if is_digit(ch) => Some(self.numeric_literal(start)),
85
86 ch if ch.is_whitespace() => continue,
87
88 ch => Some(Ok(self.token(start, Invalid(ch)))),
89 };
90
91 return result;
92 } else {
93 return None;
94 }
95 }
96 }
97}
98
99impl<'input> Lexer<'input> {
104 pub fn new(input: &'input str) -> Lexer<'input> {
105 Self {
106 input,
107 chars: input.char_indices().peekable(),
108 }
109 }
110
111 fn bump(&mut self) -> Option<(usize, char)> {
112 self.chars.next()
113 }
114
115 fn peek(&mut self) -> Option<(usize, char)> {
116 self.chars.peek().copied()
117 }
118
119 fn take_while<F>(&mut self, start: usize, mut keep_going: F) -> (usize, &'input str)
120 where
121 F: FnMut(char) -> bool,
122 {
123 self.take_until(start, |c| !keep_going(c))
124 }
125
126 fn take_until<F>(&mut self, start: usize, mut terminate: F) -> (usize, &'input str)
127 where
128 F: FnMut(char) -> bool,
129 {
130 while let Some((end, ch)) = self.peek() {
131 if terminate(ch) {
132 return (end, self.slice(start, end));
133 } else {
134 self.bump();
135 }
136 }
137
138 let loc = self.next_index();
139
140 (loc, self.slice(start, loc))
141 }
142
143 fn test_peek<F>(&mut self, mut test: F) -> bool
144 where
145 F: FnMut(char) -> bool,
146 {
147 self.peek().is_some_and(|(_, ch)| test(ch))
148 }
149
150 fn slice(&self, start: usize, end: usize) -> &'input str {
151 &self.input[start..end]
152 }
153
154 fn next_index(&mut self) -> usize {
155 self.peek().as_ref().map_or(self.input.len(), |l| l.0)
156 }
157
158 fn token(&mut self, start: usize, token: Token<&'input str>) -> Spanned<'input, usize> {
159 let end = self.next_index();
160 self.token2(start, end, token)
161 }
162
163 fn token2(
164 &mut self,
165 start: usize,
166 end: usize,
167 token: Token<&'input str>,
168 ) -> Spanned<'input, usize> {
169 (start, token, end)
170 }
171
172 fn string_literal(&mut self, start: usize) -> SpannedResult<'input, usize> {
173 let content_start = self.next_index();
174
175 loop {
176 let scan_start = self.next_index();
177 self.take_until(scan_start, |c| c == '"' || c == '\\');
178
179 match self.bump() {
180 Some((_, '\\')) => self.bump(),
181 Some((end, '\"')) => {
182 let content = unescape_string_literal(self.slice(content_start, end))?;
183 let end = self.next_index();
184
185 return Ok((start, Token::StringLiteral(content), end));
186 }
187 _ => break,
188 };
189 }
190
191 Err(Error::Literal { start })
192 }
193
194 fn identifier(&mut self, start: usize) -> Spanned<'input, usize> {
195 use Token::*;
196
197 let (end, ident) = self.take_while(start, is_ident_continue);
198
199 let token = match ident {
200 "true" => True,
201 "false" => False,
202 "null" => Null,
203
204 _ if ident.contains('@') || ident.contains('-') => ExtendedIdentifier(ident),
205 _ => Identifier(ident),
206 };
207
208 (start, token, end)
209 }
210
211 fn numeric_literal(&mut self, start: usize) -> SpannedResult<'input, usize> {
212 let mut is_float = false;
213 let (end, num) = self.take_while(start, |ch| {
214 is_digit(ch) || {
215 let is_float_symbol = is_float_literal_symbol(ch);
216 if is_float_symbol {
217 is_float = true;
218 }
219 is_float_symbol
220 }
221 });
222
223 if is_float || num.starts_with('.') {
224 num.parse()
225 .map_err(|_| Error::NumericLiteral(num.to_string()))
226 .map(|n| (start, Token::FloatLiteral(n), end))
227 } else {
228 num.parse()
229 .map_err(|_| Error::NumericLiteral(num.to_string()))
230 .map(|n| (start, Token::IntegerLiteral(n), end))
231 }
232 }
233}
234
/// Returns `true` for the non-digit characters accepted inside a numeric literal:
/// `e`, `E`, `-`, `+` and `.`.
fn is_float_literal_symbol(ch: char) -> bool {
    const FLOAT_SYMBOLS: [char; 5] = ['e', 'E', '-', '+', '.'];

    FLOAT_SYMBOLS.contains(&ch)
}
238
/// Returns `true` when `ch` may begin an identifier: an ASCII letter, `$`, `@` or `_`.
fn is_ident_start(ch: char) -> bool {
    ch.is_ascii_alphabetic() || matches!(ch, '$' | '@' | '_')
}
242
243fn is_ident_continue(ch: char) -> bool {
244 match ch {
245 '0'..='9' => true,
246 '-' => true,
247 ch => is_ident_start(ch),
248 }
249}
250
/// Returns `true` when `ch` is one of the ASCII digits `0` through `9`.
fn is_digit(ch: char) -> bool {
    matches!(ch, '0'..='9')
}
254
255fn unescape_string_literal(mut s: &str) -> Result<String, Error> {
256 let mut string = String::with_capacity(s.len());
257 while let Some(i) = s.bytes().position(|b| b == b'\\') {
258 if s.len() > i + 2 {
259 let c = match s.as_bytes()[i..i + 3] {
260 [b'\\', b'\\', b'n'] => '\n',
262 [b'\\', b'\\', b'r'] => '\r',
263 [b'\\', b'\\', b't'] => '\t',
264 _ => '\0',
265 };
266 if c != '\0' {
267 string.push_str(&s[..i]);
268 string.push(c);
269 s = &s[i + 3..];
270 continue;
271 }
272 }
273 if s.len() > i + 1 {
274 let c = match s.as_bytes()[i + 1] {
275 b'\'' => '\'',
276 b'"' => '"',
277 b'\\' => '\\',
278 _ => return Err(Error::InvalidEscape(s.to_owned())),
279 };
280 string.push_str(&s[..i]);
281 string.push(c);
282 s = &s[i + 2..];
283 }
284 }
285
286 string.push_str(s);
287 Ok(string)
288}
289
/// The textual pieces of a floating-point literal.
///
/// Derives the same comparison and diagnostic traits as the other public types in this
/// file so values can be printed, cloned and compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FloatingPointLiteral<'input> {
    /// Text of the integral part, written before the decimal point.
    pub integral: Option<&'input str>,
    /// Text of the fractional part, written after the decimal point.
    pub fraction: Option<&'input str>,
    /// The exponent, if the literal has one.
    pub exponent: Option<Exponent<'input>>,
}

/// The exponent of a [`FloatingPointLiteral`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Exponent<'input> {
    /// Text of the exponent's sign, if one was written.
    pub sign: Option<&'input str>,
    /// Text of the exponent's value.
    pub value: &'input str,
}
300
301#[allow(dead_code)] impl FloatingPointLiteral<'_> {
303 pub fn parse(&self) -> f64 {
304 let mut fp = String::new();
305 fp.push_str(self.integral.unwrap_or_default());
306 if let Some(f) = &self.fraction {
307 fp.push('.');
308 fp.push_str(f);
309 }
310
311 if let Some(exp) = &self.exponent {
312 fp.push('e');
313 fp.push_str(exp.sign.unwrap_or_default());
314 fp.push_str(exp.value);
315 }
316
317 fp.parse().map_err(|_| Error::NumericLiteral(fp)).unwrap()
318 }
319}