1use std::{collections::HashMap, convert::TryInto};
2
3use num_bigint::BigUint;
4use regex::{CaptureLocations, Regex};
5
6use crate::{
7 error::{Error, ErrorKind},
8 source::Location,
9 token::{self, Token},
10};
11
/// Tab stop width: a tab advances the column to the next multiple of 8
/// (see `Lexer::consume`), mirroring CPython's tokenizer.
const TABSIZE: usize = 8;

/// One lexer result: a token with its start and end locations, or a
/// lexical error.
pub type LexResult = Result<(Location, Token, Location), Error>;
15
/// Failure while decoding the body of a string/bytes literal, carrying the
/// byte offset into the body at which decoding failed.
struct ParseStringError {
    offset: usize,
}

impl ParseStringError {
    /// Wrap the failing byte offset.
    fn new(offset: usize) -> Self {
        Self { offset }
    }

    /// Byte offset into the literal body where decoding failed.
    fn offset(&self) -> usize {
        self.offset
    }
}
29
pub struct Lexer<'a> {
    // Remaining, unconsumed input.
    input: &'a str,
    // Set once Eof or a fatal error has been produced; `next` then yields None.
    exhausted: bool,
    // Current position in the input (1-based line/column, tabs expanded).
    location: Location,
    // Indentation stack of (column, alt_col) pairs; seeded with the
    // baseline (1, 1).
    indents: Vec<(usize, usize)>,
    // Tab-size-independent column (every char counts 1) used to detect
    // inconsistent tab/space mixtures.
    alt_col: usize,
    // Number of Dedent tokens still owed to the caller.
    pending: usize,
    // Nesting depth of (), [], {}; newlines inside brackets are not
    // logical newlines.
    parens: isize,
    // True at the start of a logical line, i.e. indentation must be checked.
    newline: bool,
    // Reusable capture buffer for TOKEN_RE.
    caps: CaptureLocations,
}
46
47impl<'a> Iterator for Lexer<'a> {
48 type Item = LexResult;
49
50 fn next(&mut self) -> Option<Self::Item> {
51 if !self.exhausted {
52 Some(self.get_token())
53 } else {
54 None
55 }
56 }
57}
58
59lazy_static::lazy_static! {
60 static ref TOKEN_RE: Regex = {
61 fn escape_keys<V>(map: &HashMap<&str, V>) -> String {
62 let mut tokens = map.keys().collect::<Vec<_>>();
63 tokens.sort_unstable_by(|a, b| a.len().cmp(&b.len()).reverse());
64 tokens
65 .iter()
66 .map(|x| regex::escape(x))
67 .collect::<Vec<_>>()
68 .join("|")
69 }
70 let pat = format!(
71 r#"^(?x:
72 ( # 1
73 [\ \t\f]+
74 | \\ (?: \r\n | [\r\n] )
75 | \# .*
76 )
77 | ( \r\n | [\r\n] ) # 2
78 | ( $ ) # 3
79 | ( # 4
80 (?:
81 (?: [0-9] (?: _? [0-9] )* )? \. [0-9] (?: _? [0-9] )*
82 | [0-9] (?: _? [0-9] )* \.?
83 ) [eE] [+-]? [0-9] (?: _? [0-9] )*
84 | (?: [0-9] (?: _? [0-9] )* )? \. [0-9] (?: _? [0-9] )*
85 | [0-9] (?: _? [0-9] )* \.
86 ) ( [jJ] )? # 5
87 | ( [0-9] (?: _? [0-9] )* [jJ] ) # 6
88 | ( # 7
89 (?: 0[bB] (?: _? [01] )+ )
90 | (?: 0[oO] (?: _? [0-7] )+ )
91 | (?: 0[xX] (?: _? [0-9a-fA-F] )+ )
92 | (?: [1-9] (?: _? [0-9] )* | 0+ (?: _? 0 )* )
93 )
94 | ( (?-i: rf | fr | rb | br | [rfbu] )? ) # 8
95 (?:
96 ''' ( (?s: [^\\] | \\. )*? ) ''' # 9
97 | """ ( (?s: [^\\] | \\. )*? ) """ # 10
98 | ' ( (?s: [^\\\n'] | \\. )*? ) ' # 11
99 | " ( (?s: [^\\\n"] | \\. )*? ) " # 12
100 | ( ['"] | ''' | """ ) # 13
101 )
102 | ( {symbols} ) # 14
103 | ( {keywords} ) \b # 15
104 | ( [\p{{XID_Start}}_] \p{{XID_Continue}}* ) # 16
105 )"#,
106 symbols = escape_keys(&token::SYMBOLS),
107 keywords = escape_keys(&token::KEYWORDS),
108 );
109 Regex::new(&pat).unwrap()
110 };
111
112 static ref BYTES_RE: Regex = Regex::new(
113 r#"(?x) \\ (?:
114 ( [\\'"abfnrtv] )
115 | ( [1-7]{1,3} )
116 | ( [0-9a-fA-F]{2} )
117 | \n
118 )"#).unwrap();
119 static ref STR_RE: Regex = Regex::new(
120 &[BYTES_RE.as_str(),
121 r#"| \\ (?:
122 N \{ (.*) \}
123 | u ( [0-9a-fA-F]{4} )
124 | U ( [0-9a-fA-F]{8} )
125 )"#].concat()).unwrap();
126}
127
128impl<'a> Lexer<'a> {
129 pub fn new(input: &str) -> Lexer {
130 Lexer {
131 input,
132 exhausted: false,
133 location: Location::new(1, 1),
134 indents: vec![(1, 1)],
135 alt_col: 1,
136 pending: 0,
137 parens: 0,
138 newline: true,
139 caps: TOKEN_RE.capture_locations(),
140 }
141 }
142
143 fn consume(&mut self, nbytes: usize) -> &str {
144 let (consumed, rest) = self.input.split_at(nbytes);
145 self.input = rest;
146 let mut bytes = consumed.bytes().peekable();
147 while let Some(c) = bytes.next() {
148 match c {
149 b'\r' => {
150 if bytes.peek() == Some(&&b'\n') {
151 continue;
152 } else {
153 self.location = Location::new(self.location.line + 1, 1);
154 self.alt_col = 1;
155 }
156 }
157 b'\n' => {
158 self.location = Location::new(self.location.line + 1, 1);
159 self.alt_col = 1;
160 }
161 b'\t' => {
162 self.location = Location::new(
163 self.location.line,
164 self.location.column + TABSIZE - ((self.location.column - 1) % TABSIZE),
165 );
166 self.alt_col += 1;
167 }
168 _ => {
169 self.location = Location::new(self.location.line, self.location.column + 1);
170 self.alt_col += 1;
171 }
172 }
173 }
174 consumed
175 }
176
    /// Whether capture group `nth` participated in the last TOKEN_RE match.
    fn matches(&self, nth: usize) -> bool {
        self.caps.get(nth).is_some()
    }
180
    /// Produce the next token (with its start/end locations) or a lexical
    /// error. Dedents owed from a previous indentation drop are emitted
    /// first; at the start of a logical line the indentation stack is
    /// reconciled before regular token dispatch on the TOKEN_RE groups.
    fn get_token(&mut self) -> LexResult {
        // Flush dedents queued up by the `col < last` branch below, one
        // per call.
        if self.pending > 0 {
            self.pending -= 1;
            // NOTE(review): column 0 here, while every other location in
            // this file is 1-based — confirm consumers expect this sentinel.
            let loc = Location::new(self.location.line, 0);
            return Ok((loc, Token::Dedent, loc));
        }

        // No alternative matched: report the offending character and stop.
        if TOKEN_RE.captures_read(&mut self.caps, self.input).is_none() {
            self.exhausted = true;
            let c = self.input.chars().next().unwrap();
            return Err(Error::new(ErrorKind::UnexpectedCharacter(c), self.location));
        }

        // At the start of a logical line — and not looking at whitespace
        // (group 1) or another newline (group 2) — check indentation.
        // Leading whitespace was consumed by an earlier recursive call, so
        // the current column *is* the indent level.
        if self.newline && !(self.matches(1) || self.matches(2)) {
            let col = self.location.column;
            let (last, alt_last) = *self.indents.last().unwrap();
            let emit_loc = Location::new(self.location.line, col);
            if col == last {
                // Same level, but the tab-agnostic column must also agree —
                // otherwise tabs/spaces were mixed inconsistently.
                if self.alt_col != alt_last {
                    self.alt_col = alt_last;
                    return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
                }
            } else if col > last {
                // Deeper: alt_col must have grown too, or the increase is
                // purely an artifact of tab expansion.
                if self.alt_col <= alt_last {
                    self.alt_col = alt_last;
                    return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
                }
                self.indents.push((col, self.alt_col));
                return Ok((emit_loc, Token::Indent, emit_loc));
            } else {
                // Shallower: pop levels, owing one Dedent per pop; they are
                // emitted one at a time via `pending` at the top.
                while self.indents.len() > 1 && col < self.indents.last().unwrap().0 {
                    self.pending += 1;
                    self.indents.pop();
                }
                let (last, alt_last) = *self.indents.last().unwrap();
                // A dedent must land exactly on a previously seen level.
                if col != last {
                    self.exhausted = true;
                    return Err(Error::new(ErrorKind::UnmatchedDedent, emit_loc));
                }
                if self.alt_col != alt_last {
                    self.alt_col = alt_last;
                    return Err(Error::new(ErrorKind::MixedTabsAndSpaces, emit_loc));
                }
                // Re-enter to emit the queued dedents.
                return self.get_token();
            }
        }

        let start = self.location;
        let match_len = self.caps.get(0).unwrap().1;
        // `lex` reborrows the matched text from the 'a input buffer, so it
        // remains valid across the mutable `consume` call.
        let lex = &self.input[..match_len];
        self.consume(match_len);
        let end = self.location;

        // Group 1: whitespace / line continuation / comment — skip it.
        if self.matches(1) {
            return self.get_token();
        } else if self.matches(2) {
            // Group 2: physical newline. Only the first one after actual
            // content, outside any brackets, becomes a logical Newline.
            if !self.newline && self.parens == 0 {
                self.newline = true;
                return Ok((start, Token::Newline, start));
            } else {
                return self.get_token();
            }
        }

        self.newline = false;

        if self.matches(3) {
            // Group 3: end of input.
            self.exhausted = true;
            Ok((start, Token::Eof, start))
        } else if self.matches(4) && !self.matches(5) {
            // Group 4 without the trailing j/J (group 5): float literal.
            Ok((start, Token::Float(Self::parse_float(lex).unwrap()), end))
        } else if self.matches(5) || self.matches(6) {
            // Imaginary literal: strip the trailing j/J before parsing.
            let lex = &lex[..lex.len() - 1];
            Ok((
                start,
                Token::Imaginary(Self::parse_float(lex).unwrap()),
                end,
            ))
        } else if self.matches(7) {
            // Group 7: integer literal in any radix.
            Ok((start, Token::Integer(Self::parse_int(lex).unwrap()), end))
        } else if self.matches(8) {
            // Group 8 is the (possibly empty) string prefix; it participates
            // for every string literal alternative.
            if self.matches(13) {
                // Group 13: an opening quote with no matching closer.
                self.exhausted = true;
                return Err(Error::new(ErrorKind::UnterminatedString, start));
            }
            let prefix = self
                .caps
                .get(8)
                .map(|(start, end)| lex[start..end].to_ascii_lowercase());
            let raw = prefix.as_ref().map_or(false, |x| x.contains('r'));
            let formatted = prefix.as_ref().map_or(false, |x| x.contains('f'));
            let bytes = prefix.as_ref().map_or(false, |x| x.contains('b'));
            // Exactly one of groups 9-12 holds the body, depending on which
            // quote style matched.
            let value = self
                .caps
                .get(9)
                .or_else(|| self.caps.get(10))
                .or_else(|| self.caps.get(11))
                .or_else(|| self.caps.get(12))
                .map(|(start, end)| &lex[start..end])
                .unwrap();
            if bytes {
                // Raw bytes keep backslashes verbatim; otherwise decode
                // escape sequences.
                let value = if !raw {
                    Self::parse_bytes(value).map_err(|e| {
                        Error::new(ErrorKind::NonAsciiBytes { offset: e.offset() }, start)
                    })?
                } else {
                    value.as_bytes().to_vec()
                };
                Ok((start, Token::Bytes(value), end))
            } else {
                let value = if !raw {
                    Self::parse_str(value).map_err(|e| {
                        Error::new(ErrorKind::UnicodeDecode { offset: e.offset() }, start)
                    })?
                } else {
                    value.to_string()
                };
                if formatted {
                    Ok((start, Token::FormattedString(value), end))
                } else {
                    Ok((start, Token::String(value), end))
                }
            }
        } else if self.matches(14) {
            // Group 14: operator/delimiter. Track bracket nesting so that
            // newlines inside brackets can be ignored above.
            let tok = token::SYMBOLS.get(lex).unwrap();
            match tok {
                Token::ParenOpen | Token::BracketOpen | Token::BraceOpen => self.parens += 1,
                Token::ParenClose | Token::BracketClose | Token::BraceClose => self.parens -= 1,
                _ => (),
            }
            Ok((start, tok.clone(), end))
        } else if self.matches(15) {
            // Group 15: keyword.
            let tok = token::KEYWORDS.get(lex).unwrap();
            Ok((start, tok.clone(), end))
        } else if self.matches(16) {
            // Group 16: identifier.
            Ok((start, Token::Name(lex.into()), end))
        } else {
            unreachable!()
        }
    }
339
340 fn parse_bytes(mut s: &str) -> Result<Vec<u8>, ParseStringError> {
341 if let Some(offset) = s.bytes().position(|c| !c.is_ascii()) {
342 return Err(ParseStringError::new(offset));
343 }
344
345 let mut result = vec![];
346 let mut caps = BYTES_RE.capture_locations();
347
348 while let Some(cap) = BYTES_RE.captures_read(&mut caps, s) {
349 result.extend(s[..cap.start()].bytes());
350 s = &s[cap.end()..];
351 let escape = &cap.as_str()[1..];
352 let value = if caps.get(1).is_some() {
353 Self::ascii_escape(escape.as_bytes()[0])
354 } else if caps.get(2).is_some() {
355 u8::from_str_radix(escape, 8).unwrap()
356 } else if caps.get(3).is_some() {
357 u8::from_str_radix(escape, 16).unwrap()
358 } else {
359 continue;
361 };
362 result.push(value);
363 }
364 result.extend(s.bytes());
365
366 Ok(result)
367 }
368
369 fn parse_str(mut s: &str) -> Result<String, ParseStringError> {
370 let mut result = String::new();
371 let mut caps = STR_RE.capture_locations();
372 let mut offset = 0;
373
374 while let Some(cap) = STR_RE.captures_read(&mut caps, s) {
375 result.push_str(&s[..cap.start()]);
376 s = &s[cap.end()..];
377 let escape = &cap.as_str()[1..];
378 let value = if caps.get(1).is_some() {
379 Self::ascii_escape(escape.as_bytes()[0]) as char
380 } else if caps.get(2).is_some() {
381 u8::from_str_radix(escape, 8).unwrap() as char
382 } else if caps.get(3).is_some() {
383 u8::from_str_radix(escape, 16).unwrap() as char
384 } else if caps.get(4).is_some() {
385 unicode_names2::character(&escape[2..escape.len() - 1])
386 .ok_or_else(|| ParseStringError::new(offset))?
387 } else if caps.get(5).is_some() || caps.get(6).is_some() {
388 u32::from_str_radix(&escape[1..], 16)
389 .unwrap()
390 .try_into()
391 .map_err(|_| ParseStringError::new(offset))?
392 } else {
393 offset += 2;
395 continue;
396 };
397 result.push(value);
398 offset += cap.end();
399 }
400 result.push_str(s);
401
402 Ok(result)
403 }
404
405 fn ascii_escape(c: u8) -> u8 {
406 match c {
407 b'n' => b'\n',
408 b'r' => b'\r',
409 b't' => b'\t',
410 b'a' => b'\x07',
411 b'b' => b'\x08',
412 b'f' => b'\x0c',
413 b'v' => b'\x0b',
414 c => c,
415 }
416 }
417
418 fn parse_int(s: &str) -> Option<BigUint> {
419 let mut b = s.as_bytes();
420 let radix = match b.get(1) {
421 Some(b'b') | Some(b'B') => 2,
422 Some(b'o') | Some(b'O') => 8,
423 Some(b'x') | Some(b'X') => 16,
424 _ => 10,
425 };
426 if radix != 10 {
427 b = &b[2..];
428 }
429 if b[0] == b'_' {
430 b = &b[1..];
432 }
433 BigUint::parse_bytes(b, radix)
434 }
435
436 fn parse_float(s: &str) -> Option<f64> {
437 if !s.contains('_') {
438 s.parse().ok()
439 } else {
440 s.replace('_', "").parse().ok()
441 }
442 }
443}
444
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::Token::*;

    // Run the lexer to completion, keeping only the token (or the error
    // kind) from each result so expectations stay compact.
    fn lex(i: &str) -> Vec<Result<Token, ErrorKind>> {
        Lexer::new(i)
            .map(|x| x.map(|x| x.1).map_err(|e| e.kind))
            .collect()
    }

    // Assert that `$l` lexes to exactly `$r` followed by Eof.
    macro_rules! assert_lex {
        ($l:expr, $r:expr) => {{
            assert_eq!(lex($l), vec![Ok($r), Ok(Eof)])
        }};
    }

    #[test]
    fn test_lex_indent() {
        assert_eq!(
            lex("a\n b\nc"),
            vec![
                Ok(Name("a".into())),
                Ok(Newline),
                Ok(Indent),
                Ok(Name("b".into())),
                Ok(Newline),
                Ok(Dedent),
                Ok(Name("c".into())),
                Ok(Eof)
            ]
        );
        // Dedenting to a column that never was an indent level is an error.
        // NOTE(review): hand-tracing " a\n b" gives matching columns on both
        // lines (Indent, Name, Newline, Name, Eof) — for UnmatchedDedent the
        // intended input was likely "  a\n b"; confirm against the original.
        assert_eq!(
            lex(" a\n b"),
            vec![
                Ok(Indent),
                Ok(Name("a".into())),
                Ok(Newline),
                Err(ErrorKind::UnmatchedDedent),
            ]
        );
        // Same expanded column reached with different tab/space mixtures is
        // reported, then lexing recovers and continues.
        assert_eq!(
            lex(" \ta\n\tb"),
            vec![
                Ok(Indent),
                Ok(Name("a".into())),
                Ok(Newline),
                Err(ErrorKind::MixedTabsAndSpaces),
                Ok(Name("b".into())),
                Ok(Eof)
            ]
        );
    }

    #[test]
    fn test_lex_newline() {
        // Consecutive physical newlines collapse into one logical Newline;
        // \n, \r\n, and bare \r all count.
        assert_eq!(
            lex("a\n\r\nb\r"),
            vec![
                Ok(Name("a".into())),
                Ok(Newline),
                Ok(Name("b".into())),
                Ok(Newline),
                Ok(Eof),
            ]
        );
    }

    #[test]
    fn test_lex_integer() {
        assert_lex!("1234567890", Integer(1234567890u64.into()));
        assert_lex!("0b10", Integer(0b10u64.into()));
        assert_lex!("0o12345670", Integer(0o12345670u64.into()));
        assert_lex!("0x123456789abcdef0", Integer(0x123456789abcdef0u64.into()));
        // Underscore separators are permitted between digits...
        assert_lex!("1_2_3_4_5", Integer(12345u64.into()));
        // ...and directly after a radix prefix.
        assert_lex!("0x_1_2_3", Integer(0x1_2_3u64.into()));
        assert_lex!("000", Integer(0u64.into()));
    }

    #[test]
    fn test_lex_float() {
        assert_lex!("3.14", Float(3.14));
        assert_lex!("10.", Float(10.));
        assert_lex!(".001", Float(0.001));
        assert_lex!("1e100", Float(1e100));
        assert_lex!("3.14e-10", Float(3.14e-10));
        assert_lex!("0e0", Float(0e0));
        assert_lex!("3.14_15_93", Float(3.14_15_93));
        assert_lex!("0.1e+10", Float(0.1e+10));
        assert_lex!("1.e1", Float(1.0e1));
    }

    #[test]
    fn lex_string() {
        assert_lex!("'foo'", String("foo".into()));
        assert_lex!("'foo\\tbar'", String("foo\tbar".into()));
        // An escape that decodes to nothing known stays literal.
        assert_lex!("'\\x'", String("\\x".into()));
        // Named escapes resolve in str literals but not in bytes literals.
        assert_lex!("'\\N{SNOWMAN}'", String("☃".into()));
        assert_lex!("b'\\N{SNOWMAN}'", Bytes(b"\\N{SNOWMAN}".to_vec()));
        assert_lex!("'\\\\\\'\"'", String("\\'\"".into()));
        // Escaped newline is a line continuation inside strings.
        assert_lex!("'a \\\nb'", String("a b".into()));
        assert_lex!("'''a \nb\\\nc'''", String("a \nbc".into()));
        // Raw strings keep backslashes verbatim.
        assert_lex!("r'\\t\\\n\\''", String("\\t\\\n\\'".into()))
    }
}
549}