1use crate::token::Span;
28use crate::Error;
29use std::borrow::Cow;
30use std::char;
31use std::fmt;
32use std::slice;
33use std::str;
34use std::str::Utf8Error;
35
/// A lexer over a borrowed input string.
///
/// The lexer stores no cursor of its own: the byte position to lex from is
/// supplied by the caller on every call (see [`Lexer::parse`]).
#[derive(Clone)]
pub struct Lexer<'a> {
    /// The complete source text being tokenized.
    input: &'a str,
    /// When `true`, the confusing-unicode checks applied to comments and
    /// strings are skipped.
    allow_confusing_unicode: bool,
}
46
/// A single lexed token: its classification plus its location in the input.
///
/// A token does not borrow the source text. Pass the same input string back
/// to the accessor methods (for example [`Token::src`]) to recover its text.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct Token {
    /// The classification of this token.
    pub kind: TokenKind,
    /// Byte offset into the input at which this token starts.
    pub offset: usize,
    /// Length of this token, in bytes.
    pub len: u32,
}
63
/// Guards against `Token` growing beyond the size of two `u64`s.
#[test]
fn token_is_not_too_big() {
    let limit = 2 * std::mem::size_of::<u64>();
    let actual = std::mem::size_of::<Token>();
    assert!(actual <= limit);
}
68
/// Classification of a [`Token`] as produced by the lexer.
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum TokenKind {
    /// A `;;` comment, running up to (but not including) the next `\n` or
    /// `\r`, or to the end of input.
    LineComment,

    /// A `(; ... ;)` comment; nested `(;` / `;)` pairs are balanced.
    BlockComment,

    /// A run of space, `\n`, `\r` and `\t` bytes.
    Whitespace,

    /// A lone `(` (one not followed by `;`).
    LParen,
    /// A `)`.
    RParen,

    /// A single quoted string with no adjacent idchars.
    String,

    /// An identifier: either idchars starting with `$`, or a single `$`
    /// immediately followed by one quoted string.
    Id,

    /// Idchars that start with a lowercase ASCII letter and do not classify
    /// as a number.
    Keyword,

    /// An annotation: either idchars starting with `@`, or a single `@`
    /// immediately followed by one quoted string.
    Annotation,

    /// Any other sequence of idchars and strings, as well as the single
    /// bytes `,` `[` `]` `{` `}` and a `;` that does not start a comment.
    Reserved,

    /// An integer literal; the payload records how it was written.
    Integer(IntegerKind),

    /// A floating-point literal; the payload records how it was written.
    Float(FloatKind),
}
120
/// Lexical details of an integer token, recorded while classifying it so that
/// [`Token::integer`] can rebuild the digits without re-lexing.
#[derive(Copy, Clone, Debug, PartialEq)]
pub struct IntegerKind {
    /// The explicit leading `+` or `-`, if one was written.
    sign: Option<SignToken>,
    /// Whether the digits contain `_` separators.
    has_underscores: bool,
    /// Whether the literal was written with a `0x` prefix.
    hex: bool,
}
128
/// Lexical details of a float token, recorded while classifying it so that
/// [`Token::float`] can split the text into its parts without re-lexing.
#[allow(missing_docs)]
#[derive(Copy, Clone, Debug, PartialEq)]
pub enum FloatKind {
    /// `inf`, optionally signed; `negative` is set for a leading `-`.
    #[doc(hidden)]
    Inf { negative: bool },
    /// `nan`, optionally signed; `negative` is set for a leading `-`.
    #[doc(hidden)]
    Nan { negative: bool },
    /// `nan:0x<hex digits>`, optionally signed.
    #[doc(hidden)]
    NanVal {
        negative: bool,
        // Whether the hex payload contains `_` separators.
        has_underscores: bool,
    },
    /// A literal made of digits with an optional fraction and/or exponent.
    #[doc(hidden)]
    Normal { has_underscores: bool, hex: bool },
}
145
/// Result of `Lexer::parse_reserved`: a coarse classification of a maximal run
/// of idchars and quoted strings.
enum ReservedKind {
    /// Exactly one quoted string and no idchars.
    String,
    /// Only idchars, no strings.
    Idchars,
    /// A single `$` followed by exactly one quoted string.
    IdString,
    /// A single `@` followed by exactly one quoted string.
    AnnotationString,
    /// Any other mix of idchars and strings.
    Reserved,
}
158
/// Errors that can be generated while lexing.
///
/// The `Display` implementation provides the human-readable message for each
/// variant.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum LexError {
    /// A `(;` block comment was opened but never closed.
    DanglingBlockComment,

    /// A character that cannot start any token was found.
    Unexpected(char),

    /// A character that is not allowed inside a string literal was found
    /// (code points below 0x20, or 0x7f).
    InvalidStringElement(char),

    /// An unrecognized character followed a `\` inside a string literal.
    InvalidStringEscape(char),

    /// A hexadecimal digit was required but this character was found.
    InvalidHexDigit(char),

    /// A decimal digit was required but this character was found.
    InvalidDigit(char),

    /// A specific character was required but a different one was found.
    Expected {
        /// The character that was required.
        wanted: char,
        /// The character that was actually present.
        found: char,
    },

    /// The input ended where more characters were required.
    UnexpectedEof,

    /// A numeric value overflowed while being accumulated.
    NumberTooBig,

    /// A `\u{...}` escape named a value that is not a Unicode scalar value.
    InvalidUnicodeValue(u32),

    /// An underscore in a number was not followed by a digit.
    LoneUnderscore,

    /// A character flagged by the confusing-unicode check was found in a
    /// comment or string.
    ConfusingUnicode(char),

    /// The bytes of a string-based identifier or annotation were not valid
    /// UTF-8.
    InvalidUtf8Id(Utf8Error),

    /// An identifier had no name after its `$`.
    EmptyId,

    /// An annotation had no name after its `@`.
    EmptyAnnotation,
}
228
/// An explicit sign written in front of a numeric literal.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum SignToken {
    /// A leading `+`.
    Plus,
    /// A leading `-`.
    Minus,
}
237
/// A parsed integer literal, produced by [`Token::integer`].
///
/// The value is kept as text; see [`Integer::val`] for the digits and radix.
#[derive(Debug, PartialEq)]
pub struct Integer<'a> {
    /// The explicit sign that was written, if any.
    sign: Option<SignToken>,
    /// The literal's text with a leading `+`, any `_` separators and any `0x`
    /// prefix removed; a leading `-` is kept.
    val: Cow<'a, str>,
    /// Whether the digits are hexadecimal.
    hex: bool,
}
246
/// A parsed floating-point literal, produced by [`Token::float`].
///
/// All numeric parts are kept as text rather than converted to a number.
#[derive(Debug, PartialEq, Eq)]
pub enum Float<'a> {
    /// A NaN literal.
    Nan {
        /// The hex payload written after `nan:0x`, without underscores, or
        /// `None` for a plain `nan`.
        val: Option<Cow<'a, str>>,
        /// Whether the literal had a leading `-`.
        negative: bool,
    },
    /// An infinity literal.
    Inf {
        #[allow(missing_docs)]
        negative: bool,
    },
    /// A literal written with digits.
    Val {
        /// Whether the digits are hexadecimal.
        hex: bool,
        /// The part before any `.` or exponent marker, with a leading `+`,
        /// underscores and `0x` removed; a leading `-` is kept.
        integral: Cow<'a, str>,
        /// The digits after the `.`, if any were written.
        fractional: Option<Cow<'a, str>>,
        /// The exponent after `e`/`E` (decimal) or `p`/`P` (hex), with a
        /// leading `+` removed, if an exponent was written.
        exponent: Option<Cow<'a, str>>,
    },
}
276
/// Expands to a byte pattern matching every "idchar": ASCII letters and
/// digits plus the punctuation ``! # $ % & ' * + - . / : < = > ? @ \ ^ _ ` | ~``.
///
/// The alternatives are listed in ASCII order and adjacent bytes are folded
/// into ranges.
macro_rules! idchars {
    () => {
        b'!'
            // # $ % & '
            | b'#'..=b'\''
            // * +
            | b'*'..=b'+'
            // - . / 0-9 :
            | b'-'..=b':'
            // < = > ? @ A-Z
            | b'<'..=b'Z'
            | b'\\'
            // ^ _ ` a-z
            | b'^'..=b'z'
            | b'|'
            | b'~'
    }
}
308
309impl<'a> Lexer<'a> {
310 pub fn new(input: &str) -> Lexer<'_> {
312 Lexer {
313 input,
314 allow_confusing_unicode: false,
315 }
316 }
317
/// Returns the original source text this lexer was created with.
pub fn input(&self) -> &'a str {
    self.input
}
322
/// Sets whether characters flagged as confusing unicode are permitted in
/// comments and strings.
///
/// The default set by `Lexer::new` is `false`, in which case such characters
/// produce a `LexError::ConfusingUnicode` error. Returns `self` so calls can
/// be chained.
pub fn allow_confusing_unicode(&mut self, allow: bool) -> &mut Self {
    self.allow_confusing_unicode = allow;
    self
}
338
339 pub fn parse(&self, pos: &mut usize) -> Result<Option<Token>, Error> {
350 let offset = *pos;
351 Ok(match self.parse_kind(pos)? {
352 Some(kind) => Some(Token {
353 kind,
354 offset,
355 len: (*pos - offset).try_into().unwrap(),
356 }),
357 None => None,
358 })
359 }
360
/// Classifies the token starting at `*pos` and advances `*pos` past it.
///
/// Dispatches on the first byte of the remaining input. Returns `Ok(None)`
/// when no input remains.
///
/// # Errors
///
/// * `LexError::DanglingBlockComment` for an unterminated `(;`.
/// * `LexError::Unexpected` for a byte that cannot start a token.
/// * Whatever `parse_reserved` or `check_confusing_comment` report.
fn parse_kind(&self, pos: &mut usize) -> Result<Option<TokenKind>, Error> {
    let start = *pos;
    let remaining = &self.input.as_bytes()[start..];
    let byte = match remaining.first() {
        Some(b) => b,
        None => return Ok(None),
    };

    match byte {
        // Either the start of a `(; ... ;)` block comment or a plain `(`.
        b'(' => match remaining.get(1) {
            Some(b';') => {
                // Nesting depth; starts at 1 for the `(;` just seen.
                let mut level = 1;
                // Scan bytes rather than chars: only ASCII `(`, `;` and `)`
                // matter for finding the end of the comment.
                let mut iter = remaining[2..].iter();
                while let Some(ch) = iter.next() {
                    match ch {
                        b'(' => {
                            // `(;` opens a nested comment.
                            if let Some(b';') = iter.as_slice().first() {
                                level += 1;
                                iter.next();
                            }
                        }
                        b';' => {
                            // `;)` closes the innermost comment.
                            if let Some(b')') = iter.as_slice().first() {
                                level -= 1;
                                iter.next();
                                if level == 0 {
                                    // Bytes consumed, including both delimiters.
                                    let len = remaining.len() - iter.as_slice().len();
                                    let comment = &self.input[start..][..len];
                                    *pos += len;
                                    self.check_confusing_comment(*pos, comment)?;
                                    return Ok(Some(TokenKind::BlockComment));
                                }
                            }
                        }
                        _ => {}
                    }
                }
                // Ran out of input before the comment was closed.
                Err(self.error(start, LexError::DanglingBlockComment))
            }
            _ => {
                *pos += 1;

                Ok(Some(TokenKind::LParen))
            }
        },

        b')' => {
            *pos += 1;
            Ok(Some(TokenKind::RParen))
        }

        b' ' | b'\n' | b'\r' | b'\t' => {
            // Consume the whole run of whitespace as one token.
            self.skip_ws(pos);
            Ok(Some(TokenKind::Whitespace))
        }

        c @ (idchars!() | b'"') => {
            let (kind, src) = self.parse_reserved(pos)?;
            match kind {
                ReservedKind::String => return Ok(Some(TokenKind::String)),

                ReservedKind::Idchars => {
                    // Numbers take priority; otherwise the first byte picks
                    // between id, annotation and keyword.
                    if let Some(ret) = self.classify_number(src) {
                        return Ok(Some(ret));
                    } else if *c == b'$' {
                        return Ok(Some(TokenKind::Id));
                    } else if *c == b'@' {
                        return Ok(Some(TokenKind::Annotation));
                    } else if b'a' <= *c && *c <= b'z' {
                        return Ok(Some(TokenKind::Keyword));
                    }
                }

                ReservedKind::IdString => return Ok(Some(TokenKind::Id)),
                ReservedKind::AnnotationString => return Ok(Some(TokenKind::Annotation)),

                ReservedKind::Reserved => {}
            }

            // Everything that did not return above is a reserved token.
            Ok(Some(TokenKind::Reserved))
        }

        // Either a `;;` line comment or a lone `;`.
        b';' => match remaining.get(1) {
            Some(b';') => {
                let remaining = &self.input[*pos..];
                // The comment stops before the first `\n` or `\r`, or at the
                // end of input if there is neither.
                let byte_pos = memchr::memchr2(b'\n', b'\r', remaining.as_bytes())
                    .unwrap_or(remaining.len());
                *pos += byte_pos;
                let comment = &remaining[..byte_pos];
                self.check_confusing_comment(*pos, comment)?;
                Ok(Some(TokenKind::LineComment))
            }
            _ => {
                *pos += 1;
                Ok(Some(TokenKind::Reserved))
            }
        },

        // Single-byte tokens that are always reserved.
        b',' | b'[' | b']' | b'{' | b'}' => {
            *pos += 1;
            Ok(Some(TokenKind::Reserved))
        }

        _ => {
            // Decode the full character so the error reports it, not just
            // its first byte.
            let ch = self.input[start..].chars().next().unwrap();
            Err(self.error(*pos, LexError::Unexpected(ch)))
        }
    }
}
509
510 fn skip_ws(&self, pos: &mut usize) {
511 #[rustfmt::skip]
528 const WS: [u8; 256] = [
529 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
531 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
532 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
534 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
536 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
537 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
538 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
539 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
540 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
541 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
542 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
544 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
545 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
546 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
547 ];
548 let remaining = &self.input[*pos..];
549 let non_ws_pos = remaining
550 .as_bytes()
551 .iter()
552 .position(|b| WS[*b as usize] != 1)
553 .unwrap_or(remaining.len());
554 *pos += non_ws_pos;
555 }
556
/// Consumes a maximal run of idchar bytes and quoted strings starting at
/// `*pos`, advancing `*pos` past it.
///
/// Returns a coarse classification of the run together with its source text.
/// The caller must ensure the byte at `*pos` is an idchar or `"`; otherwise
/// nothing is consumed and the `(0, 0)` arm below panics.
///
/// # Errors
///
/// Returns the error produced by `parse_str` for a malformed string, located
/// at the end of input for `UnexpectedEof` and otherwise at the last
/// character the string parser consumed.
fn parse_reserved(&self, pos: &mut usize) -> Result<(ReservedKind, &'a str), Error> {
    // Number of idchar *bytes* seen (not runs of them).
    let mut idchars = 0u32;
    // Number of quoted strings seen.
    let mut strings = 0u32;
    let start = *pos;
    while let Some(byte) = self.input.as_bytes().get(*pos) {
        match byte {
            idchars!() => {
                idchars += 1;
                *pos += 1;
            }

            b'"' => {
                strings += 1;
                // Step over the opening quote, then let `parse_str` consume
                // through the closing quote.
                *pos += 1;
                let mut it = self.input[*pos..].chars();
                let result = Lexer::parse_str(&mut it, self.allow_confusing_unicode);
                // Recompute the position from what the iterator has left.
                *pos = self.input.len() - it.as_str().len();
                match result {
                    Ok(_) => {}
                    Err(e) => {
                        let err_pos = match &e {
                            LexError::UnexpectedEof => self.input.len(),
                            // Start of the last character consumed.
                            _ => self.input[..*pos].char_indices().next_back().unwrap().0,
                        };
                        return Err(self.error(err_pos, e));
                    }
                }
            }

            _ => break,
        }
    }
    let ret = &self.input[start..*pos];
    Ok(match (idchars, strings) {
        (0, 0) => unreachable!(),
        (0, 1) => (ReservedKind::String, ret),
        (_, 0) => (ReservedKind::Idchars, ret),
        // Exactly one idchar byte plus one string: `$"..."` or `@"..."`.
        (1, 1) if ret.starts_with("$") => (ReservedKind::IdString, ret),
        (1, 1) if ret.starts_with("@") => (ReservedKind::AnnotationString, ret),
        _ => (ReservedKind::Reserved, ret),
    })
}
620
/// Decides whether `src` (a run of idchars) is a numeric literal.
///
/// Returns `Some(TokenKind::Integer(..))` or `Some(TokenKind::Float(..))`
/// when all of `src` forms a number, recording sign, radix and underscore
/// usage; returns `None` otherwise. No numeric value is computed here.
fn classify_number(&self, src: &str) -> Option<TokenKind> {
    // Peel off an optional leading sign.
    let (sign, num) = if let Some(stripped) = src.strip_prefix('+') {
        (Some(SignToken::Plus), stripped)
    } else if let Some(stripped) = src.strip_prefix('-') {
        (Some(SignToken::Minus), stripped)
    } else {
        (None, src)
    };

    let negative = sign == Some(SignToken::Minus);

    // The special float spellings: `inf`, `nan` and `nan:0x<hex>`.
    if num == "inf" {
        return Some(TokenKind::Float(FloatKind::Inf { negative }));
    } else if num == "nan" {
        return Some(TokenKind::Float(FloatKind::Nan { negative }));
    } else if let Some(stripped) = num.strip_prefix("nan:0x") {
        let mut it = stripped.as_bytes().iter();
        let has_underscores = skip_underscores(&mut it, |x| char::from(x).is_ascii_hexdigit())?;
        // Anything left after the hex payload disqualifies the token.
        if it.next().is_some() {
            return None;
        }
        return Some(TokenKind::Float(FloatKind::NanVal {
            negative,
            has_underscores,
        }));
    }

    // Pick the digit predicate according to the radix.
    let test_valid: fn(u8) -> bool;
    let (mut it, hex) = if let Some(stripped) = num.strip_prefix("0x") {
        test_valid = |x: u8| char::from(x).is_ascii_hexdigit();
        (stripped.as_bytes().iter(), true)
    } else {
        test_valid = |x: u8| char::from(x).is_ascii_digit();
        (num.as_bytes().iter(), false)
    };

    // The integral part is mandatory.
    let mut has_underscores = skip_underscores(&mut it, test_valid)?;

    match it.clone().next() {
        // More input follows: this may still be a float.
        Some(_) => {}

        // Digits only: this is an integer.
        None => {
            return Some(TokenKind::Integer(IntegerKind {
                has_underscores,
                sign,
                hex,
            }))
        }
    }

    // Optional `.` followed by an optional run of fractional digits.
    if it.clone().next() == Some(&b'.') {
        it.next();
        match it.clone().next() {
            Some(c) if test_valid(*c) => {
                if skip_underscores(&mut it, test_valid)? {
                    has_underscores = true;
                }
            }
            Some(_) | None => {}
        }
    };

    // Optional exponent: `p`/`P` for hex, `e`/`E` for decimal, then an
    // optional sign and mandatory decimal digits.
    match (hex, it.next()) {
        (true, Some(b'p')) | (true, Some(b'P')) | (false, Some(b'e')) | (false, Some(b'E')) => {
            match it.clone().next() {
                Some(b'-') => {
                    it.next();
                }
                Some(b'+') => {
                    it.next();
                }
                _ => {}
            }
            if skip_underscores(&mut it, |x| char::from(x).is_ascii_digit())? {
                has_underscores = true;
            }
        }
        (_, None) => {}
        _ => return None,
    }

    // Trailing bytes mean this is not a number after all.
    if it.next().is_some() {
        return None;
    }

    return Some(TokenKind::Float(FloatKind::Normal {
        has_underscores,
        hex,
    }));

    /// Consumes one digit accepted by `good`, then any further digits with
    /// single `_` separators between them.
    ///
    /// Returns `Some(true)` if an underscore was consumed and `Some(false)`
    /// if not. Returns `None` if there is no leading digit or the run ends
    /// in an underscore (which includes two underscores in a row). Stops
    /// without consuming at the first byte that is neither.
    fn skip_underscores<'a>(
        it: &mut slice::Iter<'_, u8>,
        good: fn(u8) -> bool,
    ) -> Option<bool> {
        let mut last_underscore = false;
        let mut has_underscores = false;
        let first = *it.next()?;
        if !good(first) {
            return None;
        }
        while let Some(c) = it.clone().next() {
            if *c == b'_' && !last_underscore {
                has_underscores = true;
                it.next();
                last_underscore = true;
                continue;
            }
            if !good(*c) {
                break;
            }
            last_underscore = false;
            it.next();
        }
        if last_underscore {
            return None;
        }
        Some(has_underscores)
    }
}
751
752 fn check_confusing_comment(&self, end: usize, comment: &str) -> Result<(), Error> {
756 if self.allow_confusing_unicode {
757 return Ok(());
758 }
759
760 let bytes = comment.as_bytes();
770 for pos in memchr::Memchr::new(0xe2, bytes) {
771 if let Some(c) = comment[pos..].chars().next() {
772 if is_confusing_unicode(c) {
773 let pos = end - comment.len() + pos;
778 return Err(self.error(pos, LexError::ConfusingUnicode(c)));
779 }
780 }
781 }
782
783 Ok(())
784 }
785
/// Parses the body of a string literal. `it` must be positioned just after
/// the opening `"`; on success it is left just after the closing `"`.
///
/// Returns the decoded bytes: borrowed from the input when the string has no
/// escapes, owned otherwise.
///
/// # Errors
///
/// * `UnexpectedEof` if the input ends before the closing quote.
/// * `InvalidStringElement` for code points below 0x20 or equal to 0x7f.
/// * `ConfusingUnicode` for flagged characters, unless
///   `allow_confusing_unicode` is `true`.
/// * `InvalidStringEscape` and the errors of `must_eat_char`, `hexnum` and
///   `hexdigit` for malformed escapes; `InvalidUnicodeValue` for a `\u{..}`
///   that is not a Unicode scalar value.
fn parse_str(
    it: &mut str::Chars<'a>,
    allow_confusing_unicode: bool,
) -> Result<Cow<'a, [u8]>, LexError> {
    enum State {
        // No escape seen yet: the result so far is a prefix of the input.
        Start,
        // An escape has been seen: decoded bytes accumulate here.
        String(Vec<u8>),
    }
    let orig = it.as_str();
    let mut state = State::Start;
    loop {
        match it.next().ok_or(LexError::UnexpectedEof)? {
            '"' => break,
            '\\' => {
                match state {
                    State::String(_) => {}
                    State::Start => {
                        // First escape: copy everything before the backslash
                        // (the `- 1`) into an owned buffer.
                        let pos = orig.len() - it.as_str().len() - 1;
                        state = State::String(orig[..pos].as_bytes().to_vec());
                    }
                }
                let buf = match &mut state {
                    State::String(b) => b,
                    State::Start => unreachable!(),
                };
                match it.next().ok_or(LexError::UnexpectedEof)? {
                    '"' => buf.push(b'"'),
                    '\'' => buf.push(b'\''),
                    't' => buf.push(b'\t'),
                    'n' => buf.push(b'\n'),
                    'r' => buf.push(b'\r'),
                    '\\' => buf.push(b'\\'),
                    // `\u{hex}`: a Unicode scalar value, appended as UTF-8.
                    'u' => {
                        Lexer::must_eat_char(it, '{')?;
                        let n = Lexer::hexnum(it)?;
                        let c = char::from_u32(n).ok_or(LexError::InvalidUnicodeValue(n))?;
                        buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                        Lexer::must_eat_char(it, '}')?;
                    }
                    // `\hh`: a raw byte given by two hex digits.
                    c1 if c1.is_ascii_hexdigit() => {
                        let c2 = Lexer::hexdigit(it)?;
                        buf.push(to_hex(c1) * 16 + c2);
                    }
                    c => return Err(LexError::InvalidStringEscape(c)),
                }
            }
            c if (c as u32) < 0x20 || c as u32 == 0x7f => {
                return Err(LexError::InvalidStringElement(c))
            }
            c if !allow_confusing_unicode && is_confusing_unicode(c) => {
                return Err(LexError::ConfusingUnicode(c))
            }
            // An ordinary character: nothing to do while still borrowing,
            // otherwise append its UTF-8 encoding.
            c => match &mut state {
                State::Start => {}
                State::String(v) => {
                    v.extend(c.encode_utf8(&mut [0; 4]).as_bytes());
                }
            },
        }
    }
    match state {
        // Borrow everything consumed except the closing quote (the `- 1`).
        State::Start => Ok(orig[..orig.len() - it.as_str().len() - 1].as_bytes().into()),
        State::String(s) => Ok(s.into()),
    }
}
851
852 fn parse_name(it: &mut str::Chars<'a>) -> Result<Cow<'a, str>, LexError> {
866 if it.clone().next() == Some('"') {
867 it.next();
868 match Lexer::parse_str(it, true)? {
869 Cow::Borrowed(bytes) => match std::str::from_utf8(bytes) {
870 Ok(s) => Ok(Cow::Borrowed(s)),
871 Err(e) => Err(LexError::InvalidUtf8Id(e)),
872 },
873 Cow::Owned(bytes) => match String::from_utf8(bytes) {
874 Ok(s) => Ok(Cow::Owned(s)),
875 Err(e) => Err(LexError::InvalidUtf8Id(e.utf8_error())),
876 },
877 }
878 } else {
879 Ok(Cow::Borrowed(it.as_str()))
880 }
881 }
882
883 fn hexnum(it: &mut str::Chars<'_>) -> Result<u32, LexError> {
884 let n = Lexer::hexdigit(it)?;
885 let mut last_underscore = false;
886 let mut n = n as u32;
887 while let Some(c) = it.clone().next() {
888 if c == '_' {
889 it.next();
890 last_underscore = true;
891 continue;
892 }
893 if !c.is_ascii_hexdigit() {
894 break;
895 }
896 last_underscore = false;
897 it.next();
898 n = n
899 .checked_mul(16)
900 .and_then(|n| n.checked_add(to_hex(c) as u32))
901 .ok_or(LexError::NumberTooBig)?;
902 }
903 if last_underscore {
904 return Err(LexError::LoneUnderscore);
905 }
906 Ok(n)
907 }
908
909 fn hexdigit(it: &mut str::Chars<'_>) -> Result<u8, LexError> {
913 let ch = Lexer::must_char(it)?;
914 if ch.is_ascii_hexdigit() {
915 Ok(to_hex(ch))
916 } else {
917 Err(LexError::InvalidHexDigit(ch))
918 }
919 }
920
921 fn must_char(it: &mut str::Chars<'_>) -> Result<char, LexError> {
924 it.next().ok_or(LexError::UnexpectedEof)
925 }
926
927 fn must_eat_char(it: &mut str::Chars<'_>, wanted: char) -> Result<(), LexError> {
929 let found = Lexer::must_char(it)?;
930 if wanted == found {
931 Ok(())
932 } else {
933 Err(LexError::Expected { wanted, found })
934 }
935 }
936
937 fn error(&self, pos: usize, kind: LexError) -> Error {
939 Error::lex(Span { offset: pos }, self.input, kind)
940 }
941
/// Returns an iterator over the tokens starting at byte offset `pos`.
///
/// Each item is the result of one `parse` call; the iterator ends when
/// `parse` reports the end of input. An error is yielded as an item and does
/// not by itself stop the iterator.
pub fn iter(&self, mut pos: usize) -> impl Iterator<Item = Result<Token, Error>> + '_ {
    // `transpose` turns `Ok(None)` (end of input) into `None`, ending the
    // iteration, and `Err(e)` into `Some(Err(e))`.
    std::iter::from_fn(move || self.parse(&mut pos).transpose())
}
947
948 pub fn annotation(&self, mut pos: usize) -> Result<Option<Token>, Error> {
953 let bytes = self.input.as_bytes();
954 if bytes.get(pos) != Some(&b'@') {
957 return Ok(None);
958 }
959 match self.parse(&mut pos)? {
960 Some(token) => match token.kind {
961 TokenKind::Annotation => Ok(Some(token)),
962 _ => Ok(None),
963 },
964 None => Ok(None),
965 }
966 }
967}
968
969impl Token {
970 pub fn src<'a>(&self, s: &'a str) -> &'a str {
972 &s[self.offset..][..self.len.try_into().unwrap()]
973 }
974
975 pub fn id<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
989 let mut ch = self.src(s).chars();
990 let dollar = ch.next();
991 debug_assert_eq!(dollar, Some('$'));
992 let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
993 if id.is_empty() {
994 return Err(self.error(s, LexError::EmptyId));
995 }
996 Ok(id)
997 }
998
999 pub fn annotation<'a>(&self, s: &'a str) -> Result<Cow<'a, str>, Error> {
1013 let mut ch = self.src(s).chars();
1014 let at = ch.next();
1015 debug_assert_eq!(at, Some('@'));
1016 let id = Lexer::parse_name(&mut ch).map_err(|e| self.error(s, e))?;
1017 if id.is_empty() {
1018 return Err(self.error(s, LexError::EmptyAnnotation));
1019 }
1020 Ok(id)
1021 }
1022
/// Returns the text of this keyword token, which is simply its source text.
///
/// `s` must be the input the token was lexed from.
pub fn keyword<'a>(&self, s: &'a str) -> &'a str {
    self.src(s)
}
1029
/// Returns the text of this reserved token, which is simply its source text.
///
/// `s` must be the input the token was lexed from.
pub fn reserved<'a>(&self, s: &'a str) -> &'a str {
    self.src(s)
}
1036
/// Returns the decoded bytes of this string token.
///
/// `s` must be the input the token was lexed from.
///
/// # Panics
///
/// Panics if the token's text is empty or its body does not parse as a
/// string literal.
pub fn string<'a>(&self, s: &'a str) -> Cow<'a, [u8]> {
    let mut ch = self.src(s).chars();
    // Skip the first character (the opening quote for a string token).
    ch.next().unwrap();
    // Confusing unicode is allowed here, so that check cannot fail.
    Lexer::parse_str(&mut ch, true).unwrap()
}
1048
/// Splits this float token's text into its parts, guided by the `kind`
/// recorded when the token was classified.
///
/// `s` must be the input the token was lexed from and `kind` the payload of
/// this token's `TokenKind::Float`. The parts are returned as text with `_`
/// separators, a leading `+` and any `0x` prefix removed; nothing is
/// converted to a number.
///
/// # Panics
///
/// May panic if `kind` does not describe this token's text (for example a
/// `NanVal` kind on text lacking the `nan:0x` prefix).
pub fn float<'a>(&self, s: &'a str, kind: FloatKind) -> Float<'a> {
    match kind {
        FloatKind::Inf { negative } => Float::Inf { negative },
        FloatKind::Nan { negative } => Float::Nan {
            val: None,
            negative,
        },
        FloatKind::NanVal {
            negative,
            has_underscores,
        } => {
            let src = self.src(s);
            // Drop a leading sign character, if any.
            let src = if src.starts_with("n") { src } else { &src[1..] };
            let mut val = Cow::Borrowed(src.strip_prefix("nan:0x").unwrap());
            if has_underscores {
                *val.to_mut() = val.replace("_", "");
            }
            Float::Nan {
                val: Some(val),
                negative,
            }
        }
        FloatKind::Normal {
            has_underscores,
            hex,
        } => {
            let src = self.src(s);
            // Split into integral / fractional / exponent text. The exponent
            // marker is `p`/`P` for hex and `e`/`E` for decimal.
            let (integral, fractional, exponent) = match src.find('.') {
                Some(i) => {
                    let integral = &src[..i];
                    let rest = &src[i + 1..];
                    let exponent = if hex {
                        rest.find('p').or_else(|| rest.find('P'))
                    } else {
                        rest.find('e').or_else(|| rest.find('E'))
                    };
                    match exponent {
                        Some(i) => (integral, Some(&rest[..i]), Some(&rest[i + 1..])),
                        None => (integral, Some(rest), None),
                    }
                }
                None => {
                    let exponent = if hex {
                        src.find('p').or_else(|| src.find('P'))
                    } else {
                        src.find('e').or_else(|| src.find('E'))
                    };
                    match exponent {
                        Some(i) => (&src[..i], None, Some(&src[i + 1..])),
                        None => (src, None, None),
                    }
                }
            };
            // A leading `+` is dropped; a leading `-` stays in `integral`.
            let mut integral = Cow::Borrowed(integral.strip_prefix('+').unwrap_or(integral));
            // An empty fraction (as in `1.`) is reported as absent.
            let mut fractional = fractional.and_then(|s| {
                if s.is_empty() {
                    None
                } else {
                    Some(Cow::Borrowed(s))
                }
            });
            let mut exponent =
                exponent.map(|s| Cow::Borrowed(s.strip_prefix('+').unwrap_or(s)));
            if has_underscores {
                *integral.to_mut() = integral.replace("_", "");
                if let Some(fractional) = &mut fractional {
                    *fractional.to_mut() = fractional.replace("_", "");
                }
                if let Some(exponent) = &mut exponent {
                    *exponent.to_mut() = exponent.replace("_", "");
                }
            }
            if hex {
                // Remove the radix prefix; it sits after any `-` sign.
                *integral.to_mut() = integral.replace("0x", "");
            }
            Float::Val {
                hex,
                integral,
                fractional,
                exponent,
            }
        }
    }
}
1139
1140 pub fn integer<'a>(&self, s: &'a str, kind: IntegerKind) -> Integer<'a> {
1147 let src = self.src(s);
1148 let val = match kind.sign {
1149 Some(SignToken::Plus) => src.strip_prefix('+').unwrap(),
1150 Some(SignToken::Minus) => src,
1151 None => src,
1152 };
1153 let mut val = Cow::Borrowed(val);
1154 if kind.has_underscores {
1155 *val.to_mut() = val.replace("_", "");
1156 }
1157 if kind.hex {
1158 *val.to_mut() = val.replace("0x", "");
1159 }
1160 Integer {
1161 sign: kind.sign,
1162 hex: kind.hex,
1163 val,
1164 }
1165 }
1166
1167 fn error(&self, src: &str, err: LexError) -> Error {
1168 Error::lex(
1169 Span {
1170 offset: self.offset,
1171 },
1172 src,
1173 err,
1174 )
1175 }
1176}
1177
1178impl<'a> Integer<'a> {
1179 pub fn sign(&self) -> Option<SignToken> {
1181 self.sign
1182 }
1183
1184 pub fn val(&self) -> (&str, u32) {
1187 (&self.val, if self.hex { 16 } else { 10 })
1188 }
1189}
1190
/// Converts an ASCII hexadecimal digit to its numeric value (0 to 15).
///
/// The caller is expected to pass a hex digit; other characters are not
/// rejected and give an unspecified result.
fn to_hex(c: char) -> u8 {
    let byte = c as u8;
    if ('a'..='f').contains(&c) {
        byte - b'a' + 10
    } else if ('A'..='F').contains(&c) {
        byte - b'A' + 10
    } else {
        byte - b'0'
    }
}
1198
1199impl fmt::Display for LexError {
1200 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1201 use LexError::*;
1202 match self {
1203 DanglingBlockComment => f.write_str("unterminated block comment")?,
1204 Unexpected(c) => write!(f, "unexpected character '{}'", escape_char(*c))?,
1205 InvalidStringElement(c) => {
1206 write!(f, "invalid character in string '{}'", escape_char(*c))?
1207 }
1208 InvalidStringEscape(c) => write!(f, "invalid string escape '{}'", escape_char(*c))?,
1209 InvalidHexDigit(c) => write!(f, "invalid hex digit '{}'", escape_char(*c))?,
1210 InvalidDigit(c) => write!(f, "invalid decimal digit '{}'", escape_char(*c))?,
1211 Expected { wanted, found } => write!(
1212 f,
1213 "expected '{}' but found '{}'",
1214 escape_char(*wanted),
1215 escape_char(*found)
1216 )?,
1217 UnexpectedEof => write!(f, "unexpected end-of-file")?,
1218 NumberTooBig => f.write_str("number is too big to parse")?,
1219 InvalidUnicodeValue(c) => write!(f, "invalid unicode scalar value 0x{:x}", c)?,
1220 LoneUnderscore => write!(f, "bare underscore in numeric literal")?,
1221 ConfusingUnicode(c) => write!(f, "likely-confusing unicode character found {:?}", c)?,
1222 InvalidUtf8Id(_) => write!(f, "malformed UTF-8 encoding of string-based id")?,
1223 EmptyId => write!(f, "empty identifier")?,
1224 EmptyAnnotation => write!(f, "empty annotation id")?,
1225 }
1226 Ok(())
1227 }
1228}
1229
/// Renders `c` for inclusion in an error message.
///
/// Tab, carriage return, newline, backslash and single quote get a backslash
/// escape; a double quote and the other printable ASCII characters
/// (0x20..=0x7e) are returned unchanged; everything else is rendered as a
/// `\u{...}` escape.
fn escape_char(c: char) -> String {
    let fixed = match c {
        '\t' => "\\t",
        '\r' => "\\r",
        '\n' => "\\n",
        '\\' => "\\\\",
        '\'' => "\\'",
        '"' => "\"",
        ' '..='~' => return c.to_string(),
        _ => return c.escape_unicode().to_string(),
    };
    fixed.to_owned()
}
1242
/// Returns whether `ch` is a character that can make source text display
/// misleadingly: the Unicode bidirectional-formatting controls.
///
/// The set is the explicit embedding/override controls U+202A..=U+202E and
/// the isolate controls U+2066..=U+2069. U+206C is additionally kept because
/// earlier versions of this list rejected it (apparently in place of U+202C).
///
/// Every code point listed here encodes in UTF-8 with a leading `0xe2` byte;
/// callers that pre-filter on that byte rely on this.
fn is_confusing_unicode(ch: char) -> bool {
    matches!(
        ch,
        '\u{202a}'
            | '\u{202b}'
            // POP DIRECTIONAL FORMATTING, previously missing from this list.
            | '\u{202c}'
            | '\u{202d}'
            | '\u{202e}'
            | '\u{2066}'
            | '\u{2067}'
            | '\u{2068}'
            | '\u{2069}'
            // Retained for backward compatibility.
            | '\u{206c}'
    )
}
1265
1266#[cfg(test)]
1267mod tests {
1268 use super::*;
1269
/// Whitespace tokens cover the whole run of whitespace and stop at the first
/// non-whitespace byte.
#[test]
fn ws_smoke() {
    fn get_whitespace(input: &str) -> &str {
        let token = get_token(input);
        if let TokenKind::Whitespace = token.kind {
            token.src(input)
        } else {
            panic!("unexpected {:?}", token.kind)
        }
    }

    // (input, expected text of the first token)
    let cases = [
        (" ", " "),
        (" ", " "),
        (" \n ", " \n "),
        (" x", " "),
        (" ;", " "),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(get_whitespace(input), expected);
    }
}
1285
/// Line comments run up to, but do not include, the first `\n` or `\r`.
#[test]
fn line_comment_smoke() {
    fn get_line_comment(input: &str) -> &str {
        let token = get_token(input);
        if let TokenKind::LineComment = token.kind {
            token.src(input)
        } else {
            panic!("unexpected {:?}", token.kind)
        }
    }

    // (input, expected text of the first token)
    let cases = [
        (";;", ";;"),
        (";; xyz", ";; xyz"),
        (";; xyz\nabc", ";; xyz"),
        (";;\nabc", ";;"),
        (";; \nabc", ";; "),
        (";; \rabc", ";; "),
        (";; \r\nabc", ";; "),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(get_line_comment(input), expected);
    }
}
1303
/// Block comments, including nested ones, are lexed as a single token.
#[test]
fn block_comment_smoke() {
    fn get_block_comment(input: &str) -> &str {
        let token = get_token(input);
        if let TokenKind::BlockComment = token.kind {
            token.src(input)
        } else {
            panic!("unexpected {:?}", token.kind)
        }
    }

    // Each input is exactly one comment, so the token text equals the input.
    for &input in ["(;;)", "(; ;)", "(; (;;) ;)"].iter() {
        assert_eq!(get_block_comment(input), input);
    }
}
1317
1318 fn get_token(input: &str) -> Token {
1319 Lexer::new(input)
1320 .parse(&mut 0)
1321 .expect("no first token")
1322 .expect("no token")
1323 }
1324
/// A `(` not followed by `;` lexes as `LParen`.
#[test]
fn lparen() {
    let kind = get_token("((").kind;
    assert_eq!(kind, TokenKind::LParen);
}
1329
/// A `)` lexes as `RParen`.
#[test]
fn rparen() {
    let kind = get_token(")(").kind;
    assert_eq!(kind, TokenKind::RParen);
}
1334
/// String tokens decode plain text, the single-character escapes, `\hh` byte
/// escapes and `\u{...}` escapes (with optional underscores).
#[test]
fn strings() {
    fn get_string(input: &str) -> Vec<u8> {
        let token = get_token(input);
        match token.kind {
            TokenKind::String => token.string(input).to_vec(),
            // The message names the kind this helper actually expects.
            other => panic!("not string {:?}", other),
        }
    }
    assert_eq!(&*get_string("\"\""), b"");
    assert_eq!(&*get_string("\"a\""), b"a");
    assert_eq!(&*get_string("\"a b c d\""), b"a b c d");
    assert_eq!(&*get_string("\"\\\"\""), b"\"");
    assert_eq!(&*get_string("\"\\'\""), b"'");
    assert_eq!(&*get_string("\"\\n\""), b"\n");
    assert_eq!(&*get_string("\"\\t\""), b"\t");
    assert_eq!(&*get_string("\"\\r\""), b"\r");
    assert_eq!(&*get_string("\"\\\\\""), b"\\");
    assert_eq!(&*get_string("\"\\01\""), &[1]);
    assert_eq!(&*get_string("\"\\u{1}\""), &[1]);
    assert_eq!(
        &*get_string("\"\\u{0f3}\""),
        '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
    );
    assert_eq!(
        &*get_string("\"\\u{0_f_3}\""),
        '\u{0f3}'.encode_utf8(&mut [0; 4]).as_bytes()
    );

    // Every two-digit hex escape decodes to the corresponding raw byte.
    for i in 0..=255i32 {
        let s = format!("\"\\{:02x}\"", i);
        assert_eq!(&*get_string(&s), &[i as u8]);
    }
}
1369
/// Identifier tokens yield their name without the `$`, for both the plain and
/// the string-based form.
#[test]
fn id() {
    fn get_id(input: &str) -> String {
        let token = get_token(input);
        if let TokenKind::Id = token.kind {
            token.id(input).unwrap().to_string()
        } else {
            panic!("not id {:?}", token.kind)
        }
    }

    // (input, expected identifier name)
    let cases = [
        ("$x", "x"),
        ("$xyz", "xyz"),
        ("$x_z", "x_z"),
        ("$0^", "0^"),
        ("$0^;;", "0^"),
        ("$0^ ;;", "0^"),
        ("$\"x\" ;;", "x"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(get_id(input), expected);
    }
}
1387
/// Annotation tokens yield their name without the `@`, for both the plain and
/// the string-based form.
#[test]
fn annotation() {
    fn get_annotation(input: &str) -> String {
        let token = get_token(input);
        if let TokenKind::Annotation = token.kind {
            token.annotation(input).unwrap().to_string()
        } else {
            panic!("not annotation {:?}", token.kind)
        }
    }

    // (input, expected annotation name)
    let cases = [
        ("@foo", "foo"),
        ("@foo ", "foo"),
        ("@f ", "f"),
        ("@\"x\" ", "x"),
        ("@0 ", "0"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(get_annotation(input), expected);
    }
}
1403
/// Idchars starting with a lowercase letter lex as a keyword that stops at
/// whitespace.
#[test]
fn keyword() {
    fn get_keyword(input: &str) -> &str {
        let token = get_token(input);
        if let TokenKind::Keyword = token.kind {
            token.keyword(input)
        } else {
            panic!("not keyword {:?}", token.kind)
        }
    }

    // (input, expected keyword text)
    let cases = [
        ("x", "x"),
        ("xyz", "xyz"),
        ("x_z", "x_z"),
        ("x_z ", "x_z"),
        ("x_z ", "x_z"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(get_keyword(input), expected);
    }
}
1419
/// Idchars that are not a number, id, annotation or keyword lex as reserved.
#[test]
fn reserved() {
    fn get_reserved(input: &str) -> &str {
        let token = get_token(input);
        if let TokenKind::Reserved = token.kind {
            token.reserved(input)
        } else {
            panic!("not reserved {:?}", token.kind)
        }
    }

    let text = get_reserved("^_x ");
    assert_eq!(text, "^_x");
}
1431
/// Integer tokens drop `+`, underscores and the `0x` prefix from their value
/// but keep a leading `-`.
#[test]
fn integer() {
    fn get_integer(input: &str) -> String {
        let token = get_token(input);
        if let TokenKind::Integer(i) = token.kind {
            token.integer(input, i).val.to_string()
        } else {
            panic!("not integer {:?}", token.kind)
        }
    }

    // (input, expected value text)
    let cases = [
        ("1", "1"),
        ("0", "0"),
        ("-1", "-1"),
        ("+1", "1"),
        ("+1_000", "1000"),
        ("+1_0_0_0", "1000"),
        ("+0x10", "10"),
        ("-0x10", "-10"),
        ("0x10", "10"),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(get_integer(input), expected);
    }
}
1451
/// Float tokens are split into sign / payload for NaN and infinity, and into
/// integral, fractional and exponent text for ordinary literals.
#[test]
fn float() {
    fn get_float(input: &str) -> Float<'_> {
        let token = get_token(input);
        if let TokenKind::Float(f) = token.kind {
            token.float(input, f)
        } else {
            panic!("not float {:?}", token.kind)
        }
    }

    // Shorthand constructors for the expected values.
    fn nan(val: Option<&'static str>, negative: bool) -> Float<'static> {
        Float::Nan {
            val: val.map(Into::into),
            negative,
        }
    }
    fn inf(negative: bool) -> Float<'static> {
        Float::Inf { negative }
    }
    fn val(
        hex: bool,
        integral: &'static str,
        fractional: Option<&'static str>,
        exponent: Option<&'static str>,
    ) -> Float<'static> {
        Float::Val {
            hex,
            integral: integral.into(),
            fractional: fractional.map(Into::into),
            exponent: exponent.map(Into::into),
        }
    }

    // NaN, with and without sign and payload.
    assert_eq!(get_float("nan"), nan(None, false));
    assert_eq!(get_float("-nan"), nan(None, true));
    assert_eq!(get_float("+nan"), nan(None, false));
    assert_eq!(get_float("+nan:0x1"), nan(Some("1"), false));
    assert_eq!(get_float("nan:0x7f_ffff"), nan(Some("7fffff"), false));

    // Infinity.
    assert_eq!(get_float("inf"), inf(false));
    assert_eq!(get_float("-inf"), inf(true));
    assert_eq!(get_float("+inf"), inf(false));

    // Decimal literals.
    assert_eq!(get_float("1.2"), val(false, "1", Some("2"), None));
    assert_eq!(get_float("1.2e3"), val(false, "1", Some("2"), Some("3")));
    assert_eq!(
        get_float("-1_2.1_1E+0_1"),
        val(false, "-12", Some("11"), Some("01")),
    );
    assert_eq!(
        get_float("+1_2.1_1E-0_1"),
        val(false, "12", Some("11"), Some("-01")),
    );

    // Hexadecimal literals.
    assert_eq!(
        get_float("0x1_2.3_4p5_6"),
        val(true, "12", Some("34"), Some("56")),
    );
    assert_eq!(
        get_float("+0x1_2.3_4P-5_6"),
        val(true, "12", Some("34"), Some("-56")),
    );

    // Missing fraction and/or missing `.`.
    assert_eq!(get_float("1."), val(false, "1", None, None));
    assert_eq!(get_float("0x1p-24"), val(true, "1", None, Some("-24")));
}
1573}