1use std::collections::BTreeSet;
2use std::fmt;
3
4use unicode_xid::UnicodeXID;
5use peresil::combinators::*;
6
7use crate::{Extent, HumanTextError};
8use crate::combinators::{not, peek};
9
/// A single lexical token produced by the tokenizer.
///
/// Every variant except `Number` wraps the `Extent` of source text the token
/// covers; `Number` wraps a [`Number`] value that carries its own extent.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Decompose)]
pub enum Token {
    // Paired delimiters
    LeftAngle(Extent),
    LeftCurly(Extent),
    LeftParen(Extent),
    LeftSquare(Extent),
    RightAngle(Extent),
    RightCurly(Extent),
    RightParen(Extent),
    RightSquare(Extent),

    // Symbols and operators
    Ampersand(Extent),
    AmpersandEquals(Extent),
    Asterisk(Extent),
    At(Extent),
    Backslash(Extent),
    Bang(Extent),
    Caret(Extent),
    CaretEquals(Extent),
    Colon(Extent),
    Comma(Extent),
    DivideEquals(Extent),
    Dollar(Extent),
    DoubleAmpersand(Extent),
    DoubleColon(Extent),
    DoubleEquals(Extent),
    DoubleLeftAngle(Extent),
    DoublePeriod(Extent),
    DoublePeriodEquals(Extent),
    DoublePipe(Extent),
    DoubleRightAngle(Extent),
    Equals(Extent),
    GreaterThanOrEquals(Extent),
    Hash(Extent),
    LessThanOrEquals(Extent),
    Minus(Extent),
    MinusEquals(Extent),
    NotEqual(Extent),
    Percent(Extent),
    PercentEquals(Extent),
    Period(Extent),
    Pipe(Extent),
    PipeEquals(Extent),
    Plus(Extent),
    PlusEquals(Extent),
    QuestionMark(Extent),
    Semicolon(Extent),
    ShiftLeftEquals(Extent),
    ShiftRightEquals(Extent),
    Slash(Extent),
    ThickArrow(Extent),
    ThinArrow(Extent),
    Tilde(Extent),
    TimesEquals(Extent),
    TriplePeriod(Extent),

    // Keywords
    As(Extent),
    Async(Extent),
    Auto(Extent),
    Box(Extent),
    Break(Extent),
    Const(Extent),
    Continue(Extent),
    Crate(Extent),
    Default(Extent),
    Dyn(Extent),
    Else(Extent),
    Enum(Extent),
    Extern(Extent),
    Fn(Extent),
    For(Extent),
    If(Extent),
    Impl(Extent),
    In(Extent),
    Let(Extent),
    Loop(Extent),
    Match(Extent),
    Mod(Extent),
    Move(Extent),
    Mut(Extent),
    Pub(Extent),
    Ref(Extent),
    Return(Extent),
    SelfIdent(Extent),
    Static(Extent),
    Struct(Extent),
    Trait(Extent),
    Type(Extent),
    Union(Extent),
    Unsafe(Extent),
    Use(Extent),
    Where(Extent),
    While(Extent),

    // Character, string, and byte literals
    Character(Extent),
    String(Extent),
    StringRaw(Extent),
    Byte(Extent),
    ByteString(Extent),
    ByteStringRaw(Extent),

    // Everything else: identifiers, numbers, trivia, lifetimes, end marker
    Ident(Extent),
    Number(Number),
    Whitespace(Extent),
    CommentLine(Extent),
    CommentBlock(Extent),
    DocCommentOuterLine(Extent),
    DocCommentInnerLine(Extent),
    DocCommentOuterBlock(Extent),
    DocCommentInnerBlock(Extent),
    Lifetime(Extent),
    EndOfFile(Extent),
}
132
133impl Token {
134 pub fn extent(&self) -> Extent {
135 use self::Token::*;
136
137 match *self {
138 Ampersand(s) |
139 AmpersandEquals(s) |
140 As(s) |
141 Asterisk(s) |
142 Async(s) |
143 At(s) |
144 Auto(s) |
145 Backslash(s) |
146 Bang(s) |
147 Box(s) |
148 Break(s) |
149 Byte(s) |
150 ByteString(s) |
151 ByteStringRaw(s) |
152 Caret(s) |
153 CaretEquals(s) |
154 Character(s) |
155 Colon(s) |
156 Comma(s) |
157 CommentLine(s) |
158 CommentBlock(s) |
159 Const(s) |
160 Continue(s) |
161 Crate(s) |
162 Dyn(s) |
163 Default(s) |
164 DivideEquals(s) |
165 DocCommentInnerLine(s) |
166 DocCommentInnerBlock(s)|
167 DocCommentOuterLine(s) |
168 DocCommentOuterBlock(s)|
169 Dollar(s) |
170 DoubleAmpersand(s) |
171 DoubleColon(s) |
172 DoubleEquals(s) |
173 DoubleLeftAngle(s) |
174 DoublePeriod(s) |
175 DoublePeriodEquals(s) |
176 DoublePipe(s) |
177 DoubleRightAngle(s) |
178 Else(s) |
179 EndOfFile(s) |
180 Enum(s) |
181 Equals(s) |
182 Extern(s) |
183 Fn(s) |
184 For(s) |
185 GreaterThanOrEquals(s) |
186 Hash(s) |
187 Ident(s) |
188 If(s) |
189 Impl(s) |
190 In(s) |
191 LeftAngle(s) |
192 LeftCurly(s) |
193 LeftParen(s) |
194 LeftSquare(s) |
195 LessThanOrEquals(s) |
196 Let(s) |
197 Lifetime(s) |
198 Loop(s) |
199 Match(s) |
200 Minus(s) |
201 MinusEquals(s) |
202 Mod(s) |
203 Move(s) |
204 Mut(s) |
205 NotEqual(s) |
206 Percent(s) |
207 PercentEquals(s) |
208 Period(s) |
209 Pipe(s) |
210 PipeEquals(s) |
211 Plus(s) |
212 PlusEquals(s) |
213 Pub(s) |
214 QuestionMark(s) |
215 Ref(s) |
216 Return(s) |
217 RightAngle(s) |
218 RightCurly(s) |
219 RightParen(s) |
220 RightSquare(s) |
221 SelfIdent(s) |
222 Semicolon(s) |
223 ShiftLeftEquals(s) |
224 ShiftRightEquals(s) |
225 Slash(s) |
226 Static(s) |
227 String(s) |
228 StringRaw(s) |
229 Struct(s) |
230 ThickArrow(s) |
231 ThinArrow(s) |
232 Tilde(s) |
233 TimesEquals(s) |
234 Trait(s) |
235 TriplePeriod(s) |
236 Type(s) |
237 Union(s) |
238 Unsafe(s) |
239 Use(s) |
240 Where(s) |
241 While(s) |
242 Whitespace(s) => s,
243
244 Number(s) => s.extent(),
245 }
246 }
247}
248
/// A numeric literal, classified by the base it was written in.
///
/// Each variant wraps a struct generated by the `number!` macro.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Decompose)]
pub enum Number {
    Binary(NumberBinary),
    Decimal(NumberDecimal),
    Hexadecimal(NumberHexadecimal),
    Octal(NumberOctal),
}
256
257impl Number {
258 fn extent(&self) -> Extent {
259 use self::Number::*;
260
261 match *self {
262 Binary(n) => n.extent(),
263 Decimal(n) => n.extent(),
264 Hexadecimal(n) => n.extent(),
265 Octal(n) => n.extent(),
266 }
267 }
268
269 pub fn into_simple(self) -> Option<Extent> {
270 match self {
271 Number::Decimal(d) => {
272 if d.fractional.is_none() &&
273 d.exponent.is_none() &&
274 d.type_suffix.is_none() &&
275 d.underscores == 0
276 {
277 Some(d.extent)
278 } else {
279 None
280 }
281 }
282 _ => None
283 }
284 }
285}
286
// Generates a struct describing one flavour of numeric literal, together
// with a private `finish` constructor and an `extent` accessor.
macro_rules! number {
    ($name:ident) => {
        #[derive(Debug, Copy, Clone, PartialEq, Eq)]
        pub struct $name {
            /// The entire literal.
            pub extent: Extent,
            /// The digits before any `.`.
            pub integral: Extent,
            /// The fractional part, when one was parsed.
            pub fractional: Option<Extent>,
            /// The exponent, when one was parsed.
            pub exponent: Option<Extent>,
            /// The trailing type suffix, when one was parsed.
            pub type_suffix: Option<Extent>,
            // Count of `_` characters reported by the digit parser.
            underscores: usize,
        }

        impl $name {
            /// Combines the partially-parsed details with the pieces that
            /// are only known once the whole literal has been consumed.
            fn finish(
                details: NumberDetailsPartial,
                extent: Extent,
                exponent: Option<Extent>,
                type_suffix: Option<Extent>,
            ) -> $name {
                $name {
                    extent,
                    integral: details.integral,
                    fractional: details.fractional,
                    exponent,
                    type_suffix,
                    underscores: details.underscores,
                }
            }

            /// Returns the extent of the entire literal.
            pub fn extent(&self) -> Extent {
                self.extent
            }
        }
    }
}
315
// One struct per numeric base; these are the payloads of the `Number` variants.
number!(NumberBinary);
number!(NumberDecimal);
number!(NumberHexadecimal);
number!(NumberOctal);
320
/// The reasons a tokenizer rule can fail.
///
/// `Ord` is derived because failures are collected into a `BTreeSet`
/// (see `ErrorDetail`), so the variant order here determines the order in
/// which they are iterated and displayed.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum Error {
    /// A specific piece of literal text was expected.
    Literal(&'static str),
    ExpectedIdentOrKeyword,
    ExpectedIdent,
    ExpectedNumber,
    ExpectedHex,
    ExpectedWhitespace,
    ExpectedComment,
    ExpectedCharacter,
    UnterminatedRawString,
    /// `r#` was not followed by an identifier.
    RawIdentifierMissingIdentifier,

    /// A `.` after a number was followed by another `.` or by an identifier,
    /// so it is not the start of a fractional part.
    InvalidFollowForFractionalNumber,
}
337
338impl peresil::Recoverable for Error {
339 fn recoverable(&self) -> bool {
340 use Error::*;
341
342 match self {
343 RawIdentifierMissingIdentifier => false,
344 _ => true,
345 }
346 }
347}
348
/// Describes a tokenization failure: where it happened and what was expected.
#[derive(Debug, PartialEq, Eq)]
pub struct ErrorDetail {
    // Offset into the source text at which tokenization failed.
    location: usize,
    // The distinct errors reported for that failure.
    errors: BTreeSet<Error>,
}
355
356impl ErrorDetail {
357 pub fn with_text<'a>(&'a self, text: &'a str) -> ErrorDetailText<'a> {
359 ErrorDetailText { detail: self, text }
360 }
361}
362
/// An `ErrorDetail` together with the source text it refers to; implements
/// `Display` to render a human-readable report.
#[derive(Debug)]
pub struct ErrorDetailText<'a> {
    // The failure being reported.
    detail: &'a ErrorDetail,
    // The text that was being tokenized.
    text: &'a str,
}
369
370impl<'a> fmt::Display for ErrorDetailText<'a> {
371 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
372 let human = HumanTextError::new(self.text, self.detail.location);
373
374 writeln!(f, "Unable to tokenize text (line {}, column {})", human.line, human.column)?;
375 writeln!(f, "{}{}", human.head_of_line, human.tail_of_line)?;
376 writeln!(f, "{:>width$}", "^", width = human.column)?;
377 writeln!(f, "Expected:")?;
378 for e in &self.detail.errors {
379 writeln!(f, " {:?}", e)?; }
381 Ok(())
382 }
383}
384
// Shorthand for the peresil types, specialised to string input and this
// module's `Error` type.
type Point<'s> = peresil::StringPoint<'s>;
type Master<'s> = peresil::ParseMaster<Point<'s>, Error>;
type Progress<'s, T> = peresil::Progress<Point<'s>, T, Error>;
388
/// An iterator that tokenizes a string, yielding one `Token` at a time.
pub struct Tokens<'s> {
    // Parse state shared by all tokenizer rules.
    pm: Master<'s>,
    // The current position: remaining text plus its offset in the input.
    pt: Point<'s>,
    // Set once `EndOfFile` has been yielded; `next` returns `None` afterwards.
    is_exhausted: bool,
}
394
395impl<'s> Tokens<'s> {
396 pub fn new(code: &'s str) -> Self {
397 Tokens {
398 pm: Master::new(),
399 pt: Point::new(code),
400 is_exhausted: false,
401 }
402 }
403}
404
405impl<'s> Iterator for Tokens<'s> {
406 type Item = Result<Token, ErrorDetail>;
407
408 fn next(&mut self) -> Option<Self::Item> {
409 if self.is_exhausted {
410 return None
411 }
412
413 if self.pt.s.is_empty() {
414 self.is_exhausted = true;
415 return Some(Ok(Token::EndOfFile(Extent(self.pt.offset, self.pt.offset))));
416 }
417
418 let tok = single_token(&mut self.pm, self.pt);
419 let tok = self.pm.finish(tok);
420
421 match tok {
422 peresil::Progress { status: peresil::Status::Success(value), point } => {
423 assert_ne!(self.pt.offset, point.offset, "Tokenizer did not make progress");
424 self.pt = point;
425 Some(Ok(value))
426 }
427 peresil::Progress { status: peresil::Status::Failure(errors), point } => {
428 Some(Err(ErrorDetail {
429 location: point.offset,
430 errors: errors.into_iter().collect(),
431 }))
432 }
433 }
434 }
435}
436
/// Tokenizes exactly one token starting at `pt` by trying each rule below.
///
/// NOTE(review): the ordering looks load-bearing — comments and literals
/// come before the operators that share their first character (`/`, `'`,
/// `r`, `b`), and longer operators are listed before their own prefixes.
/// This presumes peresil's `alternate` keeps the first alternative that
/// succeeds; confirm against peresil before reordering.
fn single_token<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Token> {
    pm.alternate(pt)
        // Comments and literals
        .one(comment_or_doc_comment)
        .one(map(character, Token::Character))
        .one(map(string, Token::String))
        .one(map(string_raw, Token::StringRaw))
        .one(map(byte, Token::Byte))
        .one(map(byte_string, Token::ByteString))
        .one(map(byte_string_raw, Token::ByteStringRaw))
        .one(map(lifetime, Token::Lifetime))

        // Three-character operators
        .one(map(literal(">>="), Token::ShiftRightEquals))
        .one(map(literal("<<="), Token::ShiftLeftEquals))
        .one(map(literal("..."), Token::TriplePeriod))
        .one(map(literal("..="), Token::DoublePeriodEquals))

        // Two-character operators
        .one(map(literal("!="), Token::NotEqual))
        .one(map(literal("%="), Token::PercentEquals))
        .one(map(literal("&&"), Token::DoubleAmpersand))
        .one(map(literal("&="), Token::AmpersandEquals))
        .one(map(literal("*="), Token::TimesEquals))
        .one(map(literal("+="), Token::PlusEquals))
        .one(map(literal("-="), Token::MinusEquals))
        .one(map(literal("->"), Token::ThinArrow))
        .one(map(literal("/="), Token::DivideEquals))
        .one(map(literal("<<"), Token::DoubleLeftAngle))
        .one(map(literal("<="), Token::LessThanOrEquals))
        .one(map(literal("=="), Token::DoubleEquals))
        .one(map(literal("=>"), Token::ThickArrow))
        .one(map(literal(">="), Token::GreaterThanOrEquals))
        .one(map(literal(">>"), Token::DoubleRightAngle))
        .one(map(literal("^="), Token::CaretEquals))
        .one(map(literal("|="), Token::PipeEquals))
        .one(map(literal(".."), Token::DoublePeriod))
        .one(map(literal("::"), Token::DoubleColon))
        .one(map(literal("||"), Token::DoublePipe))

        // One-character symbols
        .one(map(literal("!"), Token::Bang))
        .one(map(literal("#"), Token::Hash))
        .one(map(literal("$"), Token::Dollar))
        .one(map(literal("%"), Token::Percent))
        .one(map(literal("&"), Token::Ampersand))
        .one(map(literal("*"), Token::Asterisk))
        .one(map(literal("+"), Token::Plus))
        .one(map(literal(","), Token::Comma))
        .one(map(literal("-"), Token::Minus))
        .one(map(literal("."), Token::Period))
        .one(map(literal("/"), Token::Slash))
        .one(map(literal(":"), Token::Colon))
        .one(map(literal(";"), Token::Semicolon))
        .one(map(literal("="), Token::Equals))
        .one(map(literal("?"), Token::QuestionMark))
        .one(map(literal("@"), Token::At))
        .one(map(literal("^"), Token::Caret))
        .one(map(literal("|"), Token::Pipe))
        .one(map(literal("~"), Token::Tilde))
        .one(map(literal(r#"\"#), Token::Backslash))

        // Paired delimiters
        .one(map(literal("("), Token::LeftParen))
        .one(map(literal(")"), Token::RightParen))
        .one(map(literal("<"), Token::LeftAngle))
        .one(map(literal(">"), Token::RightAngle))
        .one(map(literal("["), Token::LeftSquare))
        .one(map(literal("]"), Token::RightSquare))
        .one(map(literal("{"), Token::LeftCurly))
        .one(map(literal("}"), Token::RightCurly))

        // Words, numbers, and whitespace
        .one(keyword_or_ident)
        .one(map(number, Token::Number))
        .one(map(whitespace, Token::Whitespace))
        .finish()
}
514
515fn keyword_or_ident<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Token> {
516 if pt.s.starts_with("r#") {
517 let idx = ident_len(&pt.s[2..]);
518 if idx == 0 {
519 return Progress::failure(pt, Error::RawIdentifierMissingIdentifier);
520 }
521 return split_point_at_non_zero_offset(pt, 2 + idx, Error::ExpectedIdentOrKeyword)
522 .map(|(_, extent)| Token::Ident(extent));
523 }
524
525 let idx = ident_len(pt.s);
526
527 split_point_at_non_zero_offset(pt, idx, Error::ExpectedIdentOrKeyword).map(
528 |(s, extent)| match s {
529 "as" => Token::As(extent),
530 "async" => Token::Async(extent),
531 "auto" => Token::Auto(extent),
532 "box" => Token::Box(extent),
533 "break" => Token::Break(extent),
534 "const" => Token::Const(extent),
535 "continue" => Token::Continue(extent),
536 "crate" => Token::Crate(extent),
537 "default" => Token::Default(extent),
538 "dyn" => Token::Dyn(extent),
539 "else" => Token::Else(extent),
540 "enum" => Token::Enum(extent),
541 "extern" => Token::Extern(extent),
542 "fn" => Token::Fn(extent),
543 "for" => Token::For(extent),
544 "if" => Token::If(extent),
545 "impl" => Token::Impl(extent),
546 "in" => Token::In(extent),
547 "let" => Token::Let(extent),
548 "loop" => Token::Loop(extent),
549 "match" => Token::Match(extent),
550 "mod" => Token::Mod(extent),
551 "move" => Token::Move(extent),
552 "mut" => Token::Mut(extent),
553 "pub" => Token::Pub(extent),
554 "ref" => Token::Ref(extent),
555 "return" => Token::Return(extent),
556 "self" => Token::SelfIdent(extent),
557 "static" => Token::Static(extent),
558 "struct" => Token::Struct(extent),
559 "trait" => Token::Trait(extent),
560 "type" => Token::Type(extent),
561 "use" => Token::Use(extent),
562 "union" => Token::Union(extent),
563 "unsafe" => Token::Unsafe(extent),
564 "where" => Token::Where(extent),
565 "while" => Token::While(extent),
566 _ => Token::Ident(extent),
567 },
568 )
569}
570
571fn simple_ident<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
572 let idx = ident_len(pt.s);
573 split_point_at_non_zero_offset(pt, idx, Error::ExpectedIdent).map(|(_, e)| e)
574}
575
576fn ident_len<'s>(s: &str) -> usize {
577 let mut ci = s.chars();
578 let mut idx = 0;
579
580 if let Some(c) = ci.next() {
581 if UnicodeXID::is_xid_start(c) || c == '_' {
582 idx += c.len_utf8();
583
584 idx += ci
585 .take_while(|&c| UnicodeXID::is_xid_continue(c))
586 .map(|c| c.len_utf8())
587 .sum::<usize>();
588 }
589 }
590
591 idx
592}
593
/// A number whose base and digits have been parsed, but whose exponent,
/// type suffix, and overall extent are not yet known.
enum NumberPartial {
    Binary(NumberDetailsPartial),
    Decimal(NumberDetailsPartial),
    Hexadecimal(NumberDetailsPartial),
    Octal(NumberDetailsPartial),
}
600
601impl NumberPartial {
602 fn finish(self, extent: Extent, exponent: Option<Extent>, type_suffix: Option<Extent>) ->
603 Number
604 {
605 match self {
606 NumberPartial::Binary(v) => {
607 Number::Binary(NumberBinary::finish(v, extent, exponent, type_suffix))
608 },
609 NumberPartial::Decimal(v) => {
610 Number::Decimal(NumberDecimal::finish(v, extent, exponent, type_suffix))
611 },
612 NumberPartial::Hexadecimal(v) => {
613 Number::Hexadecimal(NumberHexadecimal::finish(v, extent, exponent, type_suffix))
614 },
615 NumberPartial::Octal(v) => {
616 Number::Octal(NumberOctal::finish(v, extent, exponent, type_suffix))
617 },
618 }
619 }
620}
621
/// The parts of a number produced by `number_base`.
struct NumberDetailsPartial {
    // The digits after the base prefix and before any `.`.
    integral: Extent,
    // The `.` and any digits after it, when present.
    fractional: Option<Extent>,
    // Number of `_` characters counted in the integral digits.
    underscores: usize,
}
627
628fn number<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Number> {
629 sequence!(pm, pt, {
630 spt = point;
631 value = number_value;
632 exponent = optional(number_exponent);
633 type_suffix = optional(simple_ident);
634 }, |_, pt| value.finish(ex(spt, pt), exponent, type_suffix))
635}
636
637fn number_value<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, NumberPartial> {
638 pm.alternate(pt)
639 .one(map(number_base("0b", 2), NumberPartial::Binary))
640 .one(map(number_base("0x", 16), NumberPartial::Hexadecimal))
641 .one(map(number_base("0o", 8), NumberPartial::Octal))
642 .one(map(number_base("", 10), NumberPartial::Decimal))
643 .finish()
644}
645
646fn number_base<'s>(prefix: &'static str, radix: u32) ->
647 impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, NumberDetailsPartial>
648{
649 move |pm, pt| {
650 sequence!(pm, pt, {
651 _ = literal(prefix);
652 (integral, underscores) = number_digits(radix);
653 fractional = optional(number_fractional(radix));
654 }, |_, _| NumberDetailsPartial { integral, fractional, underscores })
655 }
656}
657
658fn number_fractional<'s>(radix: u32) ->
659 impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, Extent>
660{
661 move |pm, pt| {
662 sequence!(pm, pt, {
663 spt = point;
664 _ = literal(".");
665 _ = not(peek(literal(".")), Error::InvalidFollowForFractionalNumber);
666 _ = not(peek(simple_ident), Error::InvalidFollowForFractionalNumber);
667 _ = optional(number_digits(radix));
668 }, |_, pt| ex(spt, pt))
669 }
670}
671
672fn number_digits<'s>(radix: u32) ->
673 impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, (Extent, usize)>
674{
675 move |_, pt| {
676 let mut underscores = 0;
677 let ci = pt.s.chars();
678 let idx = ci
679 .take_while(|&c| c.is_digit(radix) || c == '_')
680 .inspect(|&c| if c == '_' { underscores += 1 })
681 .map(|c| c.len_utf8())
682 .sum();
683
684 split_point_at_non_zero_offset(pt, idx, Error::ExpectedNumber).map(|(_, e)| (e, underscores))
685 }
686}
687
688fn number_exponent<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
690 pm.alternate(pt)
691 .one(number_exponent_lowercase)
692 .one(number_exponent_uppercase)
693 .finish()
694}
695
696fn number_exponent_lowercase<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
697 sequence!(pm, pt, {
698 _ = literal("e");
699 (value, _) = number_digits(10);
700 }, |_, _| value)
701}
702
703fn number_exponent_uppercase<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
704 sequence!(pm, pt, {
705 _ = literal("E");
706 (value, _) = number_digits(10);
707 }, |_, _| value)
708}
709
710fn whitespace<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
711 let ci = pt.s.chars();
712 let idx = ci.take_while(|&c| {
713 c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\u{200e}' || c == '\u{200f}'
714 }).map(|c| c.len_utf8()).sum();
715
716 split_point_at_non_zero_offset(pt, idx, Error::ExpectedWhitespace).map(|(_, e)| e)
717}
718
719fn comment_or_doc_comment<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Token> {
720 let spt = pt;
721 if pt.s.starts_with("///") && !pt.s.starts_with("////") {
722 let eol = pt.s.find('\n').unwrap_or_else(|| pt.s.len());
723 let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
724 Progress::success(pt, Token::DocCommentOuterLine(ex(spt, pt)))
725 } else if pt.s.starts_with("//!") {
726 let eol = pt.s.find('\n').unwrap_or_else(|| pt.s.len());
727 let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
728 Progress::success(pt, Token::DocCommentInnerLine(ex(spt, pt)))
729 } else if pt.s.starts_with("//") {
730 let eol = pt.s.find('\n').unwrap_or_else(|| pt.s.len());
731 let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
732 Progress::success(pt, Token::CommentLine(ex(spt, pt)))
733 } else if pt.s.starts_with("/**") && !pt.s.starts_with("/***") && !pt.s.starts_with("/**/") {
734 let eol = pt.s[3..].find("*/").map(|x| 3 + x + 2).unwrap_or_else(|| pt.s.len());
735 let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
736 Progress::success(pt, Token::DocCommentOuterBlock(ex(spt, pt)))
737 } else if pt.s.starts_with("/*!") {
738 let eol = pt.s[3..].find("*/").map(|x| 3 + x + 2).unwrap_or_else(|| pt.s.len());
739 let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
740 Progress::success(pt, Token::DocCommentInnerBlock(ex(spt, pt)))
741 } else if pt.s.starts_with("/*") {
742 let eol = pt.s[2..].find("*/").map(|x| 2 + x + 2).unwrap_or_else(|| pt.s.len());
743 let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
744 Progress::success(pt, Token::CommentBlock(ex(spt, pt)))
745 } else {
746 Progress::failure(pt, Error::ExpectedComment)
747 }
748}
749
750fn character<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
751 sequence!(pm, pt, {
752 spt = point;
753 _ = literal("'");
754 _ = character_char;
755 _ = literal("'");
756 }, |_, pt| ex(spt, pt))
757}
758
759fn character_char<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
760 pm.alternate(pt)
761 .one(escaped_char)
762 .one(single_char)
763 .finish()
764}
765
766fn escaped_char<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
767 sequence!(pm, pt, {
768 spt = point;
769 _ = literal("\\");
770 _ = escaped_char_code;
771 }, |_, pt| spt.to(pt))
772}
773
774fn escaped_char_code<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
775 pm.alternate(pt)
776 .one(literal("n"))
777 .one(literal("r"))
778 .one(literal("t"))
779 .one(literal("\\"))
780 .one(literal("'"))
781 .one(literal("\""))
782 .one(literal("0"))
783 .one(escaped_char_hex)
784 .one(escaped_char_unicode)
785 .finish()
786}
787
788fn escaped_char_hex<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
789 sequence!(pm, pt, {
790 spt = point;
791 _ = literal("x");
792 _ = hex_string;
793 }, |_, pt| ex(spt, pt))
794}
795
796fn escaped_char_unicode<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
797 sequence!(pm, pt, {
798 spt = point;
799 _ = literal("u{");
800 _ = hex_string;
801 _ = literal("}");
802 }, |_, pt| ex(spt, pt))
803}
804
805fn hex_string<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
806 let ci = pt.s.chars();
807 let idx = ci.take_while(|c| c.is_digit(16)).map(|c| c.len_utf8()).sum();
808
809 let idx = if idx == 0 { None } else { Some(idx) };
810 pt.consume_to(idx).map_err(|_| Error::ExpectedHex)
811}
812
813fn single_char<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
814 match pt.s.char_indices().next() {
815 Some((_, c)) => {
816 let i = c.len_utf8();
817 let (head, tail) = pt.s.split_at(i);
818 let pt = Point { s: tail, offset: pt.offset + i };
819 Progress::success(pt, head)
820 }
821 None => {
822 Progress::failure(pt, Error::ExpectedCharacter)
823 }
824 }
825}
826
827fn string<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
828 sequence!(pm, pt, {
829 spt = point;
830 _ = literal("\"");
831 _ = string_char;
832 _ = literal("\"");
833 }, |_, pt| ex(spt, pt))
834}
835
836fn string_char<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
837 let res = |i| {
838 let (head, tail) = pt.s.split_at(i);
839 let pt = Point { s: tail, offset: pt.offset + i };
840 Progress::success(pt, head)
841 };
842
843 let mut escaped = false;
844 for (i, c) in pt.s.char_indices() {
845 match (escaped, c) {
846 (true, _) => escaped = false,
847 (false, '\\') => escaped = true,
848 (false, '"') => return res(i),
849 (false, _) => { },
850 }
851 }
852
853 res(pt.s.len())
854}
855
856fn string_raw<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
857 sequence!(pm, pt, {
858 spt = point;
859 _ = literal("r");
860 h = zero_or_more(literal("#"));
861 _ = literal(r#"""#);
862 _ = raw_string_tail(h.len());
863 }, |_, pt| ex(spt, pt))
864}
865
866fn raw_string_tail<'s>(hashes: usize) -> impl Fn(&mut Master<'s>, Point<'s>) ->
867 Progress<'s, &'s str>
868{
869 let mut s = r#"""#.to_string();
870 for _ in 0..hashes { s.push('#') };
871
872 move |_, pt| {
873 match pt.s.find(&s) {
874 Some(end) => {
875 let (str_content, quote_tail) = pt.s.split_at(end);
876 let (_quotes, tail) = quote_tail.split_at(s.len());
877 let pt = Point { s: tail, offset: pt.offset + end + s.len() };
878 Progress::success(pt, str_content)
879 }
880 None => {
881 Progress::failure(pt, Error::UnterminatedRawString)
882 }
883 }
884 }
885}
886
887fn byte<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
888 sequence!(pm, pt, {
889 spt = point;
890 _ = literal("b");
891 _ = character;
892 }, |_, pt| ex(spt, pt))
893}
894
895fn byte_string<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
896 sequence!(pm, pt, {
897 spt = point;
898 _ = literal("b");
899 _ = string;
900 }, |_, pt| ex(spt, pt))
901}
902
903fn byte_string_raw<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
904 sequence!(pm, pt, {
905 spt = point;
906 _ = literal("b");
907 _ = string_raw;
908 }, |_, pt| ex(spt, pt))
909}
910
911fn lifetime<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
912 sequence!(pm, pt, {
913 spt = point;
914 _ = literal("'");
915 _ = simple_ident;
916 }, |_, pt| ex(spt, pt))
917}
918
919fn literal<'s>(expected: &'static str) ->
920 impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, Extent>
921{
922 move |_, spt| {
923 let (pt, _) = try_parse!(spt.consume_literal(expected).map_err(|_| Error::Literal(expected)));
924 Progress::success(pt, ex(spt, pt))
925 }
926}
927
928fn ex(start: Point, end: Point) -> Extent {
929 let ex = Extent(start.offset, end.offset);
930 assert!(ex.1 >= ex.0, "{} does not come before {}", ex.1, ex.0);
931 ex
932}
933
934fn split_point_at_non_zero_offset(pt: Point<'_>, idx: usize, e: Error) ->
935 Progress<'_, (&'_ str, Extent)>
936{
937 if idx == 0 {
938 peresil::Progress::failure(pt, e)
939 } else {
940 let (matched, tail) = pt.s.split_at(idx);
941 let end = pt.offset + idx;
942 let end_pt = Point { s: tail, offset: end };
943
944 peresil::Progress::success(end_pt, (matched, Extent(pt.offset, end)))
945 }
946}
947
// Unit tests for the tokenizer. Most tests tokenize a short input and check
// the variant and extent of the first token.
#[cfg(test)]
mod test {
    use super::*;

    // Tokenizes `$input` and unwraps the first token as variant `$p`.
    macro_rules! tokenize_as {
        ($input:expr, $p:path) => ({
            let toks = tok($input);
            unwrap_as!(toks[0], $p)
        })
    }

    // Tokenizes `s`, panicking if tokenization fails.
    fn tok(s: &str) -> Vec<Token> {
        tok_full(s).expect("Tokenization failed")
    }

    // Tokenizes `s`, returning the first error if there is one.
    fn tok_full(s: &str) -> Result<Vec<Token>, ErrorDetail> {
        Tokens::new(s).collect()
    }

    #[test]
    fn keyword_is_not_an_ident() {
        let s = tokenize_as!("for", Token::For);
        assert_eq!(s, (0, 3))
    }

    #[test]
    fn ident_can_have_keyword_substring() {
        let s = tokenize_as!("form", Token::Ident);
        assert_eq!(s, (0, 4))
    }

    #[test]
    fn raw_idents_can_be_keywords() {
        let s = tokenize_as!("r#for", Token::Ident);
        assert_eq!(s, (0, 5))
    }

    #[test]
    fn raw_idents_require_some_identifier() {
        let tokens = tok_full("r#").unwrap_err();
        assert!(tokens
            .errors
            .contains(&Error::RawIdentifierMissingIdentifier));
    }

    #[test]
    fn character() {
        let s = tokenize_as!("'a'", Token::Character);
        assert_eq!(s, (0, 3));
    }

    #[test]
    fn character_escaped() {
        let s = tokenize_as!(r#"'\\'"#, Token::Character);
        assert_eq!(s, (0, 4));
    }

    #[test]
    fn character_escaped_hex() {
        let s = tokenize_as!(r#"'\x41'"#, Token::Character);
        assert_eq!(s, (0, 6));
    }

    #[test]
    fn character_escaped_unicode() {
        let s = tokenize_as!(r#"'\u{1F63B}'"#, Token::Character);
        assert_eq!(s, (0, 11));
    }

    // Lifetimes must not be mistaken for character literals.
    #[test]
    fn character_limited_to_single() {
        let toks = tok("impl<'a> Foo<'a> for Bar<'a> { }");

        let s = unwrap_as!(toks[2], Token::Lifetime);
        assert_eq!(s, (5, 7));

        let s = unwrap_as!(toks[7], Token::Lifetime);
        assert_eq!(s, (13, 15));

        let s = unwrap_as!(toks[14], Token::Lifetime);
        assert_eq!(s, (25, 27));
    }

    #[test]
    fn string_raw() {
        let s = tokenize_as!(r###"r#"inner"#"###, Token::StringRaw);
        assert_eq!(s, (0, 10));
    }

    #[test]
    fn byte() {
        let s = tokenize_as!(r#"b'a'"#, Token::Byte);
        assert_eq!(s, (0, 4));
    }

    #[test]
    fn byte_string() {
        let s = tokenize_as!(r#"b"abc""#, Token::ByteString);
        assert_eq!(s, (0, 6));
    }

    #[test]
    fn byte_string_raw() {
        let s = tokenize_as!(r#"br"abc""#, Token::ByteStringRaw);
        assert_eq!(s, (0, 7));
    }

    #[test]
    fn tilde_is_a_token_even_though_unused() {
        let s = tokenize_as!("~", Token::Tilde);
        assert_eq!(s, (0, 1));
    }

    #[test]
    fn number_binary() {
        let s = tokenize_as!("0b0101", Token::Number);
        assert_eq!(s.extent(), (0, 6));
        let n = unwrap_as!(s, Number::Binary);
        assert_eq!(n.integral, (2, 6));
    }

    #[test]
    fn number_decimal() {
        let s = tokenize_as!("123456", Token::Number);
        assert_eq!(s.extent(), (0, 6));
        let n = unwrap_as!(s, Number::Decimal);
        assert_eq!(n.integral, (0, 6));
        let n = s.into_simple();
        assert_eq!(n, Some(Extent(0, 6)));
    }

    #[test]
    fn number_hexadecimal() {
        let s = tokenize_as!("0xBeeF", Token::Number);
        assert_eq!(s.extent(), (0, 6));
        let n = unwrap_as!(s, Number::Hexadecimal);
        assert_eq!(n.integral, (2, 6));
    }

    #[test]
    fn number_octal() {
        let s = tokenize_as!("0o0777", Token::Number);
        assert_eq!(s.extent(), (0, 6));
        let n = unwrap_as!(s, Number::Octal);
        assert_eq!(n.integral, (2, 6));
    }

    #[test]
    fn number_decimal_with_decimal() {
        let s = tokenize_as!("0.", Token::Number);
        assert_eq!(s.extent(), (0, 2));
        let n = unwrap_as!(s, Number::Decimal);
        assert_eq!(n.integral, (0, 1));
        assert_eq!(n.fractional, Some(Extent(1, 2)));
    }

    #[test]
    fn number_with_decimal() {
        let s = tokenize_as!("0xA.", Token::Number);
        assert_eq!(s.extent(), (0, 4));
        let n = unwrap_as!(s, Number::Hexadecimal);
        assert_eq!(n.integral, (2, 3));
        assert_eq!(n.fractional, Some(Extent(3, 4)));
    }

    #[test]
    fn number_with_fractional_part() {
        let s = tokenize_as!("0b01.10", Token::Number);
        assert_eq!(s.extent(), (0, 7));
        let n = unwrap_as!(s, Number::Binary);
        assert_eq!(n.integral, (2, 4));
        assert_eq!(n.fractional, Some(Extent(4, 7)));
    }

    // The exponent extent covers the digits only, not the `E` marker.
    #[test]
    fn number_with_exponent() {
        let s = tokenize_as!("0b1000E7", Token::Number);
        assert_eq!(s.extent(), (0, 8));
        let n = unwrap_as!(s, Number::Binary);
        assert_eq!(n.integral, (2, 6));
        assert_eq!(n.exponent, Some(Extent(7, 8)));
    }

    // A trailing `_` belongs to the digits, not to the type suffix.
    #[test]
    fn number_with_type_suffix() {
        let s = tokenize_as!("0o1234_usize", Token::Number);
        assert_eq!(s.extent(), (0, 12));
        let n = unwrap_as!(s, Number::Octal);
        assert_eq!(n.integral, (2, 7));
        assert_eq!(n.type_suffix, Some(Extent(7, 12)));
    }

    #[test]
    fn number_with_spacers() {
        let s = tokenize_as!("0x0A_1b_2C_3d", Token::Number);
        assert_eq!(s.extent(), (0, 13));
        let n = unwrap_as!(s, Number::Hexadecimal);
        assert_eq!(n.integral, (2, 13));
    }

    #[test]
    fn number_decimal_with_spacers() {
        let s = tokenize_as!("01_23", Token::Number);
        assert_eq!(s.extent(), (0, 5));
        let n = unwrap_as!(s, Number::Decimal);
        assert_eq!(n.integral, (0, 5));
    }

    #[test]
    fn number_with_everything() {
        let s = tokenize_as!("0o__12__56__.43__e__32__my_type", Token::Number);
        assert_eq!(s.extent(), (0, 31));
        let n = unwrap_as!(s, Number::Octal);
        assert_eq!(n.integral, (2, 12));
        assert_eq!(n.fractional, Some(Extent(12, 17)));
        assert_eq!(n.exponent, Some(Extent(18, 24)));
        assert_eq!(n.type_suffix, Some(Extent(24, 31)));
    }

    #[test]
    fn number_decimal_with_leading_spacer_is_an_ident() {
        let s = tokenize_as!("_42", Token::Ident);
        assert_eq!(s, (0, 3));
    }

    #[test]
    fn number_followed_by_range_is_not_fractional() {
        let toks = tok("1..2");

        let s = unwrap_as!(toks[0], Token::Number);
        assert_eq!(s.extent(), (0, 1));

        let s = unwrap_as!(toks[1], Token::DoublePeriod);
        assert_eq!(s, (1, 3));

        let s = unwrap_as!(toks[2], Token::Number);
        assert_eq!(s.extent(), (3, 4));
    }

    #[test]
    fn number_followed_by_ident_is_not_fractional() {
        let toks = tok("1.foo");

        let s = unwrap_as!(toks[0], Token::Number);
        assert_eq!(s.extent(), (0, 1));

        let s = unwrap_as!(toks[1], Token::Period);
        assert_eq!(s, (1, 2));

        let s = unwrap_as!(toks[2], Token::Ident);
        assert_eq!(s, (2, 5));
    }

    // Each direction marker is three bytes long in UTF-8.
    #[test]
    fn whitespace_unicode_direction_markers() {
        let s = tokenize_as!("\u{200e}\u{200f}", Token::Whitespace);
        assert_eq!(s, (0, 6))
    }

    #[test]
    fn comment_block() {
        let s = tokenize_as!("/* hi */", Token::CommentBlock);
        assert_eq!(s, (0, 8))
    }

    // The `/` right after `/*` must not be read as closing the comment.
    #[test]
    fn comment_block_not_immediately_closed() {
        let s = tokenize_as!("/*/ */", Token::CommentBlock);
        assert_eq!(s, (0, 6))
    }

    // `/**/` is an empty plain comment, not a doc comment.
    #[test]
    fn comment_block_immediately_closed() {
        let s = tokenize_as!("/**/", Token::CommentBlock);
        assert_eq!(s, (0, 4))
    }

    #[test]
    fn doc_comment_outer_block() {
        let s = tokenize_as!("/** hi */", Token::DocCommentOuterBlock);
        assert_eq!(s, (0, 9))
    }

    #[test]
    fn doc_comment_inner_block() {
        let s = tokenize_as!("/*! hi */", Token::DocCommentInnerBlock);
        assert_eq!(s, (0, 9))
    }

    #[test]
    fn doc_comment_outer_line() {
        let s = tokenize_as!("/// hi", Token::DocCommentOuterLine);
        assert_eq!(s, (0, 6))
    }

    #[test]
    fn doc_comment_inner_line() {
        let s = tokenize_as!("//! hi", Token::DocCommentInnerLine);
        assert_eq!(s, (0, 6))
    }

    #[test]
    fn end_of_file() {
        let s = tokenize_as!("", Token::EndOfFile);
        assert_eq!(s, (0, 0))
    }
}
1254}