fuzzy_pickles/
tokenizer.rs

1use std::collections::BTreeSet;
2use std::fmt;
3
4use unicode_xid::UnicodeXID;
5use peresil::combinators::*;
6
7use crate::{Extent, HumanTextError};
8use crate::combinators::{not, peek};
9
/// A single lexical token produced by the tokenizer.
///
/// Every variant except `Number` carries the `Extent` (start and end
/// offsets into the source) of the text it covers. `Number` carries a
/// structured number literal instead; its extent is available through
/// `Token::extent`.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Decompose)]
pub enum Token {
    // Paired delimiters
    LeftAngle(Extent),
    LeftCurly(Extent),
    LeftParen(Extent),
    LeftSquare(Extent),
    RightAngle(Extent),
    RightCurly(Extent),
    RightParen(Extent),
    RightSquare(Extent),

    // Symbols
    //
    // TODO: Decide how to name the foo-equals tokens.
    // Should they match (e.g. caret and caret-equals)?
    // Should they infer any meaning (e.g. xor)
    Ampersand(Extent),
    AmpersandEquals(Extent),
    Asterisk(Extent),
    At(Extent),
    Backslash(Extent),
    Bang(Extent),
    Caret(Extent),
    CaretEquals(Extent),
    Colon(Extent),
    Comma(Extent),
    DivideEquals(Extent),
    Dollar(Extent),
    DoubleAmpersand(Extent),
    DoubleColon(Extent),
    DoubleEquals(Extent),
    DoubleLeftAngle(Extent),
    DoublePeriod(Extent),
    DoublePeriodEquals(Extent),
    DoublePipe(Extent),
    DoubleRightAngle(Extent),
    Equals(Extent),
    GreaterThanOrEquals(Extent),
    Hash(Extent),
    LessThanOrEquals(Extent),
    Minus(Extent),
    MinusEquals(Extent),
    NotEqual(Extent),
    Percent(Extent),
    PercentEquals(Extent),
    Period(Extent),
    Pipe(Extent),
    PipeEquals(Extent),
    Plus(Extent),
    PlusEquals(Extent),
    QuestionMark(Extent),
    Semicolon(Extent),
    ShiftLeftEquals(Extent),
    ShiftRightEquals(Extent),
    Slash(Extent),
    ThickArrow(Extent),
    ThinArrow(Extent),
    Tilde(Extent),
    TimesEquals(Extent),
    TriplePeriod(Extent),

    // Keywords
    As(Extent),
    Async(Extent),
    Auto(Extent),
    Box(Extent),
    Break(Extent),
    Const(Extent),
    Continue(Extent),
    Crate(Extent),
    Default(Extent),
    Dyn(Extent),
    Else(Extent),
    Enum(Extent),
    Extern(Extent),
    Fn(Extent),
    For(Extent),
    If(Extent),
    Impl(Extent),
    In(Extent),
    Let(Extent),
    Loop(Extent),
    Match(Extent),
    Mod(Extent),
    Move(Extent),
    Mut(Extent),
    Pub(Extent),
    Ref(Extent),
    Return(Extent),
    // The `self` keyword (see `keyword_or_ident`).
    SelfIdent(Extent),
    Static(Extent),
    Struct(Extent),
    Trait(Extent),
    Type(Extent),
    Union(Extent),
    Unsafe(Extent),
    Use(Extent),
    Where(Extent),
    While(Extent),

    // String-like
    Character(Extent),
    String(Extent),
    StringRaw(Extent),
    Byte(Extent),
    ByteString(Extent),
    ByteStringRaw(Extent),

    // Other
    //
    // `Ident` also covers raw identifiers (`r#name`); `EndOfFile` is emitted
    // once, with an empty extent, when the input is used up.
    Ident(Extent),
    Number(Number),
    Whitespace(Extent),
    CommentLine(Extent),
    CommentBlock(Extent),
    DocCommentOuterLine(Extent),
    DocCommentInnerLine(Extent),
    DocCommentOuterBlock(Extent),
    DocCommentInnerBlock(Extent),
    Lifetime(Extent),
    EndOfFile(Extent),
}
132
133impl Token {
134    pub fn extent(&self) -> Extent {
135        use self::Token::*;
136
137        match *self {
138            Ampersand(s)           |
139            AmpersandEquals(s)     |
140            As(s)                  |
141            Asterisk(s)            |
142            Async(s)               |
143            At(s)                  |
144            Auto(s)                |
145            Backslash(s)           |
146            Bang(s)                |
147            Box(s)                 |
148            Break(s)               |
149            Byte(s)                |
150            ByteString(s)          |
151            ByteStringRaw(s)       |
152            Caret(s)               |
153            CaretEquals(s)         |
154            Character(s)           |
155            Colon(s)               |
156            Comma(s)               |
157            CommentLine(s)         |
158            CommentBlock(s)        |
159            Const(s)               |
160            Continue(s)            |
161            Crate(s)               |
162            Dyn(s)                 |
163            Default(s)             |
164            DivideEquals(s)        |
165            DocCommentInnerLine(s) |
166            DocCommentInnerBlock(s)|
167            DocCommentOuterLine(s) |
168            DocCommentOuterBlock(s)|
169            Dollar(s)              |
170            DoubleAmpersand(s)     |
171            DoubleColon(s)         |
172            DoubleEquals(s)        |
173            DoubleLeftAngle(s)     |
174            DoublePeriod(s)        |
175            DoublePeriodEquals(s)  |
176            DoublePipe(s)          |
177            DoubleRightAngle(s)    |
178            Else(s)                |
179            EndOfFile(s)           |
180            Enum(s)                |
181            Equals(s)              |
182            Extern(s)              |
183            Fn(s)                  |
184            For(s)                 |
185            GreaterThanOrEquals(s) |
186            Hash(s)                |
187            Ident(s)               |
188            If(s)                  |
189            Impl(s)                |
190            In(s)                  |
191            LeftAngle(s)           |
192            LeftCurly(s)           |
193            LeftParen(s)           |
194            LeftSquare(s)          |
195            LessThanOrEquals(s)    |
196            Let(s)                 |
197            Lifetime(s)            |
198            Loop(s)                |
199            Match(s)               |
200            Minus(s)               |
201            MinusEquals(s)         |
202            Mod(s)                 |
203            Move(s)                |
204            Mut(s)                 |
205            NotEqual(s)            |
206            Percent(s)             |
207            PercentEquals(s)       |
208            Period(s)              |
209            Pipe(s)                |
210            PipeEquals(s)          |
211            Plus(s)                |
212            PlusEquals(s)          |
213            Pub(s)                 |
214            QuestionMark(s)        |
215            Ref(s)                 |
216            Return(s)              |
217            RightAngle(s)          |
218            RightCurly(s)          |
219            RightParen(s)          |
220            RightSquare(s)         |
221            SelfIdent(s)           |
222            Semicolon(s)           |
223            ShiftLeftEquals(s)     |
224            ShiftRightEquals(s)    |
225            Slash(s)               |
226            Static(s)              |
227            String(s)              |
228            StringRaw(s)           |
229            Struct(s)              |
230            ThickArrow(s)          |
231            ThinArrow(s)           |
232            Tilde(s)               |
233            TimesEquals(s)         |
234            Trait(s)               |
235            TriplePeriod(s)        |
236            Type(s)                |
237            Union(s)               |
238            Unsafe(s)              |
239            Use(s)                 |
240            Where(s)               |
241            While(s)               |
242            Whitespace(s)          => s,
243
244            Number(s) => s.extent(),
245        }
246    }
247}
248
/// A number literal, classified by the radix it was written in.
///
/// The tokenizer selects the variant from the literal's prefix: `0b`
/// (binary), `0x` (hexadecimal), `0o` (octal), or no prefix (decimal).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Decompose)]
pub enum Number {
    Binary(NumberBinary),
    Decimal(NumberDecimal),
    Hexadecimal(NumberHexadecimal),
    Octal(NumberOctal),
}
256
257impl Number {
258    fn extent(&self) -> Extent {
259        use self::Number::*;
260
261        match *self {
262            Binary(n) => n.extent(),
263            Decimal(n) => n.extent(),
264            Hexadecimal(n) => n.extent(),
265            Octal(n) => n.extent(),
266        }
267    }
268
269    pub fn into_simple(self) -> Option<Extent> {
270        match self {
271            Number::Decimal(d) => {
272                if d.fractional.is_none() &&
273                    d.exponent.is_none() &&
274                    d.type_suffix.is_none() &&
275                    d.underscores == 0
276                {
277                    Some(d.extent)
278                } else {
279                    None
280                }
281            }
282            _ => None
283        }
284    }
285}
286
287macro_rules! number {
288    ($name:ident) => {
289        #[derive(Debug, Copy, Clone, PartialEq, Eq)]
290        pub struct $name {
291            pub extent: Extent,
292            pub integral: Extent,
293            pub fractional: Option<Extent>,
294            pub exponent: Option<Extent>,
295            pub type_suffix: Option<Extent>,
296            underscores: usize,
297        }
298
299        impl $name {
300            fn finish(details: NumberDetailsPartial,
301                      extent: Extent,
302                      exponent: Option<Extent>,
303                      type_suffix: Option<Extent>) -> $name
304            {
305                let NumberDetailsPartial { integral, fractional, underscores } = details;
306                $name { extent, integral, fractional, exponent, type_suffix, underscores }
307            }
308
309            pub fn extent(&self) -> Extent {
310                self.extent
311            }
312        }
313    }
314}
315
316number!(NumberBinary);
317number!(NumberDecimal);
318number!(NumberHexadecimal);
319number!(NumberOctal);
320
/// The individual reasons a tokenizer rule can fail.
///
/// `Ord` is derived because failures are collected into a `BTreeSet`
/// (see `ErrorDetail`), so the variant order below also determines the
/// order in which they are listed.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub(crate) enum Error {
    // A specific piece of literal text was expected (produced by `literal`).
    Literal(&'static str),
    ExpectedIdentOrKeyword,
    ExpectedIdent,
    ExpectedNumber,
    ExpectedHex,
    ExpectedWhitespace,
    ExpectedComment,
    ExpectedCharacter,
    // A raw string's closing quote-and-hashes sequence was never found.
    UnterminatedRawString,
    // `r#` was not followed by an identifier.
    RawIdentifierMissingIdentifier,

    // Internal parsing errors, should be recovered
    InvalidFollowForFractionalNumber,
}
337
338impl peresil::Recoverable for Error {
339    fn recoverable(&self) -> bool {
340        use Error::*;
341
342        match self {
343            RawIdentifierMissingIdentifier => false,
344            _ => true,
345        }
346    }
347}
348
349/// Information about a tokenization error
350#[derive(Debug, PartialEq, Eq)]
351pub struct ErrorDetail {
352    location: usize,
353    errors: BTreeSet<Error>,
354}
355
356impl ErrorDetail {
357    /// Enhance the error with the source code
358    pub fn with_text<'a>(&'a self, text: &'a str) -> ErrorDetailText<'a> {
359        ErrorDetailText { detail: self, text }
360    }
361}
362
363/// Information about a tokenization error including original source code
364#[derive(Debug)]
365pub struct ErrorDetailText<'a> {
366    detail: &'a ErrorDetail,
367    text: &'a str,
368}
369
370impl<'a> fmt::Display for ErrorDetailText<'a> {
371    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
372        let human = HumanTextError::new(self.text, self.detail.location);
373
374        writeln!(f, "Unable to tokenize text (line {}, column {})", human.line, human.column)?;
375        writeln!(f, "{}{}", human.head_of_line, human.tail_of_line)?;
376        writeln!(f, "{:>width$}", "^", width = human.column)?;
377        writeln!(f, "Expected:")?;
378        for e in &self.detail.errors {
379            writeln!(f, "  {:?}", e)?; // TODO: should be Display
380        }
381        Ok(())
382    }
383}
384
// Shorthands for the peresil types specialised to this tokenizer:
// a position in the source string, the parse coordinator, and the result of
// running one rule (a new position plus either a value or an `Error`).
type Point<'s> = peresil::StringPoint<'s>;
type Master<'s> = peresil::ParseMaster<Point<'s>, Error>;
type Progress<'s, T> = peresil::Progress<Point<'s>, T, Error>;
388
389pub struct Tokens<'s> {
390    pm: Master<'s>,
391    pt: Point<'s>,
392    is_exhausted: bool,
393}
394
395impl<'s> Tokens<'s> {
396    pub fn new(code: &'s str) -> Self {
397        Tokens {
398            pm: Master::new(),
399            pt: Point::new(code),
400            is_exhausted: false,
401        }
402    }
403}
404
405impl<'s> Iterator for Tokens<'s> {
406    type Item = Result<Token, ErrorDetail>;
407
408    fn next(&mut self) -> Option<Self::Item> {
409        if self.is_exhausted {
410            return None
411        }
412
413        if self.pt.s.is_empty() {
414            self.is_exhausted = true;
415            return Some(Ok(Token::EndOfFile(Extent(self.pt.offset, self.pt.offset))));
416        }
417
418        let tok = single_token(&mut self.pm, self.pt);
419        let tok = self.pm.finish(tok);
420
421        match tok {
422            peresil::Progress { status: peresil::Status::Success(value), point } => {
423                assert_ne!(self.pt.offset, point.offset, "Tokenizer did not make progress");
424                self.pt = point;
425                Some(Ok(value))
426            }
427            peresil::Progress { status: peresil::Status::Failure(errors), point } => {
428                Some(Err(ErrorDetail {
429                    location: point.offset,
430                    errors: errors.into_iter().collect(),
431                }))
432            }
433        }
434    }
435}
436
/// Attempts to read exactly one token starting at `pt`.
///
/// The alternatives are listed in a deliberate order: comments and
/// string-like literals precede the symbols they share a first character
/// with (`/`, `'`, `"`), and multi-character symbols precede their shorter
/// prefixes. NOTE(review): this presumes `alternate` prefers earlier
/// successful alternatives — confirm against peresil.
fn single_token<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Token> {
    pm.alternate(pt)
        .one(comment_or_doc_comment)
        .one(map(character, Token::Character))
        .one(map(string, Token::String))
        .one(map(string_raw, Token::StringRaw))
        .one(map(byte, Token::Byte))
        .one(map(byte_string, Token::ByteString))
        .one(map(byte_string_raw, Token::ByteStringRaw))
        .one(map(lifetime, Token::Lifetime))

        // Symbols; longest first
        .one(map(literal(">>="), Token::ShiftRightEquals))
        .one(map(literal("<<="), Token::ShiftLeftEquals))
        .one(map(literal("..."), Token::TriplePeriod))
        .one(map(literal("..="), Token::DoublePeriodEquals))

        // Symbols - 2 character
        .one(map(literal("!="), Token::NotEqual))
        .one(map(literal("%="), Token::PercentEquals))
        .one(map(literal("&&"), Token::DoubleAmpersand))
        .one(map(literal("&="), Token::AmpersandEquals))
        .one(map(literal("*="), Token::TimesEquals))
        .one(map(literal("+="), Token::PlusEquals))
        .one(map(literal("-="), Token::MinusEquals))
        .one(map(literal("->"), Token::ThinArrow))
        .one(map(literal("/="), Token::DivideEquals))
        .one(map(literal("<<"), Token::DoubleLeftAngle))
        .one(map(literal("<="), Token::LessThanOrEquals))
        .one(map(literal("=="), Token::DoubleEquals))
        .one(map(literal("=>"), Token::ThickArrow))
        .one(map(literal(">="), Token::GreaterThanOrEquals))
        .one(map(literal(">>"), Token::DoubleRightAngle))
        .one(map(literal("^="), Token::CaretEquals))
        .one(map(literal("|="), Token::PipeEquals))
        .one(map(literal(".."), Token::DoublePeriod))
        .one(map(literal("::"), Token::DoubleColon))
        .one(map(literal("||"), Token::DoublePipe))

        // Symbols - 1 character
        .one(map(literal("!"), Token::Bang))
        .one(map(literal("#"), Token::Hash))
        .one(map(literal("$"), Token::Dollar))
        .one(map(literal("%"), Token::Percent))
        .one(map(literal("&"), Token::Ampersand))
        .one(map(literal("*"), Token::Asterisk))
        .one(map(literal("+"), Token::Plus))
        .one(map(literal(","), Token::Comma))
        .one(map(literal("-"), Token::Minus))
        .one(map(literal("."), Token::Period))
        .one(map(literal("/"), Token::Slash))
        .one(map(literal(":"), Token::Colon))
        .one(map(literal(";"), Token::Semicolon))
        .one(map(literal("="), Token::Equals))
        .one(map(literal("?"), Token::QuestionMark))
        .one(map(literal("@"), Token::At))
        .one(map(literal("^"), Token::Caret))
        .one(map(literal("|"), Token::Pipe))
        .one(map(literal("~"), Token::Tilde))
        .one(map(literal(r#"\"#), Token::Backslash))

        // Paired delimiters
        .one(map(literal("("), Token::LeftParen))
        .one(map(literal(")"), Token::RightParen))
        .one(map(literal("<"), Token::LeftAngle))
        .one(map(literal(">"), Token::RightAngle))
        .one(map(literal("["), Token::LeftSquare))
        .one(map(literal("]"), Token::RightSquare))
        .one(map(literal("{"), Token::LeftCurly))
        .one(map(literal("}"), Token::RightCurly))

        // Specialty items
        .one(keyword_or_ident)
        .one(map(number, Token::Number))
        .one(map(whitespace, Token::Whitespace))
        .finish()
}
514
515fn keyword_or_ident<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Token> {
516    if pt.s.starts_with("r#") {
517        let idx = ident_len(&pt.s[2..]);
518        if idx == 0 {
519            return Progress::failure(pt, Error::RawIdentifierMissingIdentifier);
520        }
521        return split_point_at_non_zero_offset(pt, 2 + idx, Error::ExpectedIdentOrKeyword)
522            .map(|(_, extent)| Token::Ident(extent));
523    }
524
525    let idx = ident_len(pt.s);
526
527    split_point_at_non_zero_offset(pt, idx, Error::ExpectedIdentOrKeyword).map(
528        |(s, extent)| match s {
529            "as" => Token::As(extent),
530            "async" => Token::Async(extent),
531            "auto" => Token::Auto(extent),
532            "box" => Token::Box(extent),
533            "break" => Token::Break(extent),
534            "const" => Token::Const(extent),
535            "continue" => Token::Continue(extent),
536            "crate" => Token::Crate(extent),
537            "default" => Token::Default(extent),
538            "dyn" => Token::Dyn(extent),
539            "else" => Token::Else(extent),
540            "enum" => Token::Enum(extent),
541            "extern" => Token::Extern(extent),
542            "fn" => Token::Fn(extent),
543            "for" => Token::For(extent),
544            "if" => Token::If(extent),
545            "impl" => Token::Impl(extent),
546            "in" => Token::In(extent),
547            "let" => Token::Let(extent),
548            "loop" => Token::Loop(extent),
549            "match" => Token::Match(extent),
550            "mod" => Token::Mod(extent),
551            "move" => Token::Move(extent),
552            "mut" => Token::Mut(extent),
553            "pub" => Token::Pub(extent),
554            "ref" => Token::Ref(extent),
555            "return" => Token::Return(extent),
556            "self" => Token::SelfIdent(extent),
557            "static" => Token::Static(extent),
558            "struct" => Token::Struct(extent),
559            "trait" => Token::Trait(extent),
560            "type" => Token::Type(extent),
561            "use" => Token::Use(extent),
562            "union" => Token::Union(extent),
563            "unsafe" => Token::Unsafe(extent),
564            "where" => Token::Where(extent),
565            "while" => Token::While(extent),
566            _ => Token::Ident(extent),
567        },
568    )
569}
570
571fn simple_ident<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
572    let idx = ident_len(pt.s);
573    split_point_at_non_zero_offset(pt, idx, Error::ExpectedIdent).map(|(_, e)| e)
574}
575
576fn ident_len<'s>(s: &str) -> usize {
577    let mut ci = s.chars();
578    let mut idx = 0;
579
580    if let Some(c) = ci.next() {
581        if UnicodeXID::is_xid_start(c) || c == '_' {
582            idx += c.len_utf8();
583
584            idx += ci
585                .take_while(|&c| UnicodeXID::is_xid_continue(c))
586                .map(|c| c.len_utf8())
587                .sum::<usize>();
588        }
589    }
590
591    idx
592}
593
594enum NumberPartial {
595    Binary(NumberDetailsPartial),
596    Decimal(NumberDetailsPartial),
597    Hexadecimal(NumberDetailsPartial),
598    Octal(NumberDetailsPartial),
599}
600
601impl NumberPartial {
602    fn finish(self, extent: Extent, exponent: Option<Extent>, type_suffix: Option<Extent>) ->
603        Number
604    {
605        match self {
606            NumberPartial::Binary(v) => {
607                Number::Binary(NumberBinary::finish(v, extent, exponent, type_suffix))
608            },
609            NumberPartial::Decimal(v) => {
610                Number::Decimal(NumberDecimal::finish(v, extent, exponent, type_suffix))
611            },
612            NumberPartial::Hexadecimal(v) => {
613                Number::Hexadecimal(NumberHexadecimal::finish(v, extent, exponent, type_suffix))
614            },
615            NumberPartial::Octal(v) => {
616                Number::Octal(NumberOctal::finish(v, extent, exponent, type_suffix))
617            },
618        }
619    }
620}
621
/// The parts of a number literal gathered by `number_base`.
struct NumberDetailsPartial {
    // Digits following the radix prefix.
    integral: Extent,
    // Fractional part (starting at the `.`), when one was present.
    fractional: Option<Extent>,
    // Count of `_` characters within the integral digits.
    underscores: usize,
}
627
/// Reads a complete number literal: the digits (with optional radix prefix
/// and fractional part), then an optional exponent, then an optional type
/// suffix. The resulting extent spans from the first character to the end
/// of the last part that matched.
fn number<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Number> {
    sequence!(pm, pt, {
        spt         = point;
        value       = number_value;
        exponent    = optional(number_exponent);
        // Any identifier directly after the number is taken as its suffix.
        type_suffix = optional(simple_ident);
    }, |_, pt| value.finish(ex(spt, pt), exponent, type_suffix))
}
636
/// Reads the digits of a number, choosing the radix from its prefix:
/// `0b` binary, `0x` hexadecimal, `0o` octal, and no prefix for decimal.
fn number_value<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, NumberPartial> {
    pm.alternate(pt)
        .one(map(number_base("0b", 2), NumberPartial::Binary))
        .one(map(number_base("0x", 16), NumberPartial::Hexadecimal))
        .one(map(number_base("0o", 8), NumberPartial::Octal))
        // The empty prefix always matches, so decimal is the fallback.
        .one(map(number_base("", 10), NumberPartial::Decimal))
        .finish()
}
645
/// Builds a rule that reads `prefix`, then digits valid in `radix`, then an
/// optional fractional part in the same radix.
///
/// Only the underscores of the integral digits are counted.
fn number_base<'s>(prefix: &'static str, radix: u32) ->
    impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, NumberDetailsPartial>
{
    move |pm, pt| {
        sequence!(pm, pt, {
            _                       = literal(prefix);
            (integral, underscores) = number_digits(radix);
            fractional              = optional(number_fractional(radix));
        }, |_, _| NumberDetailsPartial { integral, fractional, underscores })
    }
}
657
/// Builds a rule that reads the fractional part of a number: a `.` followed
/// by optional digits in `radix`. The returned extent starts at the `.`.
///
/// The two `not(peek(..))` guards are presumably there so that a `.` followed
/// by another `.` (a range such as `1..2`) or by an identifier (a method call
/// such as `1.foo()`) is not consumed as a fraction — confirm against
/// `crate::combinators`.
fn number_fractional<'s>(radix: u32) ->
    impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, Extent>
{
    move |pm, pt| {
        sequence!(pm, pt, {
            spt = point;
            _   = literal(".");
            _   = not(peek(literal(".")), Error::InvalidFollowForFractionalNumber);
            _   = not(peek(simple_ident), Error::InvalidFollowForFractionalNumber);
            // Digits are optional: `1.` is accepted as a fractional number.
            _   = optional(number_digits(radix));
        }, |_, pt| ex(spt, pt))
    }
}
671
672fn number_digits<'s>(radix: u32) ->
673    impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, (Extent, usize)>
674{
675    move |_, pt| {
676        let mut underscores = 0;
677        let ci = pt.s.chars();
678        let idx = ci
679            .take_while(|&c| c.is_digit(radix) || c == '_')
680            .inspect(|&c| if c == '_' { underscores += 1 })
681            .map(|c| c.len_utf8())
682            .sum();
683
684        split_point_at_non_zero_offset(pt, idx, Error::ExpectedNumber).map(|(_, e)| (e, underscores))
685    }
686}
687
688// TODO: add a case-insensitive matcher?
689fn number_exponent<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
690    pm.alternate(pt)
691        .one(number_exponent_lowercase)
692        .one(number_exponent_uppercase)
693        .finish()
694}
695
696fn number_exponent_lowercase<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
697    sequence!(pm, pt, {
698        _          = literal("e");
699        (value, _) = number_digits(10);
700    }, |_, _| value)
701}
702
703fn number_exponent_uppercase<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
704    sequence!(pm, pt, {
705        _          = literal("E");
706        (value, _) = number_digits(10);
707    }, |_, _| value)
708}
709
710fn whitespace<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
711    let ci = pt.s.chars();
712    let idx = ci.take_while(|&c| {
713        c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\u{200e}' || c == '\u{200f}'
714    }).map(|c| c.len_utf8()).sum();
715
716    split_point_at_non_zero_offset(pt, idx, Error::ExpectedWhitespace).map(|(_, e)| e)
717}
718
719fn comment_or_doc_comment<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Token> {
720    let spt = pt;
721    if pt.s.starts_with("///") && !pt.s.starts_with("////") {
722        let eol = pt.s.find('\n').unwrap_or_else(|| pt.s.len());
723        let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
724        Progress::success(pt, Token::DocCommentOuterLine(ex(spt, pt)))
725    } else if pt.s.starts_with("//!") {
726        let eol = pt.s.find('\n').unwrap_or_else(|| pt.s.len());
727        let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
728        Progress::success(pt, Token::DocCommentInnerLine(ex(spt, pt)))
729    } else if pt.s.starts_with("//") {
730        let eol = pt.s.find('\n').unwrap_or_else(|| pt.s.len());
731        let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
732        Progress::success(pt, Token::CommentLine(ex(spt, pt)))
733    } else if pt.s.starts_with("/**") && !pt.s.starts_with("/***") && !pt.s.starts_with("/**/") {
734        let eol = pt.s[3..].find("*/").map(|x| 3 + x + 2).unwrap_or_else(|| pt.s.len());
735        let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
736        Progress::success(pt, Token::DocCommentOuterBlock(ex(spt, pt)))
737    } else if pt.s.starts_with("/*!") {
738        let eol = pt.s[3..].find("*/").map(|x| 3 + x + 2).unwrap_or_else(|| pt.s.len());
739        let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
740        Progress::success(pt, Token::DocCommentInnerBlock(ex(spt, pt)))
741    } else if pt.s.starts_with("/*") {
742        let eol = pt.s[2..].find("*/").map(|x| 2 + x + 2).unwrap_or_else(|| pt.s.len());
743        let (pt, _) = try_parse!(spt.consume_to(Some(eol)).map_err(|_| Error::ExpectedComment));
744        Progress::success(pt, Token::CommentBlock(ex(spt, pt)))
745    } else {
746        Progress::failure(pt, Error::ExpectedComment)
747    }
748}
749
/// Reads a character literal: a single quote, one (possibly escaped)
/// character, and a closing single quote. The extent includes both quotes.
fn character<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("'");
        _   = character_char;
        _   = literal("'");
    }, |_, pt| ex(spt, pt))
}
758
/// Reads the content of a character literal: an escape sequence if one is
/// present, otherwise any single character.
fn character_char<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
    pm.alternate(pt)
        .one(escaped_char)
        .one(single_char)
        .finish()
}
765
/// Reads a backslash followed by one of the recognised escape codes.
// NOTE(review): `spt.to(pt)` presumably yields the text between the two
// points (the whole escape sequence) — confirm against peresil.
fn escaped_char<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("\\");
        _   = escaped_char_code;
    }, |_, pt| spt.to(pt))
}
773
/// Reads the part of an escape sequence that follows the backslash:
/// `n`, `r`, `t`, `\`, `'`, `"`, `0`, a hex escape (`x..`) or a Unicode
/// escape (`u{..}`).
fn escaped_char_code<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    pm.alternate(pt)
        .one(literal("n"))
        .one(literal("r"))
        .one(literal("t"))
        .one(literal("\\"))
        .one(literal("'"))
        .one(literal("\""))
        .one(literal("0"))
        .one(escaped_char_hex)
        .one(escaped_char_unicode)
        .finish()
}
787
/// Reads a hex escape code: `x` followed by one or more hex digits.
///
/// The number of digits is not restricted here.
fn escaped_char_hex<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("x");
        _   = hex_string;
    }, |_, pt| ex(spt, pt))
}
795
/// Reads a Unicode escape code: `u{`, one or more hex digits, and `}`.
fn escaped_char_unicode<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("u{");
        _   = hex_string;
        _   = literal("}");
    }, |_, pt| ex(spt, pt))
}
804
805fn hex_string<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
806    let ci = pt.s.chars();
807    let idx = ci.take_while(|c| c.is_digit(16)).map(|c| c.len_utf8()).sum();
808
809    let idx = if idx == 0 { None } else { Some(idx) };
810    pt.consume_to(idx).map_err(|_| Error::ExpectedHex)
811}
812
813fn single_char<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
814    match pt.s.char_indices().next() {
815        Some((_, c)) => {
816            let i = c.len_utf8();
817            let (head, tail) = pt.s.split_at(i);
818            let pt = Point { s: tail, offset: pt.offset + i };
819            Progress::success(pt, head)
820        }
821        None => {
822            Progress::failure(pt, Error::ExpectedCharacter)
823        }
824    }
825}
826
/// Reads a string literal: a double quote, the string content, and a closing
/// double quote. The extent includes both quotes.
fn string<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("\"");
        _   = string_char;
        // Fails here when the content ran to the end of input unterminated.
        _   = literal("\"");
    }, |_, pt| ex(spt, pt))
}
835
836fn string_char<'s>(_pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, &'s str> {
837    let res = |i| {
838        let (head, tail) = pt.s.split_at(i);
839        let pt = Point { s: tail, offset: pt.offset + i };
840        Progress::success(pt, head)
841    };
842
843    let mut escaped = false;
844    for (i, c) in pt.s.char_indices() {
845        match (escaped, c) {
846            (true, _) => escaped = false,
847            (false, '\\') => escaped = true,
848            (false, '"') => return res(i),
849            (false, _) => { /* Next char */ },
850        }
851    }
852
853    res(pt.s.len())
854}
855
/// Reads a raw string literal: `r`, any number of `#`, a double quote, and
/// then everything up to a double quote followed by the same number of `#`.
fn string_raw<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("r");
        h   = zero_or_more(literal("#"));
        _   = literal(r#"""#);
        // The closing delimiter must carry as many hashes as the opening one.
        _   = raw_string_tail(h.len());
    }, |_, pt| ex(spt, pt))
}
865
866fn raw_string_tail<'s>(hashes: usize) -> impl Fn(&mut Master<'s>, Point<'s>) ->
867    Progress<'s, &'s str>
868{
869    let mut s = r#"""#.to_string();
870    for _ in 0..hashes { s.push('#') };
871
872    move |_, pt| {
873        match pt.s.find(&s) {
874            Some(end) => {
875                let (str_content, quote_tail) = pt.s.split_at(end);
876                let (_quotes, tail) = quote_tail.split_at(s.len());
877                let pt = Point { s: tail, offset: pt.offset + end + s.len() };
878                Progress::success(pt, str_content)
879            }
880            None => {
881                Progress::failure(pt, Error::UnterminatedRawString)
882            }
883        }
884    }
885}
886
/// Reads a byte literal: `b` immediately followed by a character literal.
/// The extent includes the `b`.
fn byte<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("b");
        _   = character;
    }, |_, pt| ex(spt, pt))
}
894
/// Reads a byte string literal: `b` immediately followed by a string
/// literal. The extent includes the `b`.
fn byte_string<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("b");
        _   = string;
    }, |_, pt| ex(spt, pt))
}
902
/// Reads a raw byte string literal: `b` immediately followed by a raw string
/// literal. The extent includes the `b`.
fn byte_string_raw<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("b");
        _   = string_raw;
    }, |_, pt| ex(spt, pt))
}
910
/// Reads a lifetime: a single quote immediately followed by an identifier.
/// The extent includes the quote.
fn lifetime<'s>(pm: &mut Master<'s>, pt: Point<'s>) -> Progress<'s, Extent> {
    sequence!(pm, pt, {
        spt = point;
        _   = literal("'");
        _   = simple_ident;
    }, |_, pt| ex(spt, pt))
}
918
919fn literal<'s>(expected: &'static str) ->
920    impl Fn(&mut Master<'s>, Point<'s>) -> Progress<'s, Extent>
921{
922    move |_, spt| {
923        let (pt, _) = try_parse!(spt.consume_literal(expected).map_err(|_| Error::Literal(expected)));
924        Progress::success(pt, ex(spt, pt))
925    }
926}
927
928fn ex(start: Point, end: Point) -> Extent {
929    let ex = Extent(start.offset, end.offset);
930    assert!(ex.1 >= ex.0, "{} does not come before {}", ex.1, ex.0);
931    ex
932}
933
934fn split_point_at_non_zero_offset(pt: Point<'_>, idx: usize, e: Error) ->
935    Progress<'_, (&'_ str, Extent)>
936{
937    if idx == 0 {
938        peresil::Progress::failure(pt, e)
939    } else {
940        let (matched, tail) = pt.s.split_at(idx);
941        let end = pt.offset + idx;
942        let end_pt = Point { s: tail, offset: end };
943
944        peresil::Progress::success(end_pt, (matched, Extent(pt.offset, end)))
945    }
946}
947
#[cfg(test)]
mod test {
    use super::*;

    /// Tokenizes `$input` and unwraps the first token as variant `$p`,
    /// yielding that variant's payload.
    macro_rules! tokenize_as {
        ($input:expr, $p:path) => ({
            let tokens = tok($input);
            unwrap_as!(tokens[0], $p)
        })
    }

    /// Tokenizes `s`, panicking if tokenization reports an error.
    fn tok(s: &str) -> Vec<Token> {
        let outcome = tok_full(s);
        outcome.expect("Tokenization failed")
    }

    /// Tokenizes `s`, returning either every token or the error detail.
    fn tok_full(s: &str) -> Result<Vec<Token>, ErrorDetail> {
        let tokens = Tokens::new(s);
        tokens.collect()
    }

    // --- Identifiers and keywords ---

    #[test]
    fn keyword_is_not_an_ident() {
        let extent = tokenize_as!("for", Token::For);
        assert_eq!(extent, (0, 3));
    }

    #[test]
    fn ident_can_have_keyword_substring() {
        let extent = tokenize_as!("form", Token::Ident);
        assert_eq!(extent, (0, 4));
    }

    #[test]
    fn raw_idents_can_be_keywords() {
        let extent = tokenize_as!("r#for", Token::Ident);
        assert_eq!(extent, (0, 5));
    }

    #[test]
    fn raw_idents_require_some_identifier() {
        let detail = tok_full("r#").unwrap_err();
        let expected = Error::RawIdentifierMissingIdentifier;
        assert!(detail.errors.contains(&expected));
    }

    // --- Characters, strings, bytes ---

    #[test]
    fn character() {
        let extent = tokenize_as!("'a'", Token::Character);
        assert_eq!(extent, (0, 3));
    }

    #[test]
    fn character_escaped() {
        let extent = tokenize_as!(r#"'\\'"#, Token::Character);
        assert_eq!(extent, (0, 4));
    }

    #[test]
    fn character_escaped_hex() {
        let extent = tokenize_as!(r#"'\x41'"#, Token::Character);
        assert_eq!(extent, (0, 6));
    }

    #[test]
    fn character_escaped_unicode() {
        let extent = tokenize_as!(r#"'\u{1F63B}'"#, Token::Character);
        assert_eq!(extent, (0, 11));
    }

    /// Each `'a` below must come out as a lifetime token.
    #[test]
    fn character_limited_to_single() {
        let tokens = tok("impl<'a> Foo<'a> for Bar<'a> { }");

        let first = unwrap_as!(tokens[2], Token::Lifetime);
        assert_eq!(first, (5, 7));

        let second = unwrap_as!(tokens[7], Token::Lifetime);
        assert_eq!(second, (13, 15));

        let third = unwrap_as!(tokens[14], Token::Lifetime);
        assert_eq!(third, (25, 27));
    }

    #[test]
    fn string_raw() {
        let extent = tokenize_as!(r###"r#"inner"#"###, Token::StringRaw);
        assert_eq!(extent, (0, 10));
    }

    #[test]
    fn byte() {
        let extent = tokenize_as!(r#"b'a'"#, Token::Byte);
        assert_eq!(extent, (0, 4));
    }

    #[test]
    fn byte_string() {
        let extent = tokenize_as!(r#"b"abc""#, Token::ByteString);
        assert_eq!(extent, (0, 6));
    }

    #[test]
    fn byte_string_raw() {
        let extent = tokenize_as!(r#"br"abc""#, Token::ByteStringRaw);
        assert_eq!(extent, (0, 7));
    }

    #[test]
    fn tilde_is_a_token_even_though_unused() {
        let extent = tokenize_as!("~", Token::Tilde);
        assert_eq!(extent, (0, 1));
    }

    // --- Numbers ---

    #[test]
    fn number_binary() {
        let number = tokenize_as!("0b0101", Token::Number);
        assert_eq!(number.extent(), (0, 6));
        let binary = unwrap_as!(number, Number::Binary);
        assert_eq!(binary.integral, (2, 6));
    }

    #[test]
    fn number_decimal() {
        let number = tokenize_as!("123456", Token::Number);
        assert_eq!(number.extent(), (0, 6));
        let decimal = unwrap_as!(number, Number::Decimal);
        assert_eq!(decimal.integral, (0, 6));
        let simple = number.into_simple();
        assert_eq!(simple, Some(Extent(0, 6)));
    }

    #[test]
    fn number_hexadecimal() {
        let number = tokenize_as!("0xBeeF", Token::Number);
        assert_eq!(number.extent(), (0, 6));
        let hex = unwrap_as!(number, Number::Hexadecimal);
        assert_eq!(hex.integral, (2, 6));
    }

    #[test]
    fn number_octal() {
        let number = tokenize_as!("0o0777", Token::Number);
        assert_eq!(number.extent(), (0, 6));
        let octal = unwrap_as!(number, Number::Octal);
        assert_eq!(octal.integral, (2, 6));
    }

    #[test]
    fn number_decimal_with_decimal() {
        let number = tokenize_as!("0.", Token::Number);
        assert_eq!(number.extent(), (0, 2));
        let decimal = unwrap_as!(number, Number::Decimal);
        assert_eq!(decimal.integral, (0, 1));
        assert_eq!(decimal.fractional, Some(Extent(1, 2)));
    }

    #[test]
    fn number_with_decimal() {
        let number = tokenize_as!("0xA.", Token::Number);
        assert_eq!(number.extent(), (0, 4));
        let hex = unwrap_as!(number, Number::Hexadecimal);
        assert_eq!(hex.integral, (2, 3));
        assert_eq!(hex.fractional, Some(Extent(3, 4)));
    }

    #[test]
    fn number_with_fractional_part() {
        let number = tokenize_as!("0b01.10", Token::Number);
        assert_eq!(number.extent(), (0, 7));
        let binary = unwrap_as!(number, Number::Binary);
        assert_eq!(binary.integral, (2, 4));
        assert_eq!(binary.fractional, Some(Extent(4, 7)));
    }

    #[test]
    fn number_with_exponent() {
        let number = tokenize_as!("0b1000E7", Token::Number);
        assert_eq!(number.extent(), (0, 8));
        let binary = unwrap_as!(number, Number::Binary);
        assert_eq!(binary.integral, (2, 6));
        assert_eq!(binary.exponent, Some(Extent(7, 8)));
    }

    #[test]
    fn number_with_type_suffix() {
        let number = tokenize_as!("0o1234_usize", Token::Number);
        assert_eq!(number.extent(), (0, 12));
        let octal = unwrap_as!(number, Number::Octal);
        assert_eq!(octal.integral, (2, 7));
        assert_eq!(octal.type_suffix, Some(Extent(7, 12)));
    }

    #[test]
    fn number_with_spacers() {
        let number = tokenize_as!("0x0A_1b_2C_3d", Token::Number);
        assert_eq!(number.extent(), (0, 13));
        let hex = unwrap_as!(number, Number::Hexadecimal);
        assert_eq!(hex.integral, (2, 13));
    }

    #[test]
    fn number_decimal_with_spacers() {
        let number = tokenize_as!("01_23", Token::Number);
        assert_eq!(number.extent(), (0, 5));
        let decimal = unwrap_as!(number, Number::Decimal);
        assert_eq!(decimal.integral, (0, 5));
    }

    #[test]
    fn number_with_everything() {
        let number = tokenize_as!("0o__12__56__.43__e__32__my_type", Token::Number);
        assert_eq!(number.extent(), (0, 31));
        let octal = unwrap_as!(number, Number::Octal);
        assert_eq!(octal.integral, (2, 12));
        assert_eq!(octal.fractional, Some(Extent(12, 17)));
        assert_eq!(octal.exponent, Some(Extent(18, 24)));
        assert_eq!(octal.type_suffix, Some(Extent(24, 31)));
    }

    #[test]
    fn number_decimal_with_leading_spacer_is_an_ident() {
        let extent = tokenize_as!("_42", Token::Ident);
        assert_eq!(extent, (0, 3));
    }

    #[test]
    fn number_followed_by_range_is_not_fractional() {
        let tokens = tok("1..2");

        let lower = unwrap_as!(tokens[0], Token::Number);
        assert_eq!(lower.extent(), (0, 1));

        let range = unwrap_as!(tokens[1], Token::DoublePeriod);
        assert_eq!(range, (1, 3));

        let upper = unwrap_as!(tokens[2], Token::Number);
        assert_eq!(upper.extent(), (3, 4));
    }

    #[test]
    fn number_followed_by_ident_is_not_fractional() {
        let tokens = tok("1.foo");

        let number = unwrap_as!(tokens[0], Token::Number);
        assert_eq!(number.extent(), (0, 1));

        let period = unwrap_as!(tokens[1], Token::Period);
        assert_eq!(period, (1, 2));

        let ident = unwrap_as!(tokens[2], Token::Ident);
        assert_eq!(ident, (2, 5));
    }

    // --- Whitespace and comments ---

    #[test]
    fn whitespace_unicode_direction_markers() {
        let extent = tokenize_as!("\u{200e}\u{200f}", Token::Whitespace);
        assert_eq!(extent, (0, 6));
    }

    #[test]
    fn comment_block() {
        let extent = tokenize_as!("/* hi */", Token::CommentBlock);
        assert_eq!(extent, (0, 8));
    }

    #[test]
    fn comment_block_not_immediately_closed() {
        let extent = tokenize_as!("/*/ */", Token::CommentBlock);
        assert_eq!(extent, (0, 6));
    }

    #[test]
    fn comment_block_immediately_closed() {
        let extent = tokenize_as!("/**/", Token::CommentBlock);
        assert_eq!(extent, (0, 4));
    }

    #[test]
    fn doc_comment_outer_block() {
        let extent = tokenize_as!("/** hi */", Token::DocCommentOuterBlock);
        assert_eq!(extent, (0, 9));
    }

    #[test]
    fn doc_comment_inner_block() {
        let extent = tokenize_as!("/*! hi */", Token::DocCommentInnerBlock);
        assert_eq!(extent, (0, 9));
    }

    #[test]
    fn doc_comment_outer_line() {
        let extent = tokenize_as!("/// hi", Token::DocCommentOuterLine);
        assert_eq!(extent, (0, 6));
    }

    #[test]
    fn doc_comment_inner_line() {
        let extent = tokenize_as!("//! hi", Token::DocCommentInnerLine);
        assert_eq!(extent, (0, 6));
    }

    #[test]
    fn end_of_file() {
        let extent = tokenize_as!("", Token::EndOfFile);
        assert_eq!(extent, (0, 0));
    }
}