ucglib/tokenizer/
mod.rs

1// Copyright 2017 Jeremy Wall <jeremy@marzhillstudios.com>
2//
3//  Licensed under the Apache License, Version 2.0 (the "License");
4//  you may not use this file except in compliance with the License.
5//  You may obtain a copy of the License at
6//
7//      http://www.apache.org/licenses/LICENSE-2.0
8//
9//  Unless required by applicable law or agreed to in writing, software
10//  distributed under the License is distributed on an "AS IS" BASIS,
11//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12//  See the License for the specific language governing permissions and
13//  limitations under the License.
14
15//! The tokenization stage of the ucg compiler.
16use std;
17
18use abortable_parser::combinators::*;
19use abortable_parser::iter::SliceIter;
20use abortable_parser::{Error, Result};
21
22use crate::ast::*;
23use crate::error::BuildError;
24use crate::iter::OffsetStrIter;
25
26pub type CommentGroup = Vec<Token>;
27pub type CommentMap = std::collections::BTreeMap<usize, CommentGroup>;
28
29fn is_symbol_char<'a>(i: OffsetStrIter<'a>) -> Result<OffsetStrIter<'a>, u8> {
30    let mut _i = i.clone();
31    let c = match _i.next() {
32        Some(c) => *c,
33        None => {
34            return Result::Fail(Error::new(
35                "Unexpected End of Input".to_string(),
36                Box::new(_i.clone()),
37            ));
38        }
39    };
40    if (c as char).is_ascii_alphanumeric() || c == b'-' || c == b'_' {
41        Result::Complete(_i, c)
42    } else {
43        Result::Fail(Error::new(
44            "Not a symbol character".to_string(),
45            Box::new(_i.clone()),
46        ))
47    }
48}
49
50fn escapequoted<'a>(input: OffsetStrIter<'a>) -> Result<OffsetStrIter<'a>, String> {
51    // loop until we find a " that is not preceded by \.
52    // Collapse all \<char> to just char  for escaping exept for \n \r \t and \@.
53    let mut frag = String::new();
54    let mut escape = false;
55    let mut _input = input.clone();
56    loop {
57        let c = match _input.next() {
58            Some(c) => *c,
59            None => break,
60        };
61        if escape {
62            match c as char {
63                'n' => {
64                    frag.push('\n');
65                    escape = false;
66                    continue;
67                }
68                'r' => {
69                    frag.push('\r');
70                    escape = false;
71                    continue;
72                }
73                't' => {
74                    frag.push('\t');
75                    escape = false;
76                    continue;
77                }
78                _ => {
79                    //noop
80                }
81            }
82        }
83        if c == '\\' as u8 && !escape {
84            // eat this slash and set our escaping sentinel
85            escape = true;
86        } else if c == '"' as u8 && !escape {
87            // Bail if this is an unescaped "
88            // we exit here.
89            return Result::Complete(_input, frag);
90        } else {
91            // we accumulate this character.
92            frag.push(c as char);
93            escape = false; // reset our escaping sentinel
94        }
95    }
96    return Result::Incomplete(_input.clone());
97}
98
// Parses a double-quoted string: matches the opening `"`, then hands the rest
// to `escapequoted`, which resolves escapes and consumes the closing quote.
// Produces a QUOTED token whose fragment is the unescaped text and whose
// position is taken from the input as it was before the opening quote.
make_fn!(strtok<OffsetStrIter, Token>,
       do_each!(
           span => input!(),
           _    => text_token!("\""),
           frag => escapequoted,
           (Token{
               typ: TokenType::QUOTED,
               pos: Position::from(&span),
               fragment: frag.to_string(),
           })
       )
);
111
// Parses a bareword: the `peek!(ascii_alpha)` step requires the word to start
// with an ASCII letter, after which every byte accepted by `is_symbol_char`
// (ASCII alphanumerics, `-`, `_`) is gathered into the fragment.
// Produces a BAREWORD token positioned at the start of the word.
make_fn!(barewordtok<OffsetStrIter, Token>,
       do_each!(
           span => input!(),
           _    => peek!(ascii_alpha),
           frag => consume_all!(is_symbol_char),
           (Token{
               typ: TokenType::BAREWORD,
               pos: Position::from(&span),
               fragment: frag.to_string(),
           })
       )
);
124
// Parses a run of ASCII digits. The `peek!(ascii_digit)` step requires at
// least one digit to be present before `consume_all!` gathers the run.
// Produces a DIGIT token whose fragment is the digit text.
make_fn!(digittok<OffsetStrIter, Token>,
       do_each!(
           span => input!(),
           _ => peek!(ascii_digit),
           digits => consume_all!(ascii_digit),
           (Token{
               typ: TokenType::DIGIT,
               pos: Position::from(&span),
               fragment: digits.to_string(),
           })
       )
);
137
// Parses the literal text `true` or `false` into a BOOLEAN token whose
// fragment is the matched text.
// NOTE(review): nothing in this rule checks for a word boundary after the
// literal -- confirm that is intended for inputs that merely start with it.
make_fn!(booleantok<OffsetStrIter, Token>,
    do_each!(
        span => input!(),
        token => either!(
            text_token!("true"),
            text_token!("false")
        ),
        (Token{
            typ: TokenType::BOOLEAN,
            pos: Position::from(&span),
            fragment: token.to_string(),
        })
    )
);
152
/// do_text_token_tok! is a helper macro to make building a simple text_token token
/// less code.
///
/// Both forms take the input expression, the `TokenType` to assign, and the
/// literal text to match. They build a `Token` whose fragment is the matched
/// text and whose position comes from the input as it was before the match.
///
/// The form with a trailing `WS` marker additionally requires the literal to
/// be followed by whitespace or a comment. That trailing match is not stored
/// in the resulting token.
macro_rules! do_text_token_tok {
    // Literal that must be followed by whitespace or a comment.
    ($i:expr, $type:expr, $text_token:expr, WS) => {
        do_each!($i,
           span => input!(),
           frag => text_token!($text_token),
           _ => either!(whitespace, comment),
           (Token {
               typ: $type,
               pos: Position::from(&span),
               fragment: frag.to_string(),
           })
           )
    };

    // Literal with no constraint on what follows it.
    ($i:expr, $type:expr, $text_token:expr) => {
        do_each!($i,
            span => input!(),
            frag => text_token!($text_token),
            (Token {
                typ: $type,
                pos: Position::from(&span),
                fragment: frag.to_string(),
            })
            )
    };
}
181
// Matches the literal text `NULL` and yields an EMPTY token.
make_fn!(emptytok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::EMPTY, "NULL")
);
185
// Matches the literal `,` and yields a PUNCT token.
make_fn!(commatok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, ",")
);
189
// Matches the literal `{` and yields a PUNCT token.
make_fn!(lbracetok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "{")
);
193
// Matches the literal `}` and yields a PUNCT token.
make_fn!(rbracetok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "}")
);
197
// Matches the literal `(` and yields a PUNCT token.
make_fn!(lparentok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "(")
);
201
// Matches the literal `)` and yields a PUNCT token.
make_fn!(rparentok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, ")")
);
205
// Matches the literal `.` and yields a PUNCT token.
make_fn!(dottok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, ".")
);
209
// Matches the literal `+` and yields a PUNCT token.
make_fn!(plustok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "+")
);
213
// Matches the literal `-` and yields a PUNCT token.
make_fn!(dashtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "-")
);
217
// Matches the literal `*` and yields a PUNCT token.
make_fn!(startok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "*")
);
221
// Matches the literal `/` and yields a PUNCT token.
// `token` lists `comment` ahead of this parser so `//` is not read as two slashes.
make_fn!(slashtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "/")
);
225
// Matches the two-character literal `%%` and yields a PUNCT token.
make_fn!(modulustok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "%%")
);
229
// Matches the single-character literal `%` and yields a PUNCT token.
make_fn!(pcttok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "%")
);
233
// Matches the literal `==` and yields a PUNCT token.
make_fn!(eqeqtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "==")
);
237
// Matches the literal `!=` and yields a PUNCT token.
make_fn!(notequaltok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "!=")
);
241
// Matches the literal `~` and yields a PUNCT token.
make_fn!(matchtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "~")
);
245
// Matches the literal `!~` and yields a PUNCT token.
make_fn!(notmatchtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "!~")
);
249
// Matches the literal `>` and yields a PUNCT token.
make_fn!(gttok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, ">")
);
253
// Matches the literal `>=` and yields a PUNCT token.
make_fn!(gtequaltok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, ">=")
);
257
// Matches the literal `<=` and yields a PUNCT token.
make_fn!(ltequaltok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "<=")
);
261
// Matches the literal `<` and yields a PUNCT token.
make_fn!(lttok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "<")
);
265
// Matches the single-character literal `=` and yields a PUNCT token.
make_fn!(equaltok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "=")
);
269
// Matches the literal `;` and yields a PUNCT token.
make_fn!(semicolontok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, ";")
);
273
// Matches the two-character literal `::` and yields a PUNCT token.
make_fn!(doublecolontok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "::")
);
277
// Matches the single-character literal `:` and yields a PUNCT token.
make_fn!(colontok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, ":")
);
281
// Matches the literal `[` and yields a PUNCT token.
make_fn!(leftsquarebracket<OffsetStrIter, Token>,
    do_text_token_tok!(TokenType::PUNCT, "[")
);
285
// Matches the literal `]` and yields a PUNCT token.
make_fn!(rightsquarebracket<OffsetStrIter, Token>,
    do_text_token_tok!(TokenType::PUNCT, "]")
);
289
// Matches the literal `=>` and yields a PUNCT token.
// `token` lists this parser ahead of `equaltok`.
make_fn!(fatcommatok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "=>")
);
293
// Matches the literal `&&` and yields a PUNCT token.
make_fn!(andtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "&&")
);
297
// Matches the literal `||` and yields a PUNCT token.
make_fn!(ortok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::PUNCT, "||")
);
301
// Matches the text `select` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(selecttok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "select", WS)
);
305
// Matches the text `in` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(intok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "in", WS)
);
309
// Matches the text `is` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(istok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "is", WS)
);
313
// Matches the text `not` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(nottok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "not", WS)
);
317
// Matches the upper-case text `TRACE` followed by whitespace or a comment;
// yields a BAREWORD token.
make_fn!(tracetok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "TRACE", WS)
);
321
// Matches the text `fail` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(failtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "fail", WS)
);
325
// Matches the text `func` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(functok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "func", WS)
);
329
// Matches the text `module` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(moduletok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "module", WS)
);
333
// Matches the text `let` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(lettok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "let", WS)
);
337
// Matches the text `import` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(importtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "import", WS)
);
341
// Matches the text `include` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(includetok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "include", WS)
);
345
// Matches the text `assert` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(asserttok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "assert", WS)
);
349
// Matches the text `out` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(outtok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "out", WS)
);
353
// Matches the text `convert` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(converttok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "convert", WS)
);
357
// Matches the text `as` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(astok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "as", WS)
);
361
// Matches the text `map` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(maptok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "map", WS)
);
365
// Matches the text `filter` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(filtertok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "filter", WS)
);
369
// Matches the text `reduce` followed by whitespace or a comment; yields a BAREWORD token.
make_fn!(reducetok<OffsetStrIter, Token>,
       do_text_token_tok!(TokenType::BAREWORD, "reduce", WS)
);
373
374fn comment(input: OffsetStrIter) -> Result<OffsetStrIter, Token> {
375    match text_token!(input, "//") {
376        Result::Complete(rest, _) => {
377            match until!(
378                rest,
379                either!(
380                    eoi,
381                    discard!(text_token!("\r\n")),
382                    discard!(text_token!("\n"))
383                )
384            ) {
385                Result::Complete(rest, cmt) => {
386                    // Eat the new lines here before continuing
387                    let rest =
388                        match optional!(rest, either!(text_token!("\r\n"), text_token!("\n"))) {
389                            Result::Complete(next_rest, _) => next_rest,
390                            _ => rest,
391                        };
392                    return Result::Complete(rest, make_tok!(CMT => cmt.to_string(), input));
393                }
394                // If we didn't find a new line then we just grab everything.
395                _ => {
396                    return Result::Abort(Error::new(
397                        "Unparsable comment".to_string(),
398                        Box::new(rest.clone()),
399                    ));
400                }
401            }
402        }
403        Result::Incomplete(ctx) => return Result::Incomplete(ctx),
404        Result::Fail(e) => return Result::Fail(e),
405        Result::Abort(e) => return Result::Abort(e),
406    }
407}
408
// Parses a run of ASCII whitespace into a WS token. The `peek!(ascii_ws)` step
// requires at least one whitespace character before `repeat!` consumes the
// run. The token's fragment is always empty; only its position is recorded.
make_fn!(whitespace<OffsetStrIter, Token>,
    do_each!(
        span => input!(),
        _ => peek!(ascii_ws),
        _ => repeat!(ascii_ws),
         (Token{
            typ: TokenType::WS,
            pos: Position::from(&span),
            fragment: String::new(),
         })
    )
);
421
// Succeeds only when `eoi` matches, yielding an END token with an empty
// fragment positioned at the current input location.
make_fn!(end_of_input<OffsetStrIter, Token>,
    do_each!(
        span => input!(),
        _ => eoi,
        (Token{
            typ: TokenType::END,
            pos: Position::from(&span),
            fragment: String::new(),
        })
    )
);
433
/// Recognizes a single token at the current input position.
///
/// This hands every token parser in this module to `either!`. The inline
/// notes below state that the position of several entries matters (for
/// example `comment` before `slashtok`), so treat the list order as
/// significant when adding or moving entries.
fn token<'a>(input: OffsetStrIter<'a>) -> Result<OffsetStrIter<'a>, Token> {
    either!(
        input,
        strtok,
        emptytok, // This must come before the barewordtok
        digittok,
        commatok,
        rbracetok,
        lbracetok,
        lparentok,
        rparentok,
        dottok,
        andtok,
        ortok,
        plustok,
        dashtok,
        startok,
        comment, // Note comment must come before slashtok
        slashtok,
        modulustok, // "%%" is listed ahead of pcttok ("%")
        pcttok,
        eqeqtok, // "==" is listed ahead of fatcommatok ("=>") and equaltok ("=")
        notequaltok,
        matchtok,
        notmatchtok,
        // ">=" and "<=" are listed ahead of the single-character ">" and "<".
        complete!("Not >=".to_string(), gtequaltok),
        complete!("Not <=".to_string(), ltequaltok),
        gttok,
        lttok,
        fatcommatok, // Note fatcommatok must come before equaltok
        equaltok,
        semicolontok,
        doublecolontok, // "::" is listed ahead of colontok (":")
        colontok,
        leftsquarebracket,
        rightsquarebracket,
        booleantok,
        // Keyword parsers are listed ahead of the generic barewordtok.
        intok,
        istok,
        nottok,
        lettok,
        outtok,
        converttok,
        selecttok,
        asserttok,
        failtok,
        tracetok,
        functok,
        moduletok,
        importtok,
        includetok,
        astok,
        maptok,
        filtertok,
        reducetok,
        barewordtok,
        whitespace,
        end_of_input
    )
}
494
495/// Consumes an input OffsetStrIter and returns either a Vec<Token> or a error::Error.
496/// If a comment_map is passed in then it will store the comments indexed by their
497/// line number.
498pub fn tokenize<'a>(
499    input: OffsetStrIter<'a>,
500    mut comment_map: Option<&mut CommentMap>,
501) -> std::result::Result<Vec<Token>, BuildError> {
502    let mut out = Vec::new();
503    let mut i = input.clone();
504    let mut comment_group = Vec::new();
505    let mut comment_was_last: Option<Token> = None;
506    loop {
507        if let Result::Complete(_, _) = eoi(i.clone()) {
508            break;
509        }
510        match token(i.clone()) {
511            Result::Abort(e) => {
512                return Err(BuildError::from(e));
513            }
514            Result::Fail(e) => {
515                return Err(BuildError::from(e));
516            }
517            Result::Incomplete(_offset) => {
518                let err =
519                    abortable_parser::Error::new("Invalid Token encountered", Box::new(i.clone()));
520                return Err(BuildError::from(err));
521            }
522            Result::Complete(rest, tok) => {
523                i = rest;
524                match (&mut comment_map, &tok.typ) {
525                    // variants with a comment_map
526                    (&mut Some(_), &TokenType::COMMENT) => {
527                        comment_group.push(tok.clone());
528                        comment_was_last = Some(tok.clone());
529                        continue;
530                    }
531                    (&mut Some(ref mut map), _) => {
532                        if tok.typ != TokenType::WS {
533                            out.push(tok);
534                        }
535                        if let Some(tok) = comment_was_last {
536                            map.insert(tok.pos.line, comment_group);
537                            comment_group = Vec::new();
538                        }
539                    }
540                    // variants without a comment_map
541                    (None, TokenType::WS) | (None, TokenType::COMMENT) => continue,
542                    (None, _) => {
543                        out.push(tok);
544                    }
545                }
546                comment_was_last = None;
547            }
548        }
549    }
550    // if we had a comments at the end then we need to do a final
551    // insert into our map.
552    if let Some(ref mut map) = comment_map {
553        if let Some(ref tok) = comment_group.last() {
554            let line = tok.pos.line;
555            map.insert(line, comment_group);
556        }
557    }
558    // ensure that we always have an END token to go off of.
559    out.push(Token {
560        fragment: String::new(),
561        typ: TokenType::END,
562        pos: Position::from(&i),
563    });
564    Ok(out)
565}
566
567/// Clones a token.
568///
569/// This is necessary to allow the match_type and match_token macros to work.
570pub fn token_clone(t: &Token) -> std::result::Result<Token, Error<SliceIter<Token>>> {
571    Ok(t.clone())
572}
573
/// nom-style macro (built on abortable_parser) that matches a Token by type and
/// uses an optional conversion handler for the matched Token.
///
/// Forms:
/// * `match_type!(i, KIND)` matches a token of that kind and clones it.
/// * `match_type!(i, KIND => handler)` matches and passes the token to
///   `handler`, which returns a `std::result::Result`.
///
/// `KIND` is one of BOOLEAN, COMMENT, BAREWORD, EMPTY, STR, DIGIT or PUNCT.
/// The shorthand forms name `token_clone` unqualified, so it must be in scope
/// where the macro is invoked.
macro_rules! match_type {
    ($i:expr,BOOLEAN => $h:expr) => {
        match_type!($i, TokenType::BOOLEAN, "Not a Boolean", $h)
    };

    ($i:expr,BOOLEAN) => {
        match_type!($i, BOOLEAN => token_clone)
    };

    ($i:expr,COMMENT => $h:expr) => {
        match_type!($i, TokenType::COMMENT, "Not a Comment", $h)
    };

    ($i:expr,COMMENT) => {
        match_type!($i, COMMENT => token_clone)
    };

    ($i:expr,BAREWORD => $h:expr) => {
        match_type!($i, TokenType::BAREWORD, "Not a Bareword", $h)
    };

    ($i:expr,BAREWORD) => {
        match_type!($i, BAREWORD => token_clone)
    };

    ($i:expr,EMPTY => $h:expr) => {
        match_type!($i, TokenType::EMPTY, "Not NULL", $h)
    };

    ($i:expr,EMPTY) => {
        match_type!($i, EMPTY => token_clone)
    };

    // STR is the public spelling for tokens of type QUOTED.
    ($i:expr,STR => $h:expr) => {
        match_type!($i, TokenType::QUOTED, "Not a String", $h)
    };

    ($i:expr,STR) => {
        match_type!($i, STR => token_clone)
    };

    ($i:expr,DIGIT => $h:expr) => {
        match_type!($i, TokenType::DIGIT, "Not a DIGIT", $h)
    };

    ($i:expr,DIGIT) => {
        match_type!($i, DIGIT => token_clone)
    };

    ($i:expr,PUNCT => $h:expr) => {
        match_type!($i, TokenType::PUNCT, "Not PUNCTUATION", $h)
    };

    ($i:expr,PUNCT) => {
        match_type!($i, PUNCT => token_clone)
    };

    // Worker arm: `$t` is the expected TokenType, `$msg` the failure message
    // and `$h` the conversion handler. Every failure is a `Result::Fail`.
    ($i:expr, $t:expr, $msg:expr, $h:expr) => {{
        use abortable_parser::combinators::eoi;
        use abortable_parser::{Error, Result};
        use std;

        // Work on a copy so the caller's iterator is only advanced on success.
        let mut _i = $i.clone();
        if eoi(_i.clone()).is_complete() {
            Result::Fail(Error::new(format!("End of Input! {}", $msg), Box::new(_i)))
        } else {
            match _i.next() {
                Some(tok) => {
                    if tok.typ == $t {
                        match $h(tok) {
                            std::result::Result::Ok(v) => Result::Complete(_i.clone(), v),
                            // The handler rejected the token: wrap its error as the cause.
                            std::result::Result::Err(e) => {
                                Result::Fail(Error::caused_by($msg, Box::new(e), Box::new(_i)))
                            }
                        }
                    } else {
                        // Wrong token type: the error context is the unadvanced input.
                        Result::Fail(Error::new($msg.to_string(), Box::new($i)))
                    }
                }
                None => Result::Fail(Error::new($msg.to_string(), Box::new($i))),
            }
        }
    }};
}
660
/// nom style macro that matches various Tokens by type and value and allows optional
/// conversion handlers for the matched Token.
///
/// Forms:
/// * `match_token!(i, PUNCT => text)` / `match_token!(i, BAREWORD => text)`
///   match a token of that type whose fragment equals `text` and clone it.
/// * The same with a trailing `, handler` pass the matched token to `handler`,
///   which returns a `std::result::Result`.
///
/// Every failure is a `Result::Fail`. A type or text mismatch reports
/// `Expected <description> but got (<actual fragment>)` with the unadvanced
/// input as context.
macro_rules! match_token {
    ($i:expr,PUNCT => $f:expr) => {{
        use crate::tokenizer::token_clone;
        match_token!($i, PUNCT => $f, token_clone)
    }};

    ($i:expr,PUNCT => $f:expr, $h:expr) => {
        match_token!($i, TokenType::PUNCT, $f, format!("({})", $f), $h)
    };

    ($i:expr,BAREWORD => $f:expr) => {{
        use crate::tokenizer::token_clone;
        match_token!($i, BAREWORD => $f, token_clone)
    }};

    // The description passed here is only the "what was expected" part. The
    // worker arm adds the surrounding "Expected ... but got (...)" wording, so
    // it must not be repeated here.
    ($i:expr,BAREWORD => $f:expr, $h:expr) => {
        match_token!(
            $i,
            TokenType::BAREWORD,
            $f,
            format!("BAREWORD ({})", $f),
            $h
        )
    };

    // Worker arm: `$t` is the expected TokenType, `$f` the expected fragment,
    // `$msg` a description of what was expected and `$h` the handler.
    ($i:expr, $t:expr, $f:expr, $msg:expr, $h:expr) => {{
        // Import both names locally so the macro does not depend on what the
        // call site happens to have in scope.
        use abortable_parser::{Error, Result};
        use std;
        // Work on a copy so the caller's iterator is only advanced on success.
        let mut i_ = $i.clone();
        let tok = i_.next();
        if let Some(tok) = tok {
            if tok.typ == $t && &tok.fragment == $f {
                match $h(tok) {
                    std::result::Result::Ok(v) => Result::Complete(i_.clone(), v),
                    // The handler rejected the token: wrap its error as the cause.
                    std::result::Result::Err(e) => {
                        Result::Fail(Error::caused_by($msg, Box::new(e), Box::new(i_)))
                    }
                }
            } else {
                Result::Fail(Error::new(
                    format!("Expected {} but got ({})", $msg, tok.fragment),
                    Box::new($i.clone()),
                ))
            }
        } else {
            Result::Fail(Error::new("Unexpected End Of Input", Box::new(i_)))
        }
    }};
}
712
/// nom style macro that matches punctuation Tokens.
///
/// `punct!(i, text)` is shorthand for `match_token!(i, PUNCT => text)`: it
/// matches a PUNCT token whose fragment equals `text`.
macro_rules! punct {
    ($i:expr, $c:expr) => {
        match_token!($i, PUNCT => $c)
    };
}
719
/// nom style macro that matches a specific bareword Token.
///
/// `word!(i, text)` is shorthand for `match_token!(i, BAREWORD => text)`: it
/// matches a BAREWORD token whose fragment equals `text`.
macro_rules! word {
    ($i:expr, $w:expr) => {
        match_token!($i, BAREWORD => $w)
    };
}
726
727/// pos gets the current position from a TokenIter input without consuming it.
728pub fn pos<'a>(i: SliceIter<'a, Token>) -> Result<SliceIter<'a, Token>, Position> {
729    let mut _i = i.clone();
730    let tok = _i.next().unwrap();
731    let pos = tok.pos.clone();
732    Result::Complete(i, pos)
733}
734
735#[cfg(test)]
736mod test;