//! Regex-driven streaming tokenizer: skips whitespace/comments and yields
//! the first matching `(kind, pattern)` pair at the cursor.
use lazy_static::lazy_static;
use regex::Regex;

pub mod internal;

/// Compiles the given pattern into a `regex::Regex`, panicking if the
/// pattern is invalid.
///
/// Uses the fully-qualified `::regex::Regex` path so this `#[macro_export]`
/// macro expands correctly in downstream crates even when the caller has
/// not imported `Regex` into scope.
#[macro_export]
macro_rules! regexify {
    ($regex:expr) => {
        ::regex::Regex::new($regex).expect("Invalid regex.")
    };
}

lazy_static! {
    // Matches, anchored at the cursor (`^` in every alternative), the spans
    // the tokenizer skips: runs of whitespace, non-greedy block comments
    // (`(.|\n)` so they may span lines), and `//` line comments.
    // `(\n|$)` lets a line comment terminate at end of input as well as at
    // a newline; the original `\n`-only form left a trailing `//` comment
    // unskipped and made tokenization fail at EOF.
    static ref MERGED: Regex = Regex::new(r"(^\s+)|(^/\*(.|\n)*?\*/)|(^//.*(\n|$))").unwrap();
}

/// Streaming tokenizer: call `next` repeatedly to pull tokens out of `source`.
pub struct Tokenizer<'a> {
    // The full input text being tokenized.
    source: &'a str,
    // Ordered `(kind, pattern)` pairs; in `next` the first pattern that
    // matches at the cursor wins, so slice order encodes priority.
    duos: &'static [(&'static str, regex::Regex)],
    // Byte offset into `source` of the next unconsumed input.
    cursor: usize,
    // 1-based line of the cursor, maintained for diagnostics.
    line: usize,
    // 1-based column of the cursor. NOTE(review): updated with byte lengths
    // in `next`, so it drifts on multi-byte UTF-8 input — verify intent.
    column: usize,
}

/// A single lexeme produced by the tokenizer.
#[derive(Debug, Clone)]
pub struct Token<'a> {
    // The kind tag from the matching `(kind, regex)` pair.
    pub kind: &'a str,
    // The exact slice of the source that the pattern matched.
    pub value: &'a str,
    // (line, column) at which the token starts, both 1-based.
    pub pos: (usize, usize),
}

/// Outcome of a single `Tokenizer::next` call.
///
/// Derives `Debug`/`Clone` for parity with [`Token`], so callers can log
/// and retain results (backward-compatible addition).
#[derive(Debug, Clone)]
pub enum TokenizerResult<'a> {
    /// A token was matched at the cursor.
    Found(Token<'a>),
    /// No pattern matched; carries the (line, column) of the failure.
    Error(usize, usize),
    /// The entire source has been consumed.
    End,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `source` using the ordered `(kind, pattern)`
    /// pairs in `duos`; positions start at line 1, column 1.
    #[inline]
    pub fn new(source: &'a str, duos: &'static [(&'static str, regex::Regex)]) -> Self {
        Self {
            source,
            duos,
            cursor: 0,
            line: 1,
            column: 1,
        }
    }

    /// True once the cursor has consumed the entire source.
    #[inline]
    fn is_eof(&self) -> bool {
        self.cursor >= self.source.len()
    }

    /// Advances `line`/`column` past `skipped` (whitespace or a comment).
    ///
    /// Counts characters rather than bytes so multi-byte UTF-8 inside a
    /// skipped comment does not skew the reported column (the original
    /// arithmetic used byte lengths).
    fn advance_position(&mut self, skipped: &str) {
        match skipped.rfind('\n') {
            Some(idx) => {
                self.line += skipped.chars().filter(|&c| c == '\n').count();
                // Column restarts after the last newline; +1 keeps it 1-based.
                self.column = skipped[idx + 1..].chars().count() + 1;
            }
            None => self.column += skipped.chars().count(),
        }
    }

    /// Returns the next token, an error position if nothing matches at the
    /// cursor, or `End` once the source is exhausted.
    ///
    /// Whitespace and comments (per `MERGED`) are skipped first; then the
    /// first `duos` pattern that matches at the cursor wins.
    pub fn next(&mut self) -> TokenizerResult<'a> {
        if self.is_eof() {
            return TokenizerResult::End;
        }

        // Skip leading whitespace/comments, keeping line/column in sync.
        // MERGED's alternatives are all `^`-anchored, so each match starts
        // exactly at the cursor.
        while let Some(skipped) = MERGED.find(&self.source[self.cursor..]) {
            self.cursor += skipped.len();
            self.advance_position(skipped.as_str());
            if self.is_eof() {
                return TokenizerResult::End;
            }
        }

        for (kind, regex) in self.duos.iter() {
            if let Some(found) = regex.find(&self.source[self.cursor..]) {
                self.cursor += found.len();
                let token = Token {
                    kind,
                    value: found.as_str(),
                    pos: (self.line, self.column),
                };
                // Advance by character count, not byte count, so multi-byte
                // tokens report correct columns. NOTE(review): assumes token
                // patterns do not span newlines — confirm against `duos`.
                self.column += found.as_str().chars().count();
                return TokenizerResult::Found(token);
            }
        }

        // Nothing matched at the cursor: report where lexing got stuck.
        TokenizerResult::Error(self.line, self.column)
    }
}