use std::path::Path;

use crate::{Position, PositionRange, Result, Token};

/// Tokenizer.
///
/// This is an iterator that tokenizes Erlang source code and yields the resulting tokens.
///
/// # Examples
///
/// ```
/// use erl_tokenize::Tokenizer;
///
/// let src = r#"io:format("Hello")."#;
/// let tokens = Tokenizer::new(src).collect::<Result<Vec<_>, _>>().unwrap();
///
/// assert_eq!(tokens.iter().map(|t| t.text()).collect::<Vec<_>>(),
///            ["io", ":", "format", "(", r#""Hello""#, ")", "."]);
/// ```
#[derive(Debug)]
pub struct Tokenizer<T> {
    text: T,
    next_pos: Position,
}
impl<T> Tokenizer<T>
where
    T: AsRef<str>,
{
    /// Makes a new `Tokenizer` instance that tokenizes the given Erlang source code text.
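    ///
    /// # Examples
    ///
    /// A freshly created tokenizer starts scanning at the beginning of the text:
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let tokenizer = Tokenizer::new(r#"io:format("Hello")."#);
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    /// ```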
    pub fn new(text: T) -> Self {
        let init_pos = Position::new();
        Tokenizer {
            text,
            next_pos: init_pos,
        }
    }

    /// Sets the file path that will be assigned to the positions of subsequent tokens.
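    ///
    /// # Examples
    ///
    /// A minimal sketch; the file name below is hypothetical and used only for
    /// illustration:
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let mut tokenizer = Tokenizer::new(r#"io:format("Hello")."#);
    /// // "hello.erl" is an arbitrary example path, not a real file.
    /// tokenizer.set_filepath("hello.erl");
    /// ```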
    pub fn set_filepath<P: AsRef<Path>>(&mut self, filepath: P) {
        self.next_pos.set_filepath(filepath);
    }

    /// Returns the input text.
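    ///
    /// # Examples
    ///
    /// The returned text is the value originally passed to `new`:
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let tokenizer = Tokenizer::new("foo.");
    /// assert_eq!(tokenizer.text(), "foo.");
    /// ```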
    pub fn text(&self) -> &str {
        self.text.as_ref()
    }

    /// Finishes tokenization and returns the input text.
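    ///
    /// # Examples
    ///
    /// Tokens that have already been read do not affect the returned text:
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let src = r#"io:format("Hello")."#;
    /// let mut tokenizer = Tokenizer::new(src);
    /// tokenizer.next(); // 'io'
    /// assert_eq!(tokenizer.finish(), src);
    /// ```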
    pub fn finish(self) -> T {
        self.text
    }

    /// Returns the cursor position from which this tokenizer will start to scan the next token.
    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let src = r#"io:format(
    ///   "Hello")."#;
    ///
    /// let mut tokenizer = Tokenizer::new(src);
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    ///
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), "io");
    /// assert_eq!(tokenizer.next_position().offset(), 2);
    /// tokenizer.next(); // ':'
    /// tokenizer.next(); // 'format'
    /// tokenizer.next(); // '('
    /// tokenizer.next(); // '\n'
    /// assert_eq!(tokenizer.next_position().offset(), 11);
    /// assert_eq!(tokenizer.next_position().line(), 2);
    /// assert_eq!(tokenizer.next_position().column(), 1);
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), " ");
    /// assert_eq!(tokenizer.next_position().offset(), 12);
    /// assert_eq!(tokenizer.next_position().line(), 2);
    /// assert_eq!(tokenizer.next_position().column(), 2);
    /// ```
    pub fn next_position(&self) -> Position {
        self.next_pos.clone()
    }

    /// Sets the position from which this tokenizer will resume scanning.
    ///
    /// Note that it is the user's responsibility to specify a valid position;
    /// otherwise, subsequent tokenization may fail with an error.
    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let src = r#"io:format(
    ///   "Hello")."#;
    ///
    /// let mut tokenizer = Tokenizer::new(src);
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    ///
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), "io");
    ///
    /// let position = tokenizer.next_position();
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), ":");
    /// tokenizer.next(); // 'format'
    /// tokenizer.next(); // '('
    /// tokenizer.next(); // '\n'
    ///
    /// tokenizer.set_position(position);
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), ":");
    /// ```
    pub fn set_position(&mut self, position: Position) {
        self.next_pos = position;
    }

    /// Consumes the next char.
    ///
    /// This method can be used to skip an invalid character and recover from a tokenization error.
    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let src = r#"io:format("Hello")."#;
    ///
    /// let mut tokenizer = Tokenizer::new(src);
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    ///
    /// assert_eq!(tokenizer.consume_char(), Some('i'));
    /// assert_eq!(tokenizer.next_position().offset(), 1);
    /// ```
    pub fn consume_char(&mut self) -> Option<char> {
        if let Some(c) = self.text.as_ref()[self.next_pos.offset()..].chars().next() {
            self.next_pos = self.next_pos.clone().step_by_char(c);
            Some(c)
        } else {
            None
        }
    }
}
impl<T> Iterator for Tokenizer<T>
where
    T: AsRef<str>,
{
    type Item = Result<Token>;
    fn next(&mut self) -> Option<Self::Item> {
        if self.next_pos.offset() >= self.text.as_ref().len() {
            None
        } else {
            // Slice off the not-yet-tokenized rest of the input. Safe indexing
            // panics on an invalid offset (e.g. one set via `set_position` that
            // is not on a char boundary) instead of causing undefined behavior.
            let text = &self.text.as_ref()[self.next_pos.offset()..];
            let cur_pos = self.next_pos.clone();
            match Token::from_text(text, cur_pos) {
                Err(e) => Some(Err(e)),
                Ok(t) => {
                    self.next_pos = t.end_position();
                    Some(Ok(t))
                }
            }
        }
    }
}