// erl_tokenize/tokenizer.rs

1use std::path::Path;
2
3use crate::{Position, PositionRange, Result, Token};
4
/// Tokenizer.
///
/// This is an iterator which tokenizes Erlang source code and iterates on the resulting tokens.
///
/// # Examples
///
/// ```
/// use erl_tokenize::Tokenizer;
///
/// let src = r#"io:format("Hello")."#;
/// let tokens = Tokenizer::new(src).collect::<Result<Vec<_>, _>>().unwrap();
///
/// assert_eq!(tokens.iter().map(|t| t.text()).collect::<Vec<_>>(),
///            ["io", ":", "format", "(", r#""Hello""#, ")", "."]);
/// ```
#[derive(Debug)]
pub struct Tokenizer<T> {
    // The Erlang source text being tokenized (any `AsRef<str>`).
    text: T,
    // Position at which the scan for the next token will start.
    next_pos: Position,
}
25impl<T> Tokenizer<T>
26where
27    T: AsRef<str>,
28{
29    /// Makes a new `Tokenizer` instance which tokenize the Erlang source code text.
30    pub fn new(text: T) -> Self {
31        let init_pos = Position::new();
32        Tokenizer {
33            text,
34            next_pos: init_pos,
35        }
36    }
37
38    /// Sets the file path of the succeeding tokens.
39    pub fn set_filepath<P: AsRef<Path>>(&mut self, filepath: P) {
40        self.next_pos.set_filepath(filepath);
41    }
42
43    /// Returns the input text.
44    pub fn text(&self) -> &str {
45        self.text.as_ref()
46    }
47
48    /// Finishes tokenization and returns the target text.
49    pub fn finish(self) -> T {
50        self.text
51    }
52
53    /// Returns the cursor position from which this tokenizer will start to scan the next token.
54    ///
55    /// # Examples
56    ///
57    /// ```
58    /// use erl_tokenize::Tokenizer;
59    ///
60    /// let src = r#"io:format(
61    ///   "Hello")."#;
62    ///
63    /// let mut tokenizer = Tokenizer::new(src);
64    /// assert_eq!(tokenizer.next_position().offset(), 0);
65    ///
66    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), "io");
67    /// assert_eq!(tokenizer.next_position().offset(), 2);
68    /// tokenizer.next(); // ':'
69    /// tokenizer.next(); // 'format'
70    /// tokenizer.next(); // '('
71    /// tokenizer.next(); // '\n'
72    /// assert_eq!(tokenizer.next_position().offset(), 11);
73    /// assert_eq!(tokenizer.next_position().line(), 2);
74    /// assert_eq!(tokenizer.next_position().column(), 1);
75    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), " ");
76    /// assert_eq!(tokenizer.next_position().offset(), 12);
77    /// assert_eq!(tokenizer.next_position().line(), 2);
78    /// assert_eq!(tokenizer.next_position().column(), 2);
79    /// ```
80    pub fn next_position(&self) -> Position {
81        self.next_pos.clone()
82    }
83
84    /// Sets the current position.
85    ///
86    /// Note that it's the responsibility of the user to specify a valid position.
87    /// Otherwise, the following tokenization process will raise an error.
88    ///
89    /// # Examples
90    ///
91    /// ```
92    /// use erl_tokenize::Tokenizer;
93    ///
94    /// let src = r#"io:format(
95    ///   "Hello")."#;
96    ///
97    /// let mut tokenizer = Tokenizer::new(src);
98    /// assert_eq!(tokenizer.next_position().offset(), 0);
99    ///
100    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), "io");
101    ///
102    /// let position = tokenizer.next_position();
103    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), ":");
104    /// tokenizer.next(); // 'format'
105    /// tokenizer.next(); // '('
106    /// tokenizer.next(); // '\n'
107    ///
108    /// tokenizer.set_position(position);
109    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), ":");
110    /// ```
111    pub fn set_position(&mut self, position: Position) {
112        self.next_pos = position;
113    }
114
115    /// Consumes the next char.
116    ///
117    /// This method can be used to recover from a tokenization error.
118    ///
119    /// # Examples
120    ///
121    /// ```
122    /// use erl_tokenize::Tokenizer;
123    ///
124    /// let src = r#"io:format("Hello")."#;
125    ///
126    /// let mut tokenizer = Tokenizer::new(src);
127    /// assert_eq!(tokenizer.next_position().offset(), 0);
128    ///
129    /// tokenizer.consume_char();
130    /// assert_eq!(tokenizer.next_position().offset(), 1);
131    /// ```
132    pub fn consume_char(&mut self) -> Option<char> {
133        if let Some(c) = self.text.as_ref()[self.next_pos.offset()..].chars().next() {
134            self.next_pos = self.next_pos.clone().step_by_char(c);
135            Some(c)
136        } else {
137            None
138        }
139    }
140}
141impl<T> Iterator for Tokenizer<T>
142where
143    T: AsRef<str>,
144{
145    type Item = Result<Token>;
146    fn next(&mut self) -> Option<Self::Item> {
147        if self.next_pos.offset() >= self.text.as_ref().len() {
148            None
149        } else {
150            let text = unsafe {
151                self.text
152                    .as_ref()
153                    .get_unchecked(self.next_pos.offset()..self.text.as_ref().len())
154            };
155            let cur_pos = self.next_pos.clone();
156            match Token::from_text(text, cur_pos) {
157                Err(e) => Some(Err(e)),
158                Ok(t) => {
159                    self.next_pos = t.end_position();
160                    Some(Ok(t))
161                }
162            }
163        }
164    }
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    // Regression test for https://github.com/sile/erlls/issues/5
    //
    // v0.8.1 caused the following error:
    // ```
    // thread 'tokenizer::tests::erlls_issue_5' panicked at src/tokenizer.rs:133:44:
    // byte index 32 is not a char boundary; it is inside '应' (bytes 31..34) of `-module(repro).
    // -moduledoc """
    // 应该报错
    // "".`
    // ```
    #[test]
    fn erlls_issue_5() {
        let text = r#"-module(repro).
-moduledoc """
应该报错
""."#;
        let mut tokenizer = Tokenizer::new(text);
        // An explicit `while let` loop (rather than `for`) is required here
        // because the body also calls `tokenizer.consume_char()`.
        while let Some(result) = tokenizer.next() {
            if result.is_err() {
                // Skip over the offending char and resume tokenization.
                tokenizer.consume_char();
            }
        }
    }
}