erl_tokenize/tokenizer.rs
use std::path::Path;

use crate::{Position, PositionRange, Result, Token};

/// Tokenizer.
///
/// An iterator that tokenizes Erlang source code and yields the resulting tokens.
///
/// # Examples
///
/// ```
/// use erl_tokenize::Tokenizer;
///
/// let src = r#"io:format("Hello")."#;
/// let tokens = Tokenizer::new(src).collect::<Result<Vec<_>, _>>().unwrap();
///
/// assert_eq!(tokens.iter().map(|t| t.text()).collect::<Vec<_>>(),
///            ["io", ":", "format", "(", r#""Hello""#, ")", "."]);
/// ```
#[derive(Debug)]
pub struct Tokenizer<T> {
    text: T,
    next_pos: Position,
}
impl<T> Tokenizer<T>
where
    T: AsRef<str>,
{
    /// Makes a new `Tokenizer` instance that tokenizes the given Erlang source code text.
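    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// // Any `AsRef<str>` value can serve as the input text.
    /// let tokenizer = Tokenizer::new("io:format(\"Hello\").");
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    /// ```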
    pub fn new(text: T) -> Self {
        let init_pos = Position::new();
        Tokenizer {
            text,
            next_pos: init_pos,
        }
    }

    /// Sets the file path that will be associated with the succeeding tokens.
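    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let mut tokenizer = Tokenizer::new("foo.");
    /// // Positions of tokens scanned after this call will carry the given path.
    /// tokenizer.set_filepath("src/foo.erl");
    /// ```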
    pub fn set_filepath<P: AsRef<Path>>(&mut self, filepath: P) {
        self.next_pos.set_filepath(filepath);
    }

    /// Returns the input text.
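    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let tokenizer = Tokenizer::new("io:format(\"Hello\").");
    /// assert_eq!(tokenizer.text(), r#"io:format("Hello")."#);
    /// ```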
    pub fn text(&self) -> &str {
        self.text.as_ref()
    }

    /// Finishes tokenization and returns the original input text.
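    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let mut tokenizer = Tokenizer::new("foo.");
    /// tokenizer.next(); // scans "foo"
    /// // The input text is given back regardless of how far tokenization has progressed.
    /// assert_eq!(tokenizer.finish(), "foo.");
    /// ```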
    pub fn finish(self) -> T {
        self.text
    }

    /// Returns the cursor position from which this tokenizer will start to scan the next token.
    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let src = r#"io:format(
    ///  "Hello")."#;
    ///
    /// let mut tokenizer = Tokenizer::new(src);
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    ///
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), "io");
    /// assert_eq!(tokenizer.next_position().offset(), 2);
    /// tokenizer.next(); // ':'
    /// tokenizer.next(); // 'format'
    /// tokenizer.next(); // '('
    /// tokenizer.next(); // '\n'
    /// assert_eq!(tokenizer.next_position().offset(), 11);
    /// assert_eq!(tokenizer.next_position().line(), 2);
    /// assert_eq!(tokenizer.next_position().column(), 1);
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), " ");
    /// assert_eq!(tokenizer.next_position().offset(), 12);
    /// assert_eq!(tokenizer.next_position().line(), 2);
    /// assert_eq!(tokenizer.next_position().column(), 2);
    /// ```
    pub fn next_position(&self) -> Position {
        self.next_pos.clone()
    }

    /// Sets the current position.
    ///
    /// Note that it is the user's responsibility to specify a valid position;
    /// otherwise, subsequent tokenization will return an error.
    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let src = r#"io:format(
    ///  "Hello")."#;
    ///
    /// let mut tokenizer = Tokenizer::new(src);
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    ///
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), "io");
    ///
    /// let position = tokenizer.next_position();
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), ":");
    /// tokenizer.next(); // 'format'
    /// tokenizer.next(); // '('
    /// tokenizer.next(); // '\n'
    ///
    /// // Rewind the tokenizer to the saved position and scan ':' again.
    /// tokenizer.set_position(position);
    /// assert_eq!(tokenizer.next().unwrap().map(|t| t.text().to_owned()).unwrap(), ":");
    /// ```
    pub fn set_position(&mut self, position: Position) {
        self.next_pos = position;
    }

    /// Consumes the next char.
    ///
    /// This method can be used to recover from a tokenization error.
    ///
    /// # Examples
    ///
    /// ```
    /// use erl_tokenize::Tokenizer;
    ///
    /// let src = r#"io:format("Hello")."#;
    ///
    /// let mut tokenizer = Tokenizer::new(src);
    /// assert_eq!(tokenizer.next_position().offset(), 0);
    ///
    /// tokenizer.consume_char();
    /// assert_eq!(tokenizer.next_position().offset(), 1);
    /// ```
    pub fn consume_char(&mut self) -> Option<char> {
        if let Some(c) = self.text.as_ref()[self.next_pos.offset()..].chars().next() {
            self.next_pos = self.next_pos.clone().step_by_char(c);
            Some(c)
        } else {
            None
        }
    }
}
impl<T> Iterator for Tokenizer<T>
where
    T: AsRef<str>,
{
    type Item = Result<Token>;
    fn next(&mut self) -> Option<Self::Item> {
        if self.next_pos.offset() >= self.text.as_ref().len() {
            None
        } else {
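            // SAFETY: `next_pos` only advances by whole tokens or whole chars,
            // so its offset always lies on a char boundary (provided that any
            // position passed to `set_position` was valid, as its
            // documentation requires).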
            let text = unsafe {
                self.text
                    .as_ref()
                    .get_unchecked(self.next_pos.offset()..self.text.as_ref().len())
            };
            let cur_pos = self.next_pos.clone();
            match Token::from_text(text, cur_pos) {
                Err(e) => Some(Err(e)),
                Ok(t) => {
                    self.next_pos = t.end_position();
                    Some(Ok(t))
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // https://github.com/sile/erlls/issues/5
    //
    // v0.8.1 caused the following error:
    // ```
    // thread 'tokenizer::tests::erlls_issue_5' panicked at src/tokenizer.rs:133:44:
    // byte index 32 is not a char boundary; it is inside '应' (bytes 31..34) of `-module(repro).
    // -moduledoc """
    // 应该报错
    // "".`
    // ```
    #[test]
    fn erlls_issue_5() {
        let text = r#"-module(repro).
-moduledoc """
应该报错
""."#;
        let mut tokenizer = Tokenizer::new(text);
        // On a tokenization error, skip one char and keep scanning; this must
        // not panic even when the error position falls inside a multi-byte char.
        while let Some(token) = tokenizer.next() {
            let Ok(_token) = token else {
                tokenizer.consume_char();
                continue;
            };
        }
    }
}