use super::*;
use regex::Regex;
use std::cmp::Ordering;
use std::rc::Rc;
#[derive(Clone, Debug)]
pub(crate) struct TokenizerHelper {
/// The source of the file being parsed.
source: String,
/// The current byte index into the source, i.e. the cursor position in the file.
index: usize,
/// The current line number in the file,
/// equal to the number of newlines encountered so far.
///
/// The first line of the source is line `0`, not `1`.
line_nr: usize,
/// The current column in the file: the number of chars (not bytes) since the last newline.
///
/// The first character of the line is `0`, not `1`.
column_nr: usize,
/// Counter used to hand out unique ids during creation of the tree.
/// Each node in the tree requires a unique id.
id_counter: u64,
/// The tree that the created nodes will belong to.
tree: Rc<Tree>,
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) enum TokenMatchStatus {
/// We found the token in the next bytes.
Ok(DataNode),
/// We found the token, but there was unexpected text in front of it;
/// the first node is an `ERROR` node covering that text.
OkWithPrefixFound(DataNode, DataNode),
/// We reached the end of the file; there is nothing left to match.
EoF,
/// We did not find a match for the regex.
NoMatch,
}
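// How a caller is expected to consume these statuses (an illustrative
// sketch, not code from this crate; `helper`, `regex`, and `parent_id`
// are hypothetical):
//
//     match helper.get_next_match(&regex, "number", None, false, true) {
//         TokenMatchStatus::Ok(node) => helper.add_node_to_tree(node, parent_id),
//         TokenMatchStatus::OkWithPrefixFound(prefix, node) => {
//             // Keep the unexpected text as an ERROR node next to the token.
//             helper.add_node_to_tree(prefix, parent_id);
//             helper.add_node_to_tree(node, parent_id);
//         }
//         TokenMatchStatus::EoF => { /* stop tokenizing */ }
//         TokenMatchStatus::NoMatch => { /* try the next rule, or recover */ }
//     }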
impl TokenizerHelper {
/// Create a new `TokenizerHelper` with the cursor at the first byte of the source.
pub fn new(source: String, tree: &Rc<Tree>) -> Self {
Self {
source,
index: 0,
line_nr: 0,
column_nr: 0,
id_counter: 0,
tree: tree.clone(),
}
}
/// Check whether the next bytes in the file match the provided character.
/// Compares the character's UTF-8 encoding byte for byte.
pub fn check_if_next_char_match(&self, character: char) -> bool {
let source_bytes = self.source.as_bytes();
let mut buf = [0u8; 4];
let encoded = character.encode_utf8(&mut buf).as_bytes();
let end = self.index + encoded.len();
end <= source_bytes.len() && &source_bytes[self.index..end] == encoded
}
/// Check whether the next bytes in the file match the given sequence of characters.
/// Compares the UTF-8 encoding of each character, in order.
pub fn check_if_next_chars_match(&self, chars: &[char]) -> bool {
let source_bytes = self.source.as_bytes();
let mut offset = self.index;
let mut buf = [0u8; 4];
for &character in chars {
let encoded = character.encode_utf8(&mut buf).as_bytes();
let end = offset + encoded.len();
if end > source_bytes.len() || &source_bytes[offset..end] != encoded {
return false;
}
offset = end;
}
true
}
/// Check whether the next bytes in the file match any of the listed characters.
pub fn check_if_next_char_matches_any_of(&self, chars: &[char]) -> bool {
chars.iter().any(|&character| self.check_if_next_char_match(character))
}
/// Check if we reached EoF.
pub fn check_if_eof(&self) -> bool {
let source_bytes = self.source.as_bytes();
self.index >= source_bytes.len()
}
/// Look ahead in the file for a match of the regex rule.
/// On success a token is created and returned; otherwise the returned status
/// tells the caller why no token was produced.
///
/// If `optional` is set, any text in front of the match is reported as `NoMatch`.
/// If `move_cursor` is set, the cursor is advanced past the match.
pub fn get_next_match(
&mut self,
regex: &Regex,
kind: &str,
name: Option<&str>,
optional: bool,
move_cursor: bool,
) -> TokenMatchStatus {
// Check if we already reached the end of the source.
if self.index >= self.source.len() {
return TokenMatchStatus::EoF;
}
// Look for the first match from the cursor onwards.
let (_done, to_match) = self.source.split_at(self.index);
if let Some(mat) = regex.find(to_match) {
// Keep track of the start of the token.
let start_byte = self.index + mat.start();
let start_point = self.get_moved_point(start_byte).unwrap();
let mut prefix_found = false;
// Get all the text before the found match.
let (prefix, _) = to_match.split_at(mat.start());
if !prefix.is_empty() {
if optional {
return TokenMatchStatus::NoMatch;
}
// Check if the prefix spans multiple lines.
// Report no match to avoid over-extending the token:
// this is safe to assume, because a match that skipped past
// a newline has almost certainly skipped too far. (shortcut)
// TODO: take a better look at lines with multiple tokens.
if prefix.contains('\n') {
return TokenMatchStatus::NoMatch;
}
log::error!("Found string before match in ({}): {}", kind, prefix);
prefix_found = true;
}
// Calculate new index
let new_index = self.index + mat.end();
let end_byte = new_index;
// Create an ERROR node covering any text found before the match.
let prefix_node = if prefix_found {
let prefix_start_byte = self.index;
let prefix_end_byte = self.index + mat.start();
let mut prefix_node = self.create_tsnode(
prefix_start_byte,
self.get_point(),
prefix_end_byte,
self.get_moved_point(prefix_end_byte).unwrap(),
"ERROR",
Some("ERROR"),
);
prefix_node.kind_id = u16::MAX;
Some(prefix_node)
} else {
None
};
// Get point and maybe move cursor
let end_point = if move_cursor {
self.move_index(new_index)
} else {
self.get_moved_point(new_index).unwrap()
};
// Create the token we just found.
let node = self.create_tsnode(start_byte, start_point, end_byte, end_point, kind, name);
match prefix_node {
Some(prefix) => TokenMatchStatus::OkWithPrefixFound(prefix, node),
None => TokenMatchStatus::Ok(node),
}
} else {
TokenMatchStatus::NoMatch
}
}
/// Create a new node for the tree. The node is not automatically added to the tree.
pub fn create_tsnode(
&mut self,
start_byte: usize,
start_point: Point,
end_byte: usize,
end_point: Point,
kind: &str,
name: Option<&str>,
) -> DataNode {
let name = name.map(ToOwned::to_owned);
self.id_counter += 1;
DataNode {
id: self.id_counter,
kind_id: 0,
kind: kind.to_owned(),
name,
start_byte,
end_byte,
start_point,
end_point,
children_ids: vec![],
parent_id: None,
next_sibling_id: None,
prev_sibling_id: None,
tree: Rc::downgrade(&self.tree),
}
}
/// Create a new zero-width node that starts (and, until updated, ends) at the current cursor position.
pub fn create_start_tsnode(&mut self, kind: &str, name: Option<&str>) -> DataNode {
let name = name.map(ToOwned::to_owned);
self.id_counter += 1;
DataNode {
id: self.id_counter,
kind_id: 0,
kind: kind.to_owned(),
name,
start_byte: self.index,
end_byte: self.index,
start_point: self.get_point(),
end_point: self.get_point(),
children_ids: vec![],
parent_id: None,
next_sibling_id: None,
prev_sibling_id: None,
tree: Rc::downgrade(&self.tree),
}
}
/// Set the end of node `k` in the tree to the current cursor position.
pub fn set_end_point_for(&self, k: u64) {
if let Some(mut node) = self.tree.get_tsnode(k) {
node.end_byte = self.index;
node.end_point = self.get_point();
self.tree.update_node(k, node);
}
}
/// Add a previously created node to the tree, in a particular position.
pub fn add_node_to_tree(&self, node: DataNode, parent_id: u64) {
let node_id = node.id;
self.tree.add_tsnode(node);
self.tree.add_child_to_node(parent_id, node_id);
}
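// Typical lifetime of a composite node (an illustrative sketch; `helper`
// and `parent_id` are hypothetical, and the tokenizing in between is elided):
//
//     let node = helper.create_start_tsnode("block", None);
//     let node_id = node.id;
//     helper.add_node_to_tree(node, parent_id);
//     // ... tokenize the block's contents, moving the cursor forward ...
//     helper.set_end_point_for(node_id);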
/// Get the current point in the code.
pub fn get_point(&self) -> Point {
Point {
row: self.line_nr,
column: self.column_nr,
}
}
/// Get the byte index of EoF, i.e. the length of the source.
pub fn get_eof_index(&self) -> usize {
self.source.len()
}
/// Move the cursor forward in the file; moving backwards is not possible
/// and will cause a panic.
pub fn move_index(&mut self, new_index: usize) -> Point {
let new_point = self.get_moved_point(new_index).unwrap();
self.line_nr = new_point.row;
self.column_nr = new_point.column;
self.index = new_index;
new_point
}
/// Set the index and point directly. Use with care:
/// the caller must make sure the index and point agree with each other,
/// otherwise later diagnostics may report incorrect positions.
pub fn direct_mode_index_and_point(&mut self, new_index: usize, new_point: Point) {
if new_index < self.index {
log::error!(
"Can not move index backwards: old:{}, new:{}",
self.index,
new_index
);
} else {
self.line_nr = new_point.row;
self.column_nr = new_point.column;
self.index = new_index;
}
}
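// Example (illustrative): if "ab\nc" was consumed externally from the start
// of the file, the matching call would be
//
//     helper.direct_mode_index_and_point(4, Point { row: 1, column: 1 });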
/// Get the raw text of a data node from the source.
/// The returned text is always valid UTF-8, since the source is a `String`.
pub fn data_node_text(&self, data_node: &DataNode) -> String {
data_node.get_text(&self.source)
}
/// Calculate the new cursor position.
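///
/// A worked example (illustrative): with `source = "ab\ncde"`, the cursor at
/// index 0 (row 0, column 0), and `new_index = 5`, the consumed text is
/// `"ab\ncd"`. It contains one newline, so the row becomes 1; the text after
/// the last newline is `"cd"`, so the column becomes 2.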
fn get_moved_point(&self, new_index: usize) -> Result<Point, ()> {
match new_index.cmp(&self.index) {
Ordering::Greater => {}
Ordering::Less => {
log::error!(
"Can not move index backwards: old:{}, new:{}",
self.index,
new_index
);
return Err(());
}
Ordering::Equal => {
return Ok(self.get_point());
}
}
let (_done, to_match) = self.source.split_at(self.index);
let (current, _future) = to_match.split_at(new_index - self.index);
// log::debug!("Move past: {}", current);
// Count the number of newlines.
// TODO: counting takes about 33% of the CPU time of `get_next_match()`.
let amount_of_newlines = current.matches('\n').count();
let new_line_nr = self.line_nr + amount_of_newlines;
// Count chars (not bytes) since the last newline.
let new_column_nr = if let Some((_text, last_line)) = current.rsplit_once('\n') {
// A newline was crossed: the column restarts at the last line.
last_line.chars().count()
} else {
// No newline crossed: extend the current column.
self.column_nr + current.chars().count()
};
// log::debug!(
// "Cursor pos: {} - {}:{}",
// self.index, self.line_nr, self.column_nr
// );
Ok(Point {
row: new_line_nr,
column: new_column_nr,
})
}
}