1use std::{iter::Peekable, slice::Iter};
8
9use crate::{
10 compiler::{CompileError, ErrorType, Number},
11 runtime::eval::IntoString,
12 Compiler,
13};
14
15use super::{word::lookup_words, StringConstant, Token};
16
17pub(crate) struct Tokenizer<'x> {
18 pub compiler: &'x Compiler,
19 pub iter: Peekable<Iter<'x, u8>>,
20 pub buf: Vec<u8>,
21 pub next_token: Vec<TokenInfo>,
22
23 pub pos: usize,
24 pub line_num: usize,
25 pub line_start: usize,
26
27 pub text_line_num: usize,
28 pub text_line_pos: usize,
29
30 pub token_line_num: usize,
31 pub token_line_pos: usize,
32
33 pub token_is_tag: bool,
34
35 pub last_ch: u8,
36 pub state: State,
37}
38
/// A lexed token together with the source position it was found at.
#[derive(Debug)]
pub(crate) struct TokenInfo {
    /// The token itself.
    pub(crate) token: Token,
    /// Line number recorded by the tokenizer for this token.
    pub(crate) line_num: usize,
    /// Offset within the line recorded by the tokenizer for this token.
    pub(crate) line_pos: usize,
}
45
/// Lexing mode of the [`Tokenizer`].
pub(crate) enum State {
    /// Regular mode: words, numbers, tags and punctuation.
    None,
    /// Inside a `/* ... */` comment.
    BracketComment,
    /// Inside a `#` comment, which runs until the end of the line.
    HashComment,
    /// Inside a double-quoted string; carries what has been seen so far.
    QuotedString(StringType),
    /// Inside a `text:` block, which ends at a line consisting of a single `.`;
    /// carries what has been seen so far.
    MultiLine(StringType),
}
53
/// Flags collected while a string is being read. They decide whether the
/// finished string becomes a variable string, a number, or a plain constant.
/// Every flag starts out `false`.
#[derive(Clone, Copy, Default)]
pub(crate) struct StringType {
    /// At least one ASCII digit was seen.
    has_digits: bool,
    /// At least one `.` was seen.
    has_dots: bool,
    /// Something that cannot be part of a number was seen (a second `.` counts).
    has_other: bool,
    /// A `${` or `%{` sequence was seen, so the string may reference a variable.
    maybe_variable: bool,
}
61
62impl<'x> Tokenizer<'x> {
63 pub fn new(compiler: &'x Compiler, bytes: &'x [u8]) -> Self {
64 Tokenizer {
65 compiler,
66 iter: bytes.iter().peekable(),
67 buf: Vec::with_capacity(bytes.len() / 2),
68 pos: usize::MAX,
69 line_num: 1,
70 line_start: 0,
71 text_line_num: 0,
72 text_line_pos: 0,
73 token_line_num: 0,
74 token_line_pos: 0,
75 token_is_tag: false,
76 next_token: Vec::with_capacity(2),
77 last_ch: 0,
78 state: State::None,
79 }
80 }
81
82 pub fn get_current_token(&mut self) -> Option<TokenInfo> {
83 if !self.buf.is_empty() {
84 let word = std::str::from_utf8(&self.buf).unwrap();
85 let token = if let Some(word) = lookup_words(word) {
86 if self.token_is_tag {
87 self.token_line_pos -= 1;
88 Token::Tag(word)
89 } else {
90 Token::Identifier(word)
91 }
92 } else if self.buf.first().unwrap().is_ascii_digit() {
93 let multiplier = match self.buf.last().unwrap() {
94 b'k' => 1024,
95 b'm' => 1048576,
96 b'g' => 1073741824,
97 _ => 1,
98 };
99
100 if let Ok(number) = (if multiplier > 1 && self.buf.len() > 1 {
101 std::str::from_utf8(&self.buf[..self.buf.len() - 1]).unwrap()
102 } else {
103 word
104 })
105 .parse::<usize>()
106 {
107 Token::Number(number.saturating_mul(multiplier))
108 } else if self.token_is_tag {
109 Token::Unknown(format!(":{word}"))
110 } else {
111 Token::Unknown(word.to_string())
112 }
113 } else if self.token_is_tag {
114 Token::Unknown(format!(":{word}"))
115 } else {
116 Token::Unknown(word.to_string())
117 };
118
119 self.reset_current_token();
120
121 Some(TokenInfo {
122 token,
123 line_num: self.token_line_num,
124 line_pos: self.token_line_pos,
125 })
126 } else {
127 None
128 }
129 }
130
131 #[inline(always)]
132 pub fn reset_current_token(&mut self) {
133 self.buf.clear();
134 self.token_is_tag = false;
135 }
136
137 #[inline(always)]
138 pub fn token_is_tag(&mut self) {
139 self.token_is_tag = true;
140 }
141
142 pub fn get_token(&mut self, token: Token) -> TokenInfo {
143 let next_token = TokenInfo {
144 token,
145 line_num: self.line_num,
146 line_pos: self.pos - self.line_start,
147 };
148 if let Some(token) = self.get_current_token() {
149 self.next_token.push(next_token);
150 token
151 } else {
152 next_token
153 }
154 }
155
156 pub fn get_string(&mut self, str_type: StringType) -> Result<TokenInfo, CompileError> {
157 if self.buf.len() < self.compiler.max_string_size {
158 let token = if str_type.maybe_variable {
159 Token::StringVariable(self.buf.to_vec())
160 } else {
161 let constant = self.buf.to_vec().into_string();
162 if !str_type.has_other && str_type.has_digits {
163 if !str_type.has_dots {
164 if let Some(number) = constant.parse::<i64>().ok().and_then(|n| {
165 if n.to_string() == constant {
166 Some(n)
167 } else {
168 None
169 }
170 }) {
171 Token::StringConstant(StringConstant::Number(Number::Integer(number)))
172 } else {
173 Token::StringConstant(StringConstant::String(constant))
174 }
175 } else if let Some(number) = constant.parse::<f64>().ok().and_then(|n| {
176 if n.to_string() == constant {
177 Some(n)
178 } else {
179 None
180 }
181 }) {
182 Token::StringConstant(StringConstant::Number(Number::Float(number)))
183 } else {
184 Token::StringConstant(StringConstant::String(constant))
185 }
186 } else {
187 Token::StringConstant(StringConstant::String(constant))
188 }
189 };
190
191 self.buf.clear();
192
193 Ok(TokenInfo {
194 token,
195 line_num: self.text_line_num,
196 line_pos: self.text_line_pos,
197 })
198 } else {
199 Err(CompileError {
200 line_num: self.text_line_num,
201 line_pos: self.text_line_pos,
202 error_type: ErrorType::StringTooLong,
203 })
204 }
205 }
206
207 #[inline(always)]
208 pub fn push_byte(&mut self, ch: u8) {
209 if self.buf.is_empty() {
210 self.token_line_num = self.line_num;
211 self.token_line_pos = self.pos - self.line_start;
212 }
213 self.buf.push(ch);
214 }
215
216 #[inline(always)]
217 pub fn new_line(&mut self) {
218 self.line_num += 1;
219 self.line_start = self.pos;
220 }
221
222 #[inline(always)]
223 pub fn text_start(&mut self) {
224 self.text_line_num = self.line_num;
225 self.text_line_pos = self.pos - self.line_start;
226 }
227
228 #[inline(always)]
229 pub fn is_token_start(&self) -> bool {
230 self.buf.is_empty()
231 }
232
233 #[inline(always)]
234 pub fn token_bytes(&self) -> &[u8] {
235 &self.buf
236 }
237
238 #[inline(always)]
239 pub fn next_byte(&mut self) -> Option<(u8, u8)> {
240 self.iter.next().map(|&ch| {
241 let last_ch = self.last_ch;
242 self.pos = self.pos.wrapping_add(1);
243 self.last_ch = ch;
244 (ch, last_ch)
245 })
246 }
247
248 #[inline(always)]
249 pub fn peek_byte(&mut self) -> Option<u8> {
250 self.iter.peek().map(|ch| **ch)
251 }
252
253 pub fn unwrap_next(&mut self) -> Result<TokenInfo, CompileError> {
254 if let Some(token) = self.next() {
255 token
256 } else {
257 Err(CompileError {
258 line_num: self.line_num,
259 line_pos: self.pos - self.line_start,
260 error_type: ErrorType::UnexpectedEOF,
261 })
262 }
263 }
264
265 pub fn expect_token(&mut self, token: Token) -> Result<(), CompileError> {
266 let next_token = self.unwrap_next()?;
267 if next_token.token == token {
268 Ok(())
269 } else {
270 Err(next_token.expected(format!("'{token}'")))
271 }
272 }
273
274 pub fn expect_static_string(&mut self) -> Result<String, CompileError> {
275 let next_token = self.unwrap_next()?;
276 match next_token.token {
277 Token::StringConstant(s) => Ok(s.into_string()),
278 Token::BracketOpen => {
279 let mut string = None;
280 loop {
281 let token_info = self.unwrap_next()?;
282 match token_info.token {
283 Token::StringConstant(string_) => {
284 string = string_.into();
285 }
286 Token::BracketClose if string.is_some() => break,
287 _ => return Err(token_info.expected("constant string")),
288 }
289 }
290 Ok(string.unwrap().into_string())
291 }
292 _ => Err(next_token.expected("constant string")),
293 }
294 }
295
296 pub fn expect_number(&mut self, max_value: usize) -> Result<usize, CompileError> {
297 let next_token = self.unwrap_next()?;
298 if let Token::Number(n) = next_token.token {
299 if n < max_value {
300 Ok(n)
301 } else {
302 Err(next_token.expected(format!("number lower than {max_value}")))
303 }
304 } else {
305 Err(next_token.expected("number"))
306 }
307 }
308
309 pub fn invalid_character(&self) -> CompileError {
310 CompileError {
311 line_num: self.line_num,
312 line_pos: self.pos - self.line_start,
313 error_type: ErrorType::InvalidCharacter(self.last_ch),
314 }
315 }
316
317 pub fn peek(&mut self) -> Option<Result<&TokenInfo, CompileError>> {
318 if self.next_token.is_empty() {
319 match self.next()? {
320 Ok(next_token) => self.next_token.push(next_token),
321 Err(err) => return Some(Err(err)),
322 }
323 }
324 self.next_token.last().map(Ok)
325 }
326}
327
328impl Iterator for Tokenizer<'_> {
329 type Item = Result<TokenInfo, CompileError>;
330
331 fn next(&mut self) -> Option<Self::Item> {
332 if let Some(prev_token) = self.next_token.pop() {
333 return Some(Ok(prev_token));
334 }
335
336 'outer: while let Some((ch, last_ch)) = self.next_byte() {
337 match self.state {
338 State::None => match ch {
339 b'a'..=b'z' | b'0'..=b'9' | b'_' | b'.' | b'$' => {
340 self.push_byte(ch);
341 }
342 b'A'..=b'Z' => {
343 self.push_byte(ch.to_ascii_lowercase());
344 }
345 b':' => {
346 if self.is_token_start()
347 && matches!(self.peek_byte(), Some(b) if b.is_ascii_alphabetic())
348 {
349 self.token_is_tag();
350 } else if self.token_bytes().eq_ignore_ascii_case(b"text") {
351 self.state = State::MultiLine(StringType::default());
352 self.text_start();
353 while let Some((ch, _)) = self.next_byte() {
354 if ch == b'\n' {
355 self.new_line();
356 self.reset_current_token();
357 continue 'outer;
358 }
359 }
360 } else {
361 return Some(Ok(self.get_token(Token::Colon)));
362 }
364 }
365 b'"' => {
366 self.state = State::QuotedString(StringType::default());
367 self.text_start();
368 if let Some(token) = self.get_current_token() {
369 return Some(Ok(token));
370 }
371 }
372 b'{' => {
373 return Some(Ok(self.get_token(Token::CurlyOpen)));
374 }
375 b'}' => {
376 return Some(Ok(self.get_token(Token::CurlyClose)));
377 }
378 b';' => {
379 return Some(Ok(self.get_token(Token::Semicolon)));
380 }
381 b',' => {
382 return Some(Ok(self.get_token(Token::Comma)));
383 }
384 b'[' => {
385 return Some(Ok(self.get_token(Token::BracketOpen)));
386 }
387 b']' => {
388 return Some(Ok(self.get_token(Token::BracketClose)));
389 }
390 b'(' => {
391 return Some(Ok(self.get_token(Token::ParenthesisOpen)));
392 }
393 b')' => {
394 return Some(Ok(self.get_token(Token::ParenthesisClose)));
395 }
396 b'/' => {
397 if let Some((b'*', _)) = self.next_byte() {
398 self.last_ch = 0;
399 self.state = State::BracketComment;
400 self.text_start();
401 if let Some(token) = self.get_current_token() {
402 return Some(Ok(token));
403 }
404 } else {
405 return Some(Err(self.invalid_character()));
406 }
407 }
408 b'#' => {
409 self.state = State::HashComment;
410 if let Some(token) = self.get_current_token() {
411 return Some(Ok(token));
412 }
413 }
414 b'\n' => {
415 self.new_line();
416 if let Some(token) = self.get_current_token() {
417 return Some(Ok(token));
418 }
419 }
420 b' ' | b'\t' | b'\r' => {
421 if let Some(token) = self.get_current_token() {
422 return Some(Ok(token));
423 }
424 }
425 _ => {
426 return Some(Err(self.invalid_character()));
427 }
428 },
429 State::BracketComment { .. } => match ch {
430 b'/' if last_ch == b'*' => {
431 self.state = State::None;
432 }
433 b'\n' => {
434 self.new_line();
435 }
436 _ => (),
437 },
438 State::HashComment => {
439 if ch == b'\n' {
440 self.state = State::None;
441 self.new_line();
442 }
443 }
444 State::QuotedString(mut str_type) => match ch {
445 b'"' if last_ch != b'\\' => {
446 self.state = State::None;
447 return Some(self.get_string(str_type));
448 }
449 b'\n' => {
450 self.new_line();
451 self.push_byte(b'\n');
452 str_type.has_other = true;
453 self.state = State::QuotedString(str_type);
454 }
455 b'{' if (last_ch == b'$' || last_ch == b'%') => {
456 str_type.maybe_variable = true;
457 self.state = State::QuotedString(str_type);
458 self.push_byte(ch);
459 }
460 b'\\' => {
461 if last_ch == b'\\' {
462 self.push_byte(ch);
463 }
464 }
465 b'0'..=b'9' => {
466 if !str_type.has_digits {
467 str_type.has_digits = true;
468 self.state = State::QuotedString(str_type);
469 }
470 self.push_byte(ch);
471 }
472 b'.' => {
473 if !str_type.has_dots {
474 str_type.has_dots = true;
475 } else {
476 str_type.has_other = true;
477 }
478 self.state = State::QuotedString(str_type);
479 self.push_byte(ch);
480 }
481 _ => {
482 let ch = if last_ch == b'\\' {
483 match ch {
484 b'n' => b'\n',
485 b'r' => b'\r',
486 b't' => b'\t',
487 _ => ch,
488 }
489 } else {
490 ch
491 };
492 if !str_type.has_other && ch != b'-' {
493 str_type.has_other = true;
494 self.state = State::QuotedString(str_type);
495 }
496 self.push_byte(ch);
497 }
498 },
499 State::MultiLine(mut str_type) => match ch {
500 b'.' if last_ch == b'\n' => {
501 let is_eof = match (self.next_byte(), self.peek_byte()) {
502 (Some((b'\r', _)), Some(b'\n')) => {
503 self.next_byte();
504 true
505 }
506 (Some((b'\n', _)), _) => true,
507 (Some((b'.', _)), _) => {
508 self.push_byte(b'.');
509 false
510 }
511 (Some((ch, _)), _) => {
512 self.push_byte(b'.');
513 self.push_byte(ch);
514 false
515 }
516 _ => false,
517 };
518
519 if is_eof {
520 self.new_line();
521 self.state = State::None;
522 return Some(self.get_string(str_type));
523 }
524 }
525 b'\n' => {
526 self.new_line();
527 self.push_byte(b'\n');
528 }
529 b'{' if (last_ch == b'$' || last_ch == b'%') => {
530 str_type.maybe_variable = true;
531 self.state = State::MultiLine(str_type);
532 self.push_byte(ch);
533 }
534 b'0'..=b'9' => {
535 if !str_type.has_digits {
536 str_type.has_digits = true;
537 self.state = State::MultiLine(str_type);
538 }
539 self.push_byte(ch);
540 }
541 b'.' => {
542 if !str_type.has_dots {
543 str_type.has_dots = true;
544 } else {
545 str_type.has_other = true;
546 }
547 self.state = State::MultiLine(str_type);
548 self.push_byte(ch);
549 }
550 _ => {
551 if !str_type.has_other && ch != b'-' {
552 str_type.has_other = true;
553 self.state = State::MultiLine(str_type);
554 }
555 self.push_byte(ch);
556 }
557 },
558 }
559 }
560
561 match self.state {
562 State::BracketComment | State::QuotedString(_) | State::MultiLine(_) => {
563 Some(Err(CompileError {
564 line_num: self.text_line_num,
565 line_pos: self.text_line_pos,
566 error_type: (&self.state).into(),
567 }))
568 }
569 _ => None,
570 }
571 }
572}
573
574impl From<&State> for ErrorType {
575 fn from(state: &State) -> Self {
576 match state {
577 State::BracketComment => ErrorType::UnterminatedComment,
578 State::QuotedString(_) => ErrorType::UnterminatedString,
579 State::MultiLine(_) => ErrorType::UnterminatedMultiline,
580 _ => unreachable!(),
581 }
582 }
583}