1use crate::result::*;
2use serde::{Deserialize, Serialize};
3use std::fmt;
4
/// A single lexical token of the specification language.
///
/// Variants that carry data hold the token's text; every other variant
/// stands for exactly one keyword or punctuation sequence, which is shown in
/// its doc comment.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// An identifier that is not one of the reserved words below.
    Ident(String),
    /// The bytes of a double-quoted string literal, with escapes resolved.
    String(Vec<u8>),
    /// An integer literal kept as its source text (decimal or `0x` hex).
    Int(String),
    /// The text of a `//` comment, without the `//` and the line terminator.
    CommentLine(String),
    /// The text of a `/* ... */` comment, without the delimiters.
    CommentBlock(String),
    /// `type`
    Type,
    /// `=`
    Equal,
    /// `as`
    As,
    /// `import`
    Import,
    /// `,`
    Comma,
    /// `from`
    From,
    /// `import_ffi`
    ImportFfi,
    /// `transform`
    Transform,
    /// `function`
    Function,
    /// `const`
    Const,
    /// `..`
    DotDot,
    /// `.`
    Dot,
    /// `?:`
    Elvis,
    /// `u8`
    U8,
    /// `u16`
    U16,
    /// `u32`
    U32,
    /// `u64`
    U64,
    /// `u128`
    U128,
    /// `i8`
    I8,
    /// `i16`
    I16,
    /// `i32`
    I32,
    /// `i64`
    I64,
    /// `i128`
    I128,
    /// `f32`
    F32,
    /// `f64`
    F64,
    /// `bool`
    Bool,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `->`
    Arrow,
    /// `container`
    Container,
    /// `[`
    LeftSquare,
    /// `]`
    RightSquare,
    /// `{`
    LeftCurly,
    /// `}`
    RightCurly,
    /// `enum`
    Enum,
    /// `bitfield`
    Bitfield,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `==`
    Eq,
    /// `!=`
    Ne,
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `::`
    DoubleColon,
    /// `;`
    Semicolon,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// `*`
    Mul,
    /// `/`
    Div,
    /// `%`
    Mod,
    /// `!`
    Not,
    /// `(`
    LeftParen,
    /// `)`
    RightParen,
    /// `:>`
    Cast,
    /// `||`
    Or,
    /// `&&`
    And,
    /// `|`
    BitOr,
    /// `^`
    BitXor,
    /// `&`
    BitAnd,
    /// `>>`
    Shr,
    /// `<<`
    Shl,
    /// `>>>`
    ShrSigned,
    /// `~`
    BitNot,
    /// `true`
    True,
    /// `false`
    False,
}
77
78impl fmt::Display for Token {
79 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
80 use Token::*;
81 match self {
82 Ident(s) => write!(f, "{}", s),
83 String(s) => write!(f, "\"{}\"", std::string::String::from_utf8_lossy(&s[..])), Int(s) => write!(f, "{}", s),
85 CommentLine(s) => write!(f, "//{}\n", s),
86 CommentBlock(s) => write!(f, "/*{}*/ ", s),
87 Type => write!(f, "type "),
88 Equal => write!(f, "= "),
89 As => write!(f, "as "),
90 Import => write!(f, "import "),
91 Comma => write!(f, ","),
92 From => write!(f, "from "),
93 ImportFfi => write!(f, "import_ffi "),
94 Transform => write!(f, "transform "),
95 Function => write!(f, "function "),
96 Const => write!(f, "const "),
97 DotDot => write!(f, ".. "),
98 Dot => write!(f, ". "),
99 Elvis => write!(f, "?: "),
100 U8 => write!(f, "u8 "),
101 U16 => write!(f, "u16 "),
102 U32 => write!(f, "u32 "),
103 U64 => write!(f, "u64 "),
104 U128 => write!(f, "u128 "),
105 I8 => write!(f, "i8 "),
106 I16 => write!(f, "i16 "),
107 I32 => write!(f, "i32 "),
108 I64 => write!(f, "i64 "),
109 I128 => write!(f, "i128 "),
110 F32 => write!(f, "f32 "),
111 F64 => write!(f, "f64 "),
112 Bool => write!(f, "bool "),
113 Lt => write!(f, "< "),
114 Gt => write!(f, "> "),
115 Arrow => write!(f, "-> "),
116 Container => write!(f, "container "),
117 LeftSquare => write!(f, "["),
118 RightSquare => write!(f, "]"),
119 LeftCurly => write!(f, "{{"),
120 RightCurly => write!(f, "}}"),
121 Enum => write!(f, "enum "),
122 Bitfield => write!(f, "bitfield "),
123 LtEq => write!(f, "<= "),
124 GtEq => write!(f, ">= "),
125 Eq => write!(f, "== "),
126 Ne => write!(f, "!= "),
127 Question => write!(f, "?"),
128 Colon => write!(f, ":"),
129 DoubleColon => write!(f, "::"),
130 Semicolon => write!(f, ";"),
131 Plus => write!(f, "+"),
132 Minus => write!(f, "-"),
133 Mul => write!(f, "*"),
134 Div => write!(f, "/ "),
135 Mod => write!(f, "%"),
136 Not => write!(f, "! "),
137 LeftParen => write!(f, "("),
138 RightParen => write!(f, ")"),
139 Cast => write!(f, ":> "),
140 Or => write!(f, "|| "),
141 And => write!(f, "&& "),
142 BitOr => write!(f, "| "),
143 BitXor => write!(f, "^"),
144 BitAnd => write!(f, "& "),
145 Shr => write!(f, ">> "),
146 Shl => write!(f, "<< "),
147 ShrSigned => write!(f, ">>> "),
148 BitNot => write!(f, "~"),
149 True => write!(f, "true "),
150 False => write!(f, "false "),
151 }
152 }
153}
154
/// Strips the literal `wanted` from the front of `input`.
///
/// Returns the bytes following the match, or `None` when `input` does not
/// start with `wanted` (including when `input` is shorter than `wanted`).
/// An empty `wanted` always matches and returns `input` unchanged.
fn eat<'a>(input: &'a [u8], wanted: &str) -> Option<&'a [u8]> {
    input.strip_prefix(wanted.as_bytes())
}
165
/// Splits a leading identifier off `input`.
///
/// An identifier starts with an ASCII letter or `_` and continues with ASCII
/// letters, digits or `_`. Returns `(identifier, remainder)`, or `None` when
/// `input` is empty or does not begin with an identifier start byte.
fn eat_identifier(input: &[u8]) -> Option<(&[u8], &[u8])> {
    let first = *input.first()?;
    if !(first.is_ascii_alphabetic() || first == b'_') {
        return None;
    }
    // The first byte already satisfies the continuation rule, so scanning
    // from offset 0 always yields a length of at least one.
    let len = input
        .iter()
        .position(|byte| !(byte.is_ascii_alphanumeric() || *byte == b'_'))
        .unwrap_or(input.len());
    Some(input.split_at(len))
}
182
183impl Token {
184 fn gobble(input: &[u8]) -> (&[u8], Option<Token>) {
185 if input.len() == 0 {
186 return (input, None);
187 }
188 match input[0] {
189 x if x.is_ascii_whitespace() => return (&input[1..], None),
190 b'"' => {
191 let mut i = 1;
192 let mut out = vec![];
193 while i < input.len() {
194 if input[i] == b'\\' && i < input.len() - 1 {
195 i += 1;
196 if input[i].is_ascii_hexdigit() {
197 if i < input.len() - 1 && input[i + 1].is_ascii_hexdigit() {
198 i += 1;
199 out.push(
200 u8::from_str_radix(
201 std::str::from_utf8(&input[i..i + 2]).unwrap(),
202 16,
203 )
204 .unwrap(),
205 );
206 } else {
207 out.push(
208 u8::from_str_radix(
209 std::str::from_utf8(&input[i..i + 1]).unwrap(),
210 16,
211 )
212 .unwrap(),
213 );
214 }
215 } else {
216 out.push(input[i]);
217 }
218 i += 1;
219 if i == input.len() {
220 return (input, None);
221 }
222 continue;
223 } else if input[i] == b'"' {
224 break;
225 }
226 out.push(input[i]);
227 i += 1;
228 }
229 if i == input.len() {
230 return (input, None);
231 }
232 return (&input[(i + 1)..], Some(Token::String(out)));
233 }
234 x if x.is_ascii_digit() => {
235 let mut i = 1;
236 let mut is_hex = false;
237 while i < input.len() {
238 if i == 1 && input[0] == b'0' && input[i] == b'x' {
239 is_hex = true;
240 i += 1;
241 continue;
242 }
243 if is_hex {
244 if !input[i].is_ascii_hexdigit() {
245 break;
246 }
247 } else {
248 if !input[i].is_ascii_digit() {
249 break;
250 }
251 }
252
253 i += 1;
254 }
255 return (
256 &input[i..],
257 Some(Token::Int(
258 String::from_utf8(input[0..i].to_vec()).unwrap_or_default(),
259 )),
260 );
261 }
262 b'=' => {
263 if let Some(input) = eat(input, "==") {
264 return (input, Some(Token::Eq));
265 } else {
266 return (&input[1..], Some(Token::Equal));
267 }
268 }
269 b',' => return (&input[1..], Some(Token::Comma)),
270 b';' => return (&input[1..], Some(Token::Semicolon)),
271 b'?' => {
272 if let Some(input) = eat(input, "?:") {
273 return (input, Some(Token::Elvis));
274 } else {
275 return (&input[1..], Some(Token::Question));
276 }
277 }
278 b'[' => return (&input[1..], Some(Token::LeftSquare)),
279 b']' => return (&input[1..], Some(Token::RightSquare)),
280 b'{' => return (&input[1..], Some(Token::LeftCurly)),
281 b'}' => return (&input[1..], Some(Token::RightCurly)),
282 b'(' => return (&input[1..], Some(Token::LeftParen)),
283 b')' => return (&input[1..], Some(Token::RightParen)),
284 b'+' => return (&input[1..], Some(Token::Plus)),
285 b'*' => return (&input[1..], Some(Token::Mul)),
286 b'%' => return (&input[1..], Some(Token::Mod)),
287 b'^' => return (&input[1..], Some(Token::BitXor)),
288 b'~' => return (&input[1..], Some(Token::BitNot)),
289 b'|' => {
290 if let Some(input) = eat(input, "||") {
291 return (input, Some(Token::Or));
292 } else {
293 return (&input[1..], Some(Token::BitOr));
294 }
295 }
296 b'/' => {
297 if let Some(input) = eat(input, "//") {
298 let eol = input.iter().position(|x| *x == b'\n');
299 let (input, comment) = if let Some(eol) = eol {
300 (&input[(eol + 1)..], &input[..eol])
301 } else {
302 (&input[input.len()..input.len()], &input[..])
303 };
304 return (
305 input,
306 Some(Token::CommentLine(
307 String::from_utf8_lossy(comment).to_string(),
308 )),
309 );
310 } else if let Some(input) = eat(input, "/*") {
311 if input.len() == 0 {
312 return (input, None);
313 }
314 let eol = input.windows(2).position(|x| x[0] == b'*' && x[1] == b'/');
315 let (input, comment) = if let Some(eol) = eol {
316 (&input[(eol + 2)..], &input[..eol])
317 } else {
318 (&input[input.len()..input.len()], &input[..])
319 };
320 return (
321 input,
322 Some(Token::CommentBlock(
323 String::from_utf8_lossy(comment).to_string(),
324 )),
325 );
326 } else {
327 return (&input[1..], Some(Token::Div));
328 }
329 }
330 b'&' => {
331 if let Some(input) = eat(input, "&&") {
332 return (input, Some(Token::And));
333 } else {
334 return (&input[1..], Some(Token::BitAnd));
335 }
336 }
337 b'.' => {
338 if let Some(input) = eat(input, "..") {
339 return (input, Some(Token::DotDot));
340 } else {
341 return (&input[1..], Some(Token::Dot));
342 }
343 }
344 b':' => {
345 if let Some(input) = eat(input, ":>") {
346 return (input, Some(Token::Cast));
347 } else if let Some(input) = eat(input, "::") {
348 return (input, Some(Token::DoubleColon));
349 } else {
350 return (&input[1..], Some(Token::Colon));
351 }
352 }
353 b'<' => {
354 if let Some(input) = eat(input, "<=") {
355 return (input, Some(Token::LtEq));
356 } else if let Some(input) = eat(input, "<<") {
357 return (input, Some(Token::Shl));
358 } else {
359 return (&input[1..], Some(Token::Lt));
360 }
361 }
362 b'>' => {
363 if let Some(input) = eat(input, ">=") {
364 return (input, Some(Token::GtEq));
365 } else if let Some(input) = eat(input, ">>>") {
366 return (input, Some(Token::ShrSigned));
367 } else if let Some(input) = eat(input, ">>") {
368 return (input, Some(Token::Shr));
369 } else {
370 return (&input[1..], Some(Token::Gt));
371 }
372 }
373 b'-' => {
374 if let Some(input) = eat(input, "->") {
375 return (input, Some(Token::Arrow));
376 } else {
377 return (&input[1..], Some(Token::Minus));
378 }
379 }
380 b'!' => {
381 if let Some(input) = eat(input, "!=") {
382 return (input, Some(Token::Ne));
383 } else {
384 return (&input[1..], Some(Token::Not));
385 }
386 }
387 _ => (),
388 }
389 if let Some((ident, input)) = eat_identifier(input) {
390 let ident = String::from_utf8_lossy(ident).to_string();
391 return (
392 input,
393 Some(match &*ident {
394 "type" => Token::Type,
395 "as" => Token::As,
396 "import" => Token::Import,
397 "import_ffi" => Token::ImportFfi,
398 "i8" => Token::I8,
399 "i16" => Token::I16,
400 "i32" => Token::I32,
401 "i64" => Token::I64,
402 "i128" => Token::I128,
403 "u8" => Token::U8,
404 "u16" => Token::U16,
405 "u32" => Token::U32,
406 "u64" => Token::U64,
407 "u128" => Token::U128,
408 "transform" => Token::Transform,
409 "function" => Token::Function,
410 "const" => Token::Const,
411 "container" => Token::Container,
412 "f32" => Token::F32,
413 "f64" => Token::F64,
414 "enum" => Token::Enum,
415 "bitfield" => Token::Bitfield,
416 "bool" => Token::Bool,
417 "from" => Token::From,
418 "true" => Token::True,
419 "false" => Token::False,
420 _ => Token::Ident(ident),
421 }),
422 );
423 }
424
425 (input, None)
426 }
427}
428
429#[derive(Clone, Debug, Copy, Default, Serialize, Deserialize)]
430pub struct Span {
431 pub line_start: u64,
432 pub line_stop: u64,
433 pub col_start: u64,
434 pub col_stop: u64,
435}
436
437impl PartialEq for Span {
438 fn eq(&self, _other: &Span) -> bool {
439 true
440 }
441}
442
443impl std::hash::Hash for Span {
444 fn hash<H: std::hash::Hasher>(&self, _state: &mut H) {}
445}
446
447impl fmt::Display for Span {
448 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
449 if self.line_start == self.line_stop {
450 write!(
451 f,
452 "{}:{}-{}",
453 self.line_start, self.col_start, self.col_stop
454 )
455 } else {
456 write!(
457 f,
458 "{}:{}-{}:{}",
459 self.line_start, self.col_start, self.line_stop, self.col_stop
460 )
461 }
462 }
463}
464
465impl std::ops::Add for Span {
466 type Output = Self;
467
468 fn add(self, other: Self) -> Self {
469 if self.line_start == other.line_stop {
470 Span {
471 line_start: self.line_start,
472 line_stop: self.line_stop,
473 col_start: self.col_start.min(other.col_start),
474 col_stop: self.col_stop.max(other.col_stop),
475 }
476 } else if self.line_start < other.line_start {
477 Span {
478 line_start: self.line_start,
479 line_stop: other.line_stop,
480 col_start: self.col_start,
481 col_stop: other.col_stop,
482 }
483 } else {
484 Span {
485 line_start: other.line_start,
486 line_stop: self.line_stop,
487 col_start: other.col_start,
488 col_stop: self.col_stop,
489 }
490 }
491 }
492}
493
494#[derive(Clone)]
495pub struct SpannedToken {
496 pub token: Token,
497 pub span: Span,
498}
499
500impl fmt::Display for SpannedToken {
501 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
502 write!(f, "'{}' @ ", self.token.to_string().trim())?;
503 self.span.fmt(f)
504 }
505}
506
507impl fmt::Debug for SpannedToken {
508 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
509 <SpannedToken as fmt::Display>::fmt(self, f)
510 }
511}
512
513pub fn tokenize(input: &str, strip_comments: bool) -> Result<Vec<SpannedToken>> {
514 let mut input = input.as_bytes();
515 let mut tokens = vec![];
516 let mut index = 064;
517 let mut line_no = 1u64;
518 let mut line_start = 0u64;
519 while input.len() > 0 {
520 match Token::gobble(input) {
521 (output, Some(token)) => {
522 let start_line = line_no;
523 match &token {
524 Token::CommentLine(_) => {
525 line_no += 1;
526 },
527 Token::CommentBlock(s) => {
528 line_no += s.chars().filter(|x| *x == '\n').count() as u64;
529 }
530 _ => (),
531 }
532 tokens.push(SpannedToken {
533 token,
534 span: Span {
535 line_start: start_line,
536 line_stop: line_no,
537 col_start: index - line_start + 1,
538 col_stop: index - line_start + (input.len() - output.len()) as u64 + 1,
539 },
540 });
541 index += (input.len() - output.len()) as u64;
542 input = output;
543 }
544 (output, None) => {
545 if output.len() == 0 {
546 break;
547 } else if output.len() == input.len() {
548 return Err(protospec_err!(
549 "unexpected token '{}' @ {}",
550 String::from_utf8_lossy(&[input[0]]),
551 index
552 ));
553 }
554 index += (input.len() - output.len()) as u64;
555 if input[0] == b'\n' {
556 line_no += 1;
557 line_start = index;
558 }
559 input = output;
560 }
561 }
562 }
563 if strip_comments {
564 Ok(tokens.into_iter().filter(|x| !matches!(x.token, Token::CommentLine(_) | Token::CommentBlock(_))).collect())
565 } else {
566 Ok(tokens)
567 }
568}
569
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `source` (keeping comments) and concatenates the `Display`
    /// text of every token.
    fn render(source: &str) -> String {
        tokenize(source, false)
            .unwrap()
            .iter()
            .map(|spanned| spanned.token.to_string())
            .collect()
    }

    #[test]
    fn test_string() {
        assert_eq!(
            render(r#""test" "test\"test""#),
            r#""test""test"test""#
        );
    }

    #[test]
    fn test_tokenizer() {
        // The block comment below spans three lines; its interior whitespace
        // is preserved verbatim and must match the expected text exactly.
        let output = render(
            r#"
        test_ident
        "string"
        "str\"ing"
        "str\\ing"
        12345
        -12345
        type
        as
        import
        import_ffi
        i8
        u8
        transform
        function
        const/*

        test block*/container
        f32
        f64
        enum
        bitfield
        true
        false
        bool
        from
        ,;:?[]{}<>?+-/ *%..<=>= = == != ! () // test$
        :> || && | ^ | >> << >>>~ . ?:
        //"#,
        );
        assert_eq!(
            output,
            r#"test_ident"string""str"ing""str\ing"12345-12345type as import import_ffi i8 u8 transform function const /*

        test block*/ container f32 f64 enum bitfield true false bool from ,;:?[]{}< > ?+-/ *%.. <= >= = == != ! ()// test$
:> || && | ^| >> << >>> ~. ?: //
"#
        );
    }
}