// protospec_build/tokenizer.rs

1use crate::result::*;
2use serde::{Deserialize, Serialize};
3use std::fmt;
4
/// A lexical token of the protospec language, as produced by `tokenize`.
#[derive(Clone, Debug, PartialEq)]
pub enum Token {
    /// Identifier that is not a keyword.
    Ident(String),
    /// String literal with escapes already decoded into raw bytes.
    String(Vec<u8>),
    /// Integer literal kept as written (decimal or `0x` hex, prefix included).
    Int(String),
    /// `// ...` line comment body (text after the slashes, newline excluded).
    CommentLine(String),
    /// `/* ... */` block comment body (text between the delimiters).
    CommentBlock(String),
    Type,
    Equal,
    As,
    Import,
    Comma,
    From,
    ImportFfi,
    Transform,
    Function,
    Const,
    /// `..`
    DotDot,
    Dot,
    /// `?:`
    Elvis,
    U8,
    U16,
    U32,
    U64,
    U128,
    I8,
    I16,
    I32,
    I64,
    I128,
    F32,
    F64,
    Bool,
    Lt,
    Gt,
    /// `->`
    Arrow,
    Container,
    LeftSquare,
    RightSquare,
    LeftCurly,
    RightCurly,
    Enum,
    Bitfield,
    LtEq,
    GtEq,
    Eq,
    Ne,
    Question,
    Colon,
    DoubleColon,
    Semicolon,
    Plus,
    Minus,
    Mul,
    Div,
    Mod,
    Not,
    LeftParen,
    RightParen,
    /// `:>`
    Cast,
    Or,
    And,
    BitOr,
    BitXor,
    BitAnd,
    Shr,
    Shl,
    /// `>>>`
    ShrSigned,
    BitNot,
    True,
    False,
}
77
/// Pretty-prints a token back to (approximate) source text.
///
/// Word-like tokens (keywords, built-in type names, most multi-character
/// operators) carry a trailing space so that a concatenated token stream
/// stays lexically separated; most single-character punctuation does not.
/// The exact spacing is load-bearing for the round-trip tests in this file —
/// do not "normalize" it.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use Token::*;
        match self {
            Ident(s) => write!(f, "{}", s),
            // Decoded escape bytes are written raw, not re-escaped.
            String(s) => write!(f, "\"{}\"", std::string::String::from_utf8_lossy(&s[..])), // todo escapes
            Int(s) => write!(f, "{}", s),
            CommentLine(s) => write!(f, "//{}\n", s),
            CommentBlock(s) => write!(f, "/*{}*/ ", s),
            Type => write!(f, "type "),
            Equal => write!(f, "= "),
            As => write!(f, "as "),
            Import => write!(f, "import "),
            Comma => write!(f, ","),
            From => write!(f, "from "),
            ImportFfi => write!(f, "import_ffi "),
            Transform => write!(f, "transform "),
            Function => write!(f, "function "),
            Const => write!(f, "const "),
            DotDot => write!(f, ".. "),
            Dot => write!(f, ". "),
            Elvis => write!(f, "?: "),
            U8 => write!(f, "u8 "),
            U16 => write!(f, "u16 "),
            U32 => write!(f, "u32 "),
            U64 => write!(f, "u64 "),
            U128 => write!(f, "u128 "),
            I8 => write!(f, "i8 "),
            I16 => write!(f, "i16 "),
            I32 => write!(f, "i32 "),
            I64 => write!(f, "i64 "),
            I128 => write!(f, "i128 "),
            F32 => write!(f, "f32 "),
            F64 => write!(f, "f64 "),
            Bool => write!(f, "bool "),
            Lt => write!(f, "< "),
            Gt => write!(f, "> "),
            Arrow => write!(f, "-> "),
            Container => write!(f, "container "),
            LeftSquare => write!(f, "["),
            RightSquare => write!(f, "]"),
            LeftCurly => write!(f, "{{"),
            RightCurly => write!(f, "}}"),
            Enum => write!(f, "enum "),
            Bitfield => write!(f, "bitfield "),
            LtEq => write!(f, "<= "),
            GtEq => write!(f, ">= "),
            Eq => write!(f, "== "),
            Ne => write!(f, "!= "),
            Question => write!(f, "?"),
            Colon => write!(f, ":"),
            DoubleColon => write!(f, "::"),
            Semicolon => write!(f, ";"),
            Plus => write!(f, "+"),
            Minus => write!(f, "-"),
            Mul => write!(f, "*"),
            Div => write!(f, "/ "),
            Mod => write!(f, "%"),
            Not => write!(f, "! "),
            LeftParen => write!(f, "("),
            RightParen => write!(f, ")"),
            Cast => write!(f, ":> "),
            Or => write!(f, "|| "),
            And => write!(f, "&& "),
            BitOr => write!(f, "| "),
            BitXor => write!(f, "^"),
            BitAnd => write!(f, "& "),
            Shr => write!(f, ">> "),
            Shl => write!(f, "<< "),
            ShrSigned => write!(f, ">>> "),
            BitNot => write!(f, "~"),
            True => write!(f, "true "),
            False => write!(f, "false "),
        }
    }
}
154
/// Strips `wanted` from the front of `input`.
///
/// Returns the remainder after `wanted` if `input` begins with it, or `None`
/// otherwise. `starts_with` handles the too-short case, replacing the manual
/// length check and slice comparison of the original.
fn eat<'a>(input: &'a [u8], wanted: &str) -> Option<&'a [u8]> {
    let wanted = wanted.as_bytes();
    if input.starts_with(wanted) {
        Some(&input[wanted.len()..])
    } else {
        None
    }
}
165
/// Splits an ASCII identifier (`[A-Za-z_][A-Za-z0-9_]*`) off the front of
/// `input`.
///
/// Returns `(identifier, rest)`, or `None` if `input` is empty or does not
/// start with an identifier character.
fn eat_identifier(input: &[u8]) -> Option<(&[u8], &[u8])> {
    let &first = input.first()?;
    if !first.is_ascii_alphabetic() && first != b'_' {
        return None;
    }
    // First byte is already validated (and is itself alphanumeric-or-`_`),
    // so scanning from index 0 is safe; digits are allowed after the first.
    let len = input
        .iter()
        .position(|&b| !b.is_ascii_alphanumeric() && b != b'_')
        .unwrap_or(input.len());
    Some(input.split_at(len))
}
182
183impl Token {
184    fn gobble(input: &[u8]) -> (&[u8], Option<Token>) {
185        if input.len() == 0 {
186            return (input, None);
187        }
188        match input[0] {
189            x if x.is_ascii_whitespace() => return (&input[1..], None),
190            b'"' => {
191                let mut i = 1;
192                let mut out = vec![];
193                while i < input.len() {
194                    if input[i] == b'\\' && i < input.len() - 1 {
195                        i += 1;
196                        if input[i].is_ascii_hexdigit() {
197                            if i < input.len() - 1 && input[i + 1].is_ascii_hexdigit() {
198                                i += 1;
199                                out.push(
200                                    u8::from_str_radix(
201                                        std::str::from_utf8(&input[i..i + 2]).unwrap(),
202                                        16,
203                                    )
204                                    .unwrap(),
205                                );
206                            } else {
207                                out.push(
208                                    u8::from_str_radix(
209                                        std::str::from_utf8(&input[i..i + 1]).unwrap(),
210                                        16,
211                                    )
212                                    .unwrap(),
213                                );
214                            }
215                        } else {
216                            out.push(input[i]);
217                        }
218                        i += 1;
219                        if i == input.len() {
220                            return (input, None);
221                        }
222                        continue;
223                    } else if input[i] == b'"' {
224                        break;
225                    }
226                    out.push(input[i]);
227                    i += 1;
228                }
229                if i == input.len() {
230                    return (input, None);
231                }
232                return (&input[(i + 1)..], Some(Token::String(out)));
233            }
234            x if x.is_ascii_digit() => {
235                let mut i = 1;
236                let mut is_hex = false;
237                while i < input.len() {
238                    if i == 1 && input[0] == b'0' && input[i] == b'x' {
239                        is_hex = true;
240                        i += 1;
241                        continue;
242                    }
243                    if is_hex {
244                        if !input[i].is_ascii_hexdigit() {
245                            break;
246                        }
247                    } else {
248                        if !input[i].is_ascii_digit() {
249                            break;
250                        }
251                    }
252
253                    i += 1;
254                }
255                return (
256                    &input[i..],
257                    Some(Token::Int(
258                        String::from_utf8(input[0..i].to_vec()).unwrap_or_default(),
259                    )),
260                );
261            }
262            b'=' => {
263                if let Some(input) = eat(input, "==") {
264                    return (input, Some(Token::Eq));
265                } else {
266                    return (&input[1..], Some(Token::Equal));
267                }
268            }
269            b',' => return (&input[1..], Some(Token::Comma)),
270            b';' => return (&input[1..], Some(Token::Semicolon)),
271            b'?' => {
272                if let Some(input) = eat(input, "?:") {
273                    return (input, Some(Token::Elvis));
274                } else {
275                    return (&input[1..], Some(Token::Question));
276                }
277            }
278            b'[' => return (&input[1..], Some(Token::LeftSquare)),
279            b']' => return (&input[1..], Some(Token::RightSquare)),
280            b'{' => return (&input[1..], Some(Token::LeftCurly)),
281            b'}' => return (&input[1..], Some(Token::RightCurly)),
282            b'(' => return (&input[1..], Some(Token::LeftParen)),
283            b')' => return (&input[1..], Some(Token::RightParen)),
284            b'+' => return (&input[1..], Some(Token::Plus)),
285            b'*' => return (&input[1..], Some(Token::Mul)),
286            b'%' => return (&input[1..], Some(Token::Mod)),
287            b'^' => return (&input[1..], Some(Token::BitXor)),
288            b'~' => return (&input[1..], Some(Token::BitNot)),
289            b'|' => {
290                if let Some(input) = eat(input, "||") {
291                    return (input, Some(Token::Or));
292                } else {
293                    return (&input[1..], Some(Token::BitOr));
294                }
295            }
296            b'/' => {
297                if let Some(input) = eat(input, "//") {
298                    let eol = input.iter().position(|x| *x == b'\n');
299                    let (input, comment) = if let Some(eol) = eol {
300                        (&input[(eol + 1)..], &input[..eol])
301                    } else {
302                        (&input[input.len()..input.len()], &input[..])
303                    };
304                    return (
305                        input,
306                        Some(Token::CommentLine(
307                            String::from_utf8_lossy(comment).to_string(),
308                        )),
309                    );
310                } else if let Some(input) = eat(input, "/*") {
311                    if input.len() == 0 {
312                        return (input, None);
313                    }
314                    let eol = input.windows(2).position(|x| x[0] == b'*' && x[1] == b'/');
315                    let (input, comment) = if let Some(eol) = eol {
316                        (&input[(eol + 2)..], &input[..eol])
317                    } else {
318                        (&input[input.len()..input.len()], &input[..])
319                    };
320                    return (
321                        input,
322                        Some(Token::CommentBlock(
323                            String::from_utf8_lossy(comment).to_string(),
324                        )),
325                    );
326                } else {
327                    return (&input[1..], Some(Token::Div));
328                }
329            }
330            b'&' => {
331                if let Some(input) = eat(input, "&&") {
332                    return (input, Some(Token::And));
333                } else {
334                    return (&input[1..], Some(Token::BitAnd));
335                }
336            }
337            b'.' => {
338                if let Some(input) = eat(input, "..") {
339                    return (input, Some(Token::DotDot));
340                } else {
341                    return (&input[1..], Some(Token::Dot));
342                }
343            }
344            b':' => {
345                if let Some(input) = eat(input, ":>") {
346                    return (input, Some(Token::Cast));
347                } else if let Some(input) = eat(input, "::") {
348                    return (input, Some(Token::DoubleColon));
349                } else {
350                    return (&input[1..], Some(Token::Colon));
351                }
352            }
353            b'<' => {
354                if let Some(input) = eat(input, "<=") {
355                    return (input, Some(Token::LtEq));
356                } else if let Some(input) = eat(input, "<<") {
357                    return (input, Some(Token::Shl));
358                } else {
359                    return (&input[1..], Some(Token::Lt));
360                }
361            }
362            b'>' => {
363                if let Some(input) = eat(input, ">=") {
364                    return (input, Some(Token::GtEq));
365                } else if let Some(input) = eat(input, ">>>") {
366                    return (input, Some(Token::ShrSigned));
367                } else if let Some(input) = eat(input, ">>") {
368                    return (input, Some(Token::Shr));
369                } else {
370                    return (&input[1..], Some(Token::Gt));
371                }
372            }
373            b'-' => {
374                if let Some(input) = eat(input, "->") {
375                    return (input, Some(Token::Arrow));
376                } else {
377                    return (&input[1..], Some(Token::Minus));
378                }
379            }
380            b'!' => {
381                if let Some(input) = eat(input, "!=") {
382                    return (input, Some(Token::Ne));
383                } else {
384                    return (&input[1..], Some(Token::Not));
385                }
386            }
387            _ => (),
388        }
389        if let Some((ident, input)) = eat_identifier(input) {
390            let ident = String::from_utf8_lossy(ident).to_string();
391            return (
392                input,
393                Some(match &*ident {
394                    "type" => Token::Type,
395                    "as" => Token::As,
396                    "import" => Token::Import,
397                    "import_ffi" => Token::ImportFfi,
398                    "i8" => Token::I8,
399                    "i16" => Token::I16,
400                    "i32" => Token::I32,
401                    "i64" => Token::I64,
402                    "i128" => Token::I128,
403                    "u8" => Token::U8,
404                    "u16" => Token::U16,
405                    "u32" => Token::U32,
406                    "u64" => Token::U64,
407                    "u128" => Token::U128,
408                    "transform" => Token::Transform,
409                    "function" => Token::Function,
410                    "const" => Token::Const,
411                    "container" => Token::Container,
412                    "f32" => Token::F32,
413                    "f64" => Token::F64,
414                    "enum" => Token::Enum,
415                    "bitfield" => Token::Bitfield,
416                    "bool" => Token::Bool,
417                    "from" => Token::From,
418                    "true" => Token::True,
419                    "false" => Token::False,
420                    _ => Token::Ident(ident),
421                }),
422            );
423        }
424
425        (input, None)
426    }
427}
428
/// Source range of a token: 1-based line and column bounds.
///
/// `tokenize` computes `col_stop` as `col_start` plus the token's byte
/// length, i.e. it points one past the token's last column.
#[derive(Clone, Debug, Copy, Default, Serialize, Deserialize)]
pub struct Span {
    pub line_start: u64,
    pub line_stop: u64,
    pub col_start: u64,
    pub col_stop: u64,
}
436
/// All spans deliberately compare equal: equality of values that embed a
/// `Span` (tokens, AST nodes) should not depend on source positions.
impl PartialEq for Span {
    fn eq(&self, _other: &Span) -> bool {
        true
    }
}
442
/// Hashes nothing, mirroring `PartialEq` for `Span`: since every span
/// compares equal, every span must also hash identically (`Hash` contract).
impl std::hash::Hash for Span {
    fn hash<H: std::hash::Hasher>(&self, _state: &mut H) {}
}
446
447impl fmt::Display for Span {
448    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
449        if self.line_start == self.line_stop {
450            write!(
451                f,
452                "{}:{}-{}",
453                self.line_start, self.col_start, self.col_stop
454            )
455        } else {
456            write!(
457                f,
458                "{}:{}-{}:{}",
459                self.line_start, self.col_start, self.line_stop, self.col_stop
460            )
461        }
462    }
463}
464
impl std::ops::Add for Span {
    type Output = Self;

    /// Merges two spans into one that covers both.
    ///
    /// NOTE(review): the first branch compares `self.line_start` with
    /// `other.line_stop` (not `other.line_start`). For two single-line spans
    /// on the same line this unions the column ranges as expected, but for a
    /// multi-line `other` ending on `self`'s first line the result collapses
    /// to `self`'s line range, dropping `other`'s earlier lines — confirm
    /// whether that asymmetry is intended.
    fn add(self, other: Self) -> Self {
        if self.line_start == other.line_stop {
            // Treated as "same line": keep self's lines, union the columns.
            Span {
                line_start: self.line_start,
                line_stop: self.line_stop,
                col_start: self.col_start.min(other.col_start),
                col_stop: self.col_stop.max(other.col_stop),
            }
        } else if self.line_start < other.line_start {
            // self begins first: stretch from self's start to other's end.
            Span {
                line_start: self.line_start,
                line_stop: other.line_stop,
                col_start: self.col_start,
                col_stop: other.col_stop,
            }
        } else {
            // other begins first: stretch from other's start to self's end.
            Span {
                line_start: other.line_start,
                line_stop: self.line_stop,
                col_start: other.col_start,
                col_stop: self.col_stop,
            }
        }
    }
}
493
/// A token paired with its source location.
#[derive(Clone)]
pub struct SpannedToken {
    pub token: Token,
    pub span: Span,
}
499
500impl fmt::Display for SpannedToken {
501    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
502        write!(f, "'{}' @ ", self.token.to_string().trim())?;
503        self.span.fmt(f)
504    }
505}
506
507impl fmt::Debug for SpannedToken {
508    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
509        <SpannedToken as fmt::Display>::fmt(self, f)
510    }
511}
512
513pub fn tokenize(input: &str, strip_comments: bool) -> Result<Vec<SpannedToken>> {
514    let mut input = input.as_bytes();
515    let mut tokens = vec![];
516    let mut index = 064;
517    let mut line_no = 1u64;
518    let mut line_start = 0u64;
519    while input.len() > 0 {
520        match Token::gobble(input) {
521            (output, Some(token)) => {
522                let start_line = line_no;
523                match &token {
524                    Token::CommentLine(_) => {
525                        line_no += 1;
526                    },
527                    Token::CommentBlock(s) => {
528                        line_no += s.chars().filter(|x| *x == '\n').count() as u64;
529                    }
530                    _ => (),
531                }
532                tokens.push(SpannedToken {
533                    token,
534                    span: Span {
535                        line_start: start_line,
536                        line_stop: line_no,
537                        col_start: index - line_start + 1,
538                        col_stop: index - line_start + (input.len() - output.len()) as u64 + 1,
539                    },
540                });
541                index += (input.len() - output.len()) as u64;
542                input = output;
543            }
544            (output, None) => {
545                if output.len() == 0 {
546                    break;
547                } else if output.len() == input.len() {
548                    return Err(protospec_err!(
549                        "unexpected token '{}' @ {}",
550                        String::from_utf8_lossy(&[input[0]]),
551                        index
552                    ));
553                }
554                index += (input.len() - output.len()) as u64;
555                if input[0] == b'\n' {
556                    line_no += 1;
557                    line_start = index;
558                }
559                input = output;
560            }
561        }
562    }
563    if strip_comments {
564        Ok(tokens.into_iter().filter(|x| !matches!(x.token, Token::CommentLine(_) | Token::CommentBlock(_))).collect())
565    } else {
566        Ok(tokens)
567    }
568}
569
// Round-trip tests: tokenize, then re-render via `Display` and compare the
// concatenation against an expected string. The expected strings encode the
// exact trailing-space conventions of `Display for Token` — treat them as
// golden output.
#[cfg(test)]
mod tests {
    use super::*;

    // `\"` inside a literal must decode to a raw `"` byte, which Display
    // writes back unescaped.
    #[test]
    fn test_string() {
        let tokens = tokenize(
            r#""test" "test\"test""#,
            false,
        )
        .unwrap();
        let mut output = String::new();
        for SpannedToken { token, .. } in tokens.iter() {
            output += &token.to_string();
        }
        assert_eq!(
            output,
            r#""test""test"test""#
        );

    }

    // Exercises every keyword, operator, and comment form in one input.
    #[test]
    fn test_tokenizer() {
        let tokens = tokenize(
            r#"
        test_ident
        "string"
        "str\"ing"
        "str\\ing"
        12345
        -12345
        type
        as
        import
        import_ffi
        i8
        u8
        transform
        function
        const/*

        test block*/container
        f32
        f64
        enum
        bitfield
        true
        false
        bool
        from
        ,;:?[]{}<>?+-/ *%..<=>= = == != ! () // test$
        :> || && | ^ | >> << >>>~ . ?:
        //"#,
        false,
        )
        .unwrap();
        let mut output = String::new();
        for SpannedToken { token, .. } in tokens.iter() {
            output += &token.to_string();
        }
        assert_eq!(
            output,
            r#"test_ident"string""str"ing""str\ing"12345-12345type as import import_ffi i8 u8 transform function const /*

        test block*/ container f32 f64 enum bitfield true false bool from ,;:?[]{}< > ?+-/ *%.. <= >= = == != ! ()// test$
:> || && | ^| >> << >>> ~. ?: //
"#
        );
    }
}