// moore_svlog_syntax/cat.rs

// Copyright (c) 2016-2021 Fabian Schuiki

//! The categorizing lexer. Tokenizes an input stream of characters, yielding a
//! stream of newline, whitespace, comment, symbol, digit, and text tokens.
//!
//! # Example
//! ```
//! extern crate moore_svlog_syntax;
//! let input = "Löwe 老虎 Léopard\n";
//! let mut cat = moore_svlog_syntax::cat::Cat::new(Box::new(input.char_indices()));
//! let tokens: Vec<_> = cat.collect();
//! ```

pub use self::CatTokenKind::*;
use moore_common::source::*;

17/// The categorizing lexer. Divides an input stream of characters (unicode) into
18/// coarse groups of tokens. These include whitespace, comments, symbols, and
19/// text. The strings contained in the emitted tokens can be concatenated to
20/// arrive at the original file, i.e. no information is lost.
21pub struct Cat<'a> {
22    iter: Box<CharIter<'a>>,
23    last: usize,
24    chars: (Option<char>, Option<char>),
25    indices: (usize, usize),
26}
27
28impl<'a> Cat<'a> {
29    /// Create a new categorizing lexer from an `CharIter` iterator.
30    pub fn new(mut iter: Box<CharIter<'a>>) -> Cat<'a> {
31        let last = iter
32            .size_hint()
33            .1
34            .expect("Iterator must provide upper bounds");
35        let c0 = iter.next();
36        let c1 = iter.next();
37        Cat {
38            iter: iter,
39            last: last,
40            chars: (c0.map(|x| x.1), c1.map(|x| x.1)),
41            indices: (
42                c0.map(|x| x.0).unwrap_or(last),
43                c1.map(|x| x.0).unwrap_or(last),
44            ),
45        }
46    }
47
48    /// Advance to the next character in the input stream.
49    fn bump(&mut self) {
50        let c = self.iter.next();
51        self.chars = (self.chars.1, c.map(|x| x.1));
52        self.indices = (self.indices.1, c.map(|x| x.0).unwrap_or(self.last));
53    }
54}
55
56impl<'a> Iterator for Cat<'a> {
57    type Item = CatToken;
58
59    fn next(&mut self) -> Option<Self::Item> {
60        match self.chars {
61            (None, _) => None,
62
63            // Newlines
64            (Some('\n'), _) => {
65                let t = CatToken(Newline, self.indices.0, self.indices.1);
66                self.bump();
67                Some(t)
68            }
69
70            // Whitespace characters
71            (Some(c), _) if is_whitespace(c) => {
72                let p0 = self.indices.0;
73                while let (Some(c), _) = self.chars {
74                    if !is_whitespace(c) {
75                        break;
76                    }
77                    self.bump();
78                }
79                Some(CatToken(Whitespace, p0, self.indices.0))
80            }
81
82            // IEEE 1800-2009 5.4 Comments
83            // Consume single-line comments initiated by "//".
84            (Some('/'), Some('/')) => {
85                let p0 = self.indices.0;
86                while let (Some(c), _) = self.chars {
87                    if c == '\n' {
88                        break;
89                    }
90                    self.bump();
91                }
92                Some(CatToken(Comment, p0, self.indices.0))
93            }
94
95            // Consume multi-line comments inititated by "/*".
96            (Some('/'), Some('*')) => {
97                let p0 = self.indices.0;
98                while let (Some(c0), Some(c1)) = self.chars {
99                    if c0 == '*' && c1 == '/' {
100                        self.bump();
101                        self.bump();
102                        break;
103                    }
104                    self.bump();
105                }
106                Some(CatToken(Comment, p0, self.indices.0))
107            }
108
109            // Consume symbols.
110            // IEEE 1800-2009 5.5 Operators & 11.3 Operators
111            (Some(c), _) if is_symbol(c) => {
112                let t = CatToken(Symbol(c), self.indices.0, self.indices.1);
113                self.bump();
114                Some(t)
115            }
116
117            // Consume digits.
118            (Some(c), _) if is_digit(c) => {
119                let p0 = self.indices.0;
120                while let (Some(c), _) = self.chars {
121                    if !is_digit(c) {
122                        break;
123                    }
124                    self.bump();
125                }
126                Some(CatToken(Digits, p0, self.indices.0))
127            }
128
129            // Consume text.
130            (Some(_), _) => {
131                let p0 = self.indices.0;
132                while let (Some(c), _) = self.chars {
133                    if c == '\n' || is_whitespace(c) || is_symbol(c) {
134                        break;
135                    }
136                    self.bump();
137                }
138                Some(CatToken(Text, p0, self.indices.0))
139            }
140        }
141    }
142}
143
/// Check whether the given character is considered a whitespace in
/// SystemVerilog.
///
/// Accepts space, horizontal tab, carriage return, form feed, and the
/// no-break space U+00A0. The newline character is deliberately excluded:
/// the lexer reports it as a separate token kind.
fn is_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\r' | '\x0C' | '\u{A0}')
}

/// Check whether the given character is an ASCII decimal digit (`0` to `9`).
/// Digits from other scripts are not accepted.
fn is_digit(c: char) -> bool {
    c.is_ascii_digit()
}

/// Check whether the given character is considered a symbol in SystemVerilog.
///
/// The underscore is intentionally not a symbol, so it stays part of the
/// surrounding text.
fn is_symbol(c: char) -> bool {
    matches!(
        c,
        '(' | ')'
            | '['
            | ']'
            | '{'
            | '}'
            | '#'
            | ':'
            | ';'
            | '.'
            | ','
            | '='
            | '+'
            | '-'
            | '*'
            | '/'
            | '~'
            | '|'
            | '<'
            | '>'
            | '!'
            | '%'
            | '^'
            | '&'
            | '?'
            | '\''
            | '"'
            | '`'
            | '$'
            | '\\'
            | '@'
    )
}

166/// A token emitted by the categorizing lexer.
167#[derive(Clone, Copy, PartialEq, Eq, Debug)]
168pub struct CatToken(pub CatTokenKind, pub usize, pub usize);
169
/// The different kinds of tokens the categorizing lexer can emit.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum CatTokenKind {
    /// A single `'\n'` character.
    Newline,
    /// A run of whitespace characters other than newline.
    Whitespace,
    /// A `//` line comment (without its terminating newline) or a `/* */`
    /// block comment.
    Comment,
    /// A single symbol character, carried as the payload.
    Symbol(char),
    /// A run of characters that are neither newline, whitespace, nor symbols.
    Text,
    /// A run of ASCII decimal digits.
    Digits,
    /// End of input. The lexer in this file never yields this kind (its
    /// iterator returns `None` instead); presumably it exists for downstream
    /// consumers -- TODO confirm.
    Eof,
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Run the categorizing lexer over `input` and collect every token.
    fn lex(input: &str) -> Vec<CatToken> {
        Cat::new(Box::new(input.char_indices())).collect()
    }

    #[test]
    fn empty() {
        assert!(lex("").is_empty());
    }

    /// Offsets are byte offsets, so multi-byte characters widen the ranges.
    #[test]
    fn non_empty() {
        assert_eq!(
            lex("Löwe 老虎 1234Léopard\n"),
            vec![
                CatToken(Text, 0, 5),
                CatToken(Whitespace, 5, 6),
                CatToken(Text, 6, 12),
                CatToken(Whitespace, 12, 13),
                CatToken(Digits, 13, 17),
                CatToken(Text, 17, 25),
                CatToken(Newline, 25, 26),
            ]
        );
    }

    /// Symbols are one token each; a line comment stops before the newline.
    #[test]
    fn symbols_and_line_comment() {
        assert_eq!(
            lex("a+b // c\n"),
            vec![
                CatToken(Text, 0, 1),
                CatToken(Symbol('+'), 1, 2),
                CatToken(Text, 2, 3),
                CatToken(Whitespace, 3, 4),
                CatToken(Comment, 4, 8),
                CatToken(Newline, 8, 9),
            ]
        );
    }

    /// A terminated block comment may span lines and is a single token.
    #[test]
    fn block_comment() {
        assert_eq!(
            lex("/* x\ny */z"),
            vec![CatToken(Comment, 0, 9), CatToken(Text, 9, 10)]
        );
    }
}