cfront_definition_lexer/
lib.rs

1use cfront_definition::{token::{Token, TokenType}, Keyword};
2use cfront_definition_keyword::automaton;
3
4pub use cfront_definition_keyword::keyword as keyword; 
5pub use cfront_definition::token as token; 
6
7pub fn analyze(input: &str) -> Vec<Token<'_>> {
8    let char_indices: Vec<_> = input.char_indices().collect(); 
9    let char_indices = &char_indices[..]; 
10    let mut idx = 0; 
11    let (mut last_line, mut last_column) = (0, 0); 
12    let (mut line, mut column) = (0, 0); 
13    let mut just_ignore: usize = 0; 
14    enum CommentState {
15        None, 
16        Line, 
17        Block,
18    }
19    let mut comment_state = CommentState::None; 
20    enum QuotingState {
21        None, 
22        Single(usize),
23        Double(usize),
24    }
25    let mut quoting_state = QuotingState::None; 
26    let mut lidx: Option<usize> = None; 
27    enum NormalState<'a> {
28        Identifier, 
29        Number(Option<&'a str>), 
30    }
31    let mut normal_state = NormalState::Identifier; 
32    let mut ans = Vec::new(); 
33    while let Some((i, c)) = char_indices.get(idx) { 
34        'scope : {
35            match just_ignore {
36                ref mut x if *x > 0 => {
37                    *x -= 1; 
38                    break 'scope; 
39                } 
40                _ => (), 
41            }
42            match quoting_state {
43                QuotingState::None => {
44                    if *c == '\'' {
45                        quoting_state = QuotingState::Single(*i); 
46                        break 'scope; 
47                    } else if *c == '"' {
48                        quoting_state = QuotingState::Double(*i); 
49                        break 'scope; 
50                    } 
51                }
52                QuotingState::Single(l) => {
53                    if *c == '\'' {
54                        ans.push(Token { token_type: TokenType::CharLiteral(&input[l+1..*i], true), line, column, }); 
55                        quoting_state = QuotingState::None; 
56                    } else if *c == '\n' {
57                        ans.push(Token { token_type: TokenType::CharLiteral(&input[l+1..*i], false), line, column, }); 
58                        quoting_state = QuotingState::None; 
59                    }
60                    break 'scope; 
61                }
62                QuotingState::Double(l) => {
63                    if *c == '"' {
64                        ans.push(Token { token_type: TokenType::StringLiteral(&input[l+1..*i], true), line, column, }); 
65                        quoting_state = QuotingState::None; 
66                    } else if *c == '\n' {
67                        ans.push(Token { token_type: TokenType::StringLiteral(&input[l+1..*i], false), line, column, }); 
68                        quoting_state = QuotingState::None; 
69                    } 
70                    break 'scope; 
71                }
72            }
73            match comment_state {
74                CommentState::None => {
75                    if *c == '/' {
76                        if let Some((_, c2)) = char_indices.get(idx + 1) {
77                            if *c2 == '/' {
78                                comment_state = CommentState::Line; 
79                                just_ignore = 1; 
80                                break 'scope; 
81                            } else if *c2 == '*' {
82                                comment_state = CommentState::Block; 
83                                just_ignore = 1; 
84                                break 'scope; 
85                            }
86                        } 
87                    }
88                }
89                CommentState::Line => {
90                    if *c == '\n' {
91                        comment_state = CommentState::None; 
92                    }
93                    break 'scope;  
94                }
95                CommentState::Block => {
96                    if *c == '*' {
97                        if let Some((_, c2)) = char_indices.get(idx + 1) {
98                            if *c2 == '/' {
99                                comment_state = CommentState::None; 
100                                just_ignore = 1; 
101                                break 'scope; 
102                            }
103                        } 
104                    } 
105                    break 'scope;
106                }
107            } 
108            let mut punt = false; 
109            match lidx {
110                Some(l) => {
111                    let mut should_put = false; 
112                    if c.is_ascii_whitespace() {
113                        should_put = true; 
114                    }
115                    if c.is_ascii_punctuation() {
116                        let mut i = true; 
117                        if *c == '.' { 
118                            if let NormalState::Number(_) = normal_state {
119                                i = false; 
120                            } 
121                        }
122                        if *c == '$' || *c == '@' || *c == '_' { 
123                            i = false; 
124                        }
125                        if i {
126                            should_put = true; 
127                            punt = true; 
128                        }
129                    } 
130                    if should_put {
131                        match normal_state {
132                            NormalState::Identifier => {
133                                let s = &input[l..*i]; 
134                                if let Some(k) = try_into_keyword(s) {
135                                    ans.push(Token { token_type: TokenType::Keyword(k), line: last_line, column: last_column, }); 
136                                } else {
137                                    ans.push(Token { token_type: TokenType::Identifier(s), line: last_line, column: last_column, }); 
138                                } 
139                            }
140                            NormalState::Number(prefix) => {
141                                let s = &input[l..*i]; 
142                                ans.push(Token { token_type: TokenType::NumberLiteral(s, prefix), line: last_line, column: last_column }); 
143                            }
144                        }
145                        lidx = None;
146                    }
147                },
148                None => {
149                    if c.is_whitespace() {
150                        break 'scope;  
151                    }
152                    if c.is_ascii_punctuation() {
153                        punt = true; 
154                        if *c == '$' || *c == '@' || *c == '_' { 
155                            punt = false;
156                        }
157                    } 
158                    if !punt {
159                        lidx = Some(*i); 
160                        if c.is_digit(10) {
161                            normal_state = NormalState::Number(None); 
162                            if *c == '0' {
163                                let p = char_indices.get(idx + 1); 
164                                match p {
165                                    Some((_, 'x')) | Some((_, 'X')) | Some((_, 'b')) | Some((_, 'B')) => {
166                                        normal_state = NormalState::Number(Some(&input[*i..i+2])); 
167                                        just_ignore = 1; 
168                                        break 'scope; 
169                                    }
170                                    _ => {
171                                        normal_state = NormalState::Number(Some(&input[*i..i+1])); 
172                                    }
173                                } 
174                            }
175                        } else {
176                            normal_state = NormalState::Identifier; 
177                        }
178                    }
179                },
180            }
181            if punt {
182                use TokenType::*; 
183                match c {
184                    '(' => {
185                        ans.push(Token { token_type: Parenthesis { is_left: true }, line, column, }); 
186                    }
187                    ')' => {
188                        ans.push(Token { token_type: Parenthesis { is_left: false }, line, column, }); 
189                    } 
190                    '[' => {
191                        ans.push(Token { token_type: Bracket { is_left: true }, line, column, }); 
192                    } 
193                    ']' => {
194                        ans.push(Token { token_type: Bracket { is_left: false }, line, column, }); 
195                    } 
196                    '{' => {
197                        ans.push(Token { token_type: Brace { is_left: true }, line, column, }); 
198                    } 
199                    '}' => {
200                        ans.push(Token { token_type: Brace { is_left: false }, line, column, }); 
201                    } 
202                    | '.' | ',' | ';' | '~' | ':' => {
203                        ans.push(Token { token_type: Operator(&input[*i..i+1]), line, column, });  
204                    } 
205                    // every puntc here can be followed by '=' 
206                    | '<' | '=' | '>' | '+' | '-' | '*' | '/' | '%' | '&' | '^' | '|' | '!' => {
207                        if *c == '<' {
208                            let p = (char_indices.get(idx + 1), char_indices.get(idx + 2)); 
209                            match p {
210                                (Some((_, '<')), Some((_, '='))) => {
211                                    ans.push(Token { token_type: Operator(&input[*i..i+3]), line, column: column + 2 });
212                                    just_ignore = 2; 
213                                    break 'scope; 
214                                }
215                                _ => (), 
216                            }
217                        }        
218                        if *c == '>' {
219                            let p = (char_indices.get(idx + 1), char_indices.get(idx + 2)); 
220                            match p {
221                                (Some((_, '>')), Some((_, '='))) => {
222                                    ans.push(Token { token_type: Operator(&input[*i..i+3]), line, column: column + 2 });
223                                    just_ignore = 2; 
224                                    break 'scope; 
225                                }
226                                _ => (), 
227                            } 
228                        }
229                        let p = char_indices.get(idx + 1);
230                        match p {
231                            Some((_, '=')) => {
232                                ans.push(Token { token_type: Operator(&input[*i..i+2]), line, column: column + 1 });
233                                just_ignore = 1; 
234                                break 'scope;  
235                            }
236                            _ => (), 
237                        }
238                        if *c == '+' || *c == '-' || *c == '&' || *c == '|' {
239                            match p {
240                                Some((_, b)) if *b == *c => {
241                                    ans.push(Token { token_type: Operator(&input[*i..i+2]), line, column: column + 1});
242                                    just_ignore = 1; 
243                                    break 'scope;  
244                                }
245                                _ => (),  
246                            }
247                        }
248                        if *c == '-' {
249                            match p {
250                                Some((_, '>')) => {
251                                    ans.push(Token { token_type: Operator(&input[*i..i+2]), line, column: column + 1});
252                                    just_ignore = 1; 
253                                    break 'scope;  
254                                }
255                                _ => (),  
256                            }
257                        } 
258                        ans.push(Token { token_type: Operator(&input[*i..i+1]), line, column }); 
259                    }
260                    _ => {
261                        ans.push(Token { token_type: Operator(&input[*i..i+1]), line, column }); 
262                    }
263                }
264            }
265        }
266        (last_line, last_column) = (line, column); 
267        if *c == '\n' {
268            line += 1; 
269            column = 0; 
270        } else {
271            column += 1; 
272        } 
273        idx += 1; 
274    }
275    return ans; 
276}
277
278#[deprecated]
279pub fn try_into_keyword_directly(input: &str) -> Option<Keyword> {
280    use Keyword::*; 
281    let ans = match input {
282        "alignas" => AlignAs, 
283        "alignof" => AlignOf, 
284        "auto" => Auto, 
285        "bool" => Bool, 
286        "break" => Break, 
287        "case" => Case, 
288        "char" => Char, 
289        "const" => Const, 
290        "constexpr" => Constexpr, 
291        "continue" => Continue, 
292        "default" => Default, 
293        "do" => Do, 
294        "double" => Double, 
295        "else" => Else, 
296        "enum" => Enum, 
297        "extern" => Extern, 
298        "false" => False, 
299        "float" => Float, 
300        "for" => For, 
301        "goto" => Goto, 
302        "if" => If, 
303        "inline" => Inline, 
304        "int" => Int, 
305        "long" => Long, 
306        "nullptr" => Nullptr, 
307        "register" => Register, 
308        "restrict" => Restrict, 
309        "return" => Return, 
310        "short" => Short, 
311        "signed" => Signed, 
312        "sizeof" => Sizeof, 
313        "static" => Static, 
314        "static_assert" => StaticAssert, 
315        "struct" => Struct, 
316        "switch" => Switch, 
317        "thread_local" => ThreadLocal, 
318        "true" => True, 
319        "typedef" => Typedef, 
320        "typeof" => TypeOf, 
321        "typeof_unqual" => TypeOfUnqual, 
322        "union" => Union, 
323        "unsigned" => Unsigned, 
324        "void" => Void, 
325        "volatile" => Volatile, 
326        "while" => While, 
327        "_Alignas" => _AlignAs, 
328        "_Alignof" => _AlignOf, 
329        "_Atomic" => _Atomic, 
330        "_Bool" => _Bool, 
331        "_Complex" => _Complex, 
332        "_Decimal128" => _Decimal128, 
333        "_Decimal32" => _Decimal32, 
334        "_Decimal64" => _Decimal64, 
335        "_Generic" => _Generic, 
336        "_Imaginary" => _Imaginary, 
337        "_Noreturn" => _Noreturn, 
338        "_Static_assert" => _StaticAssert, 
339        "_Thread_local" => _ThreadLocal, 
340        "asm" => Asm, 
341        _ => return None, 
342    }; 
343    Some(ans) 
344}
345
346pub fn try_into_keyword_automaton(input: &str) -> Option<Keyword> {
347    let input = input.chars();
348    let mut state = automaton::State::default(); 
349    let mut rst = None; 
350    for i in input {
351        let s = state.read(i);  
352        match s {
353            Ok((s, k)) => {
354                state = s; 
355                rst = k;  
356            }
357            Err(_) => return None,
358        }
359    }
360    return rst; 
361}
362
363pub fn try_into_keyword(input: &str) -> Option<Keyword> {
364    try_into_keyword_automaton(input)
365}