1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
//! Lexer Module.

/// Module of functions for checking characters.
pub mod char_tests;
use lexer::char_tests::*;

use position::Position;
/// Module for defining, tracking, and printing lexer related errors.
pub mod error;
use lexer::error::LexerError;


use std::collections::HashSet;
/// Lexer for Wright source code.
///
/// Owns the source text and exposes the `tokens` produced from it by the
/// `lex` method. The tokens are an internal representation of the source,
/// sliced into parsable "lexemes" (also called "tokens").
#[derive(Clone, Debug)]
pub struct Lexer {
    /// Source text to tokenize. Kept private: it is set by the constructor
    /// and read by `lex`.
    source: String,
    /// Tokens generated by `lex`, in source order.
    pub tokens: Vec<String>,
}

impl Lexer {
    /// Constant containing all Strings that can represent any symbol or operator.
    /// Length of every symbol is 2 characters at most.
    /// Not all symbols currently have functionality in the wright language.
    pub const SYMBOLS: [&'static str; 55] = [
        "!", "~", "^", "=",
        "&", "&&", "|", "||",
        "+", "+=", "++",
        "-", "-=", "--",
        "*", "*=",
        "/", "/=",
        "%", "%=",
        "//", "/*", "*/",
        "/!","/?","?/", // doc comments
        ":", "::", "->", ".",
        "..",
        ";",
        "(", ")", "[", "]", "{", "}",
        "=>",
        "@", "#", "?", "$",
        "?!", // for compiler builtin checks
        "==", "!=", ">", "<", ">=", "<=",
        ">>", "<<",
        "\"", "'",
        "`",
    ];
    /// Constructor.
    /// Content argument is source code written in wright.
    pub fn new(content: String) -> Self {
        Lexer {
            source: content,
            tokens: vec![],
        }
    }
    /// Tokenizes `self.source` and stores to `self.tokens`.
    /// #### Is completely loss-less.
    /// No source-code is lost in this conversion, it's all just split into parsable tokens.
    /// Note that this lexing follows the rules of the Wright language syntax, detailed in the
    /// Wright book docs.
    pub fn lex(&mut self) -> Result<(), LexerError> {
        let mut current_position = Position::new();
        let mut current_token = String::new();
        let mut current_line = String::new();
        let mut chars: Vec<char> = self.source.chars().collect();
        // Turns symbol list into workable vector of Vec<char>
        let symbol_char_pairs: Vec<Vec<char>> = Lexer::SYMBOLS
            .iter()
            .map(|x| x
                .to_string()
                .chars()
                .collect()
            ).collect();
        // reverse chars so that pop() and push() read L->R, Top->End
        chars.reverse();
        //println!("{:?}", symbol_char_pairs);
        // while there's another character
        'consumption : while let Some(character) = chars.pop() {
            current_position.increment_column();
            current_line.push(character);
            current_token.push(character);
            if is_symbol(character) {
                let mut possible_next_chars: HashSet<char> = HashSet::new();
                // go through every pair, and add the second character if it starts with `character`
                for pair in symbol_char_pairs.clone() {
                    if pair.len() == 2 && pair[0] == character {
                        if !possible_next_chars.contains(&pair[1]) {
                            possible_next_chars.insert(pair[1]);
                        }
                    }
                }
                // get the next character if possible
                if let Some(next_char) = chars.pop() {
                    if possible_next_chars.contains(&next_char) {
                        current_token.push(next_char);
                        current_line.push(next_char);
                        current_position.increment_column();
                        // special case with double char symbols (not quotes)
                        match current_token.clone().as_str() {
                            "//"|"/!" => {   // single line comment or single line doc comment
                                // if EOF is reached, this will just stop and
                                // push the current token.
                                'take_comment : while let Some(comment_char) = chars.pop() {
                                    // until end of line
                                    // follow principals of true loss-less lexing;
                                    // the newline character will be put in the token
                                    current_token.push(comment_char);
                                    current_line.push(comment_char);
                                    current_position.increment_column();
                                    if comment_char == '\n' {
                                        current_position.increment_line();
                                        current_line = String::new();
                                        break 'take_comment;
                                    }
                                }
                            },
                            "/*" => {   // multi line comments
                                let mut last = ' ';
                                'take_multiline_comment: while let Some(comment_char) = chars.pop(){
                                    current_position.increment_column();
                                    current_token.push(comment_char);
                                    if comment_char == '\n' {
                                        current_position.increment_line();
                                        current_line = String::new();
                                    }
                                    else if comment_char == '/' && last == '*'{
                                        break 'take_multiline_comment;
                                    }
                                    last = comment_char;
                                }

                            },
                            "/?" => {   // multi line doc comments
                                let mut last = ' ';
                                'take_multi_doc_comment: while let Some(comment_char) = chars.pop(){
                                    current_position.increment_column();
                                    current_token.push(comment_char);
                                    if comment_char == '\n' {
                                        current_position.increment_line();
                                        current_line = String::new();
                                    }
                                        else if comment_char == '/' && last == '?'{
                                            break 'take_multi_doc_comment;
                                        }
                                    last = comment_char;
                                }
                            },
                            _ => {},
                        }
                        // factored out of match statement
                        self.tokens.push(current_token);
                        current_token = String::new();
                        // move to next iteration
                    } else {
                        // put the next_char back on the char stack if it doesn't make a possible
                        // token
                        chars.push(next_char);
                        // single symbol token so far, no eof reached
                        match current_token.clone().as_str() {
                            "\"" => {
                                'take_quote : while let Some(quote_char) = chars.pop() {
                                    current_position.increment_column();
                                    current_line.push(quote_char);
                                    current_token.push(quote_char);
                                    if quote_char == '\n' {
                                        current_position.increment_line();
                                        current_line = String::new();
                                    }
                                    // escaped characters
                                    else if quote_char == '\\' {
                                        if let Some(escaped_char) = chars.pop() {
                                            current_position.increment_column();
                                            current_line.push(escaped_char);
                                            current_token.push(escaped_char);
                                            if escaped_char == '\n' {
                                                current_position.increment_line();
                                                current_line = String::new();
                                            }
                                        } else {
                                            // reach EOF and break.
                                            break 'take_quote;
                                        }
                                    }
                                    else if quote_char == '"' {     // end of quote reached
                                        break 'take_quote;
                                    }
                                }
                            },
                            "'" => {
                                'take_char_literal: while let Some(char_literal_char) = chars.pop() {
                                    current_position.increment_column();
                                    current_line.push(char_literal_char);
                                    current_token.push(char_literal_char);
                                    if char_literal_char == '\n' {
                                        current_position.increment_line();
                                        current_line = String::new();
                                    }
                                    // escaped characters
                                    else if char_literal_char == '\\' {
                                        if let Some(escaped_char) = chars.pop() {
                                            current_position.increment_column();
                                            current_line.push(escaped_char);
                                            current_token.push(escaped_char);
                                            if escaped_char == '\n' {
                                                current_position.increment_line();
                                                current_line = String::new();
                                            }
                                        } else {
                                            // reach EOF and break.
                                            break 'take_char_literal;
                                        }
                                    }
                                    else if char_literal_char == '\'' {     // end of quote reached
                                        break 'take_char_literal;
                                    }
                                }
                            },
                            _ => {}, //otherwise do nothing
                        }
                    }
                }
                // found one symbol character and reached eof...
                self.tokens.push(current_token);
                current_token = String::new();

            }
            else if is_alpha(character) {
                // take chars for an identifier. (a-z, 0-9, _)
                // is_alpha could also imply the start of a keyword
                // but that doesn't really matter at this point.
                'take_identifier : while let Some(next_char) = chars.pop() {
                    if is_alphanumeric(next_char) || next_char == '_' {
                        current_position.increment_column();
                        current_token.push(next_char);
                        current_line.push(next_char);
                    } else {
                        chars.push(next_char);
                        self.tokens.push(current_token);
                        current_token = String::new();
                        break 'take_identifier;
                    }
                }
            }
            else if is_digit(character) {
                // take chars for a number literal.
                let mut had_decimal = false;
                if character == '0' {
                    if let Some('x') = chars.pop() {
                        current_token.push('x');
                        current_line.push('x');
                        current_position.increment_column();
                        'take_hex_literal: while let Some(next_char) = chars.pop() {
                            if is_hex_digit(next_char) {
                                current_position.increment_column();
                                current_line.push(next_char);
                                current_token.push(next_char);
                            }
                            else {
                                // not a digit; put it back
                                chars.push(next_char);
                                break 'take_hex_literal;
                            }
                        }
                    }
                    else if let Some('b') = chars.pop() {
                        current_token.push('b');
                        current_line.push('b');
                        current_position.increment_column();
                        'take_bin_literal: while let Some(next_char) = chars.pop() {
                            if is_bin_digit(next_char) {
                                current_position.increment_column();
                                current_line.push(next_char);
                                current_token.push(next_char);
                            }
                            else {
                                // not a digit; put it back
                                chars.push(next_char);
                                break 'take_bin_literal;
                            }
                        }
                    }
                    else {
                        // very similar to below, takes for decimal or integer.
                        'take_num_literal: while let Some(next_char) = chars.pop() {
                            if is_digit(next_char) || (next_char == '.' && !had_decimal) {
                                current_position.increment_column();
                                current_line.push(next_char);
                                current_token.push(next_char);
                                if next_char == '.' {had_decimal = true;}
                            }
                            else {
                                // not a digit; put it back
                                chars.push(next_char);
                                break 'take_num_literal;
                            }
                        }
                    }
                } else {
                    // not a hex or binary
                    let mut had_decimal = false;
                    'take_dec_literal: while let Some(next_char) = chars.pop() {
                        if is_digit(next_char) || (next_char == '.' && !had_decimal) {
                            current_position.increment_column();
                            current_line.push(next_char);
                            current_token.push(next_char);
                            if next_char == '.' {had_decimal = true;}
                        }
                        else {
                            // not a digit; put it back
                            chars.push(next_char);
                            break 'take_dec_literal;
                        }
                    }
                }
                // push token and reset
                // this code is common to all possible lexemes, and is therefore factored out
                self.tokens.push(current_token);
                current_token = String::new();
            }
            else {
                if current_token == "\n".to_string() {
                    current_position.increment_line();
                    current_line = String::new();
                }
                self.tokens.push(current_token);
                current_token = String::new();
            }
        }
        return Ok(());
    }
}