use super::error::ProgramError;
use super::Token;

pub type LexerResult = Result<Token, ProgramError>;

/// A lexer (tokenizer) for a mathematical expression.
///
/// The Lexer programmatically tokenizes one or more bytes of a program, with safety checks to
/// enable notification that the program has concluded (EOF). Each Token is individually useless
/// unless further processed; for instance, by the Parser.
///
/// As an example, an abstract representation of a program's tokenization is shown:
///
/// ```ignore
/// in:
///     123 + 281 ./ 3
///     ^~~ ^ ^~~ ^~ ^
/// out (over 5 iterations):
///     1. INTEGER(123)
///     2. PLUS
///     3. INTEGER(281)
///     4. DIVIDEINT
///     5. INTEGER(3)
/// ```
///
/// We expect the Lexer to be agnostic to the significance and relationship of each byte in the
/// program, instead concerned only with the creation and typing of bytes. As such, the Lexer is
/// built to refrain from analyzing anything but the values of bytes in a program.
///
/// NOTE(review): the Lexer walks `text` byte-by-byte (`as_bytes()[i] as char`), so it is
/// effectively ASCII-only; multi-byte UTF-8 characters are seen one byte at a time.
pub struct Lexer<'a> {
    /// The full program text being tokenized.
    text: &'a str,
    /// Cached byte length of `text`; the pointer is valid while `ptr < size`.
    size: usize,
    /// Byte index of the current character within `text`.
    ptr: usize,
    /// The character at `ptr`, or `None` once the end of the program is reached.
    chr: Option<char>,
}

impl<'a> Lexer<'a> {
    /// Creates a Lexer from a text program.
    pub fn from(text: &str) -> Lexer {
        Lexer {
            text: &text,
            size: text.len(),
            ptr: 0,
            chr: Some(text.as_bytes()[0] as char),
        }
    }

    /// Pushes the Lexer's character pointer by one increment and sets the Lexer's next character
    /// to the one pointed at in the program. If the pointer extends past the end of the program,
    /// the character is set to `None`.
    fn push_ptr(&mut self) {
        self.ptr += 1;
        if self.ptr < self.size {
            self.chr = Some(self.text.as_bytes()[self.ptr] as char);
        } else {
            self.chr = None;
        }
    }

    /// Gives the next `amt` characters in the program as a String. If there are less characters
    /// remaining in the program than `amt`, then only the remaining characters are given.
    fn peek_ahead(&self, amt: &usize) -> String {
        let mut result = String::new();
        let next_ptr = self.ptr;
        for pos in (next_ptr)..(next_ptr + amt) {
            if pos < self.size {
                result.push(self.text.as_bytes()[pos] as char);
            } else {
                break;
            }
        }
        result
    }

    /// Pushes the Lexer's pointer past any whitespace in the program, or until the end of the
    /// program.
    fn skip_whitespace(&mut self) {
        while let Some(chr) = self.chr {
            if chr.is_whitespace() {
                self.push_ptr();
            } else {
                break;
            }
        }
    }

    /// Build an integer in the program until a non-digit is found or the end of program is
    /// reached.
    fn build_integer(&mut self) -> usize {
        let mut result = String::new();
        while let Some(chr) = self.chr {
            if chr.is_digit(10) {
                result.push(self.chr.unwrap());
                self.push_ptr();
            } else {
                break;
            }
        }
        result.parse::<usize>().unwrap()
    }

    /// Skips `amt` Tokens and returns the current Lexer. Primarily intended for use in method
    /// chaining:
    ///
    /// ```
    /// # mod rcalc;
    /// # use rcalc::{Lexer, Token};
    /// let mut lexer = Lexer::from("2 + 3");
    /// let result = lexer.skip(1).next_token().unwrap(); // => Token::PLUS
    /// # assert_eq!(result, Token::PLUS);
    /// ```
    pub fn skip(&mut self, amt: usize) -> &mut Lexer<'a> {
        for _ in 0..amt {
            match self.next_token() {
                _ => (),
            };
        }
        self
    }

    /// Tokenizes the next one or more characters in the program. If no valid Token can be made,
    /// the Lexer panics.
    ///
    /// ```
    /// # mod rcalc;
    /// # use rcalc::{Lexer, Token};
    /// let mut lexer = Lexer::from("2 + 3");
    /// let result = lexer.next_token().unwrap(); // => Token::NUMBER(2)
    /// # assert_eq!(result, Token::NUMBER(2));
    /// ```
    pub fn next_token(&mut self) -> LexerResult {
        while let Some(chr) = self.chr {
            if chr.is_whitespace() {
                self.skip_whitespace();
                continue;
            } else if chr.is_digit(10) {
                return Ok(Token::NUMBER(self.build_integer()));
            } else {
                self.push_ptr();
                match chr {
                    // Operands
                    '+' => return Ok(Token::PLUS),
                    '-' => return Ok(Token::MINUS),
                    '*' => return Ok(Token::MULTIPLY),
                    '/' => return Ok(Token::DIVIDE),
                    '%' => return Ok(Token::MODULO),
                    '^' => return Ok(Token::EXPONENT),
                    '!' => return Ok(Token::FACTORIAL),
                    '.' => {
                        match self.peek_ahead(&1).as_ref() {
                            "/" => {
                                self.push_ptr();
                                return Ok(Token::DIVIDEINT);
                            }
                            _ => {
                                return Err(ProgramError::of(
                                    "InvalidOperand",
                                    "Invalid multi-character operand.",
                                ))
                            }
                        }
                    }

                    // Controls
                    '(' => return Ok(Token::LPAREN),
                    ')' => return Ok(Token::RPAREN),

                    // Error
                    _ => {
                        return Err(ProgramError::of(
                            "InvalidCharacter",
                            "Expected Token, found unknown character.",
                        ))
                    }
                }
            }
        }

        Ok(Token::EOF)
    }
}