tilted/
lexer.rs

//! This module implements a lexer, or tokeniser, for [`tilted`](crate).
2//!
3//! A lexer's job is to generate a stream of [`Token`]s from user input, which
4//! is used by the [`Parser`] to generate an Abstract Syntax Tree.
5
6use std::{fmt::Display, ops::Index, slice::SliceIndex};
7
8#[cfg(feature = "serde")]
9use serde::{Deserialize, Serialize};
10
11use crate::{eof, token, LexError};
12
/// Special [`Result`] type for the lexer.
///
/// Shorthand for [`std::result::Result`] with the error type fixed to
/// [`LexError`], so lexer functions only have to name their success type.
type Result<T> = std::result::Result<T, LexError>;
15
/// Lexer for [`tilted`](crate). It reads user input and returns [`Token`]s.
///
/// The lexer owns a copy of the input text together with a cursor that
/// records how much of that text has already been consumed.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Lexer {
    /// The source text exactly as it was handed to the lexer.
    source_code: Box<str>,

    /// Position in `source_code` of the character that is lexed next.
    current_index: usize,
}
26
27/// Part of the source code tokenised. Returned by a [`Lexer`].
28#[derive(Debug, Clone, Copy, PartialEq)]
29#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
30pub struct Token {
31    /// Type of this [`Token`].
32    pub kind: TokenKind,
33
34    /// Location of this [`Token`].
35    pub span: Span,
36}
37
38/// Type of a [`Token`], also containing the information associated.
39#[derive(Debug, Clone, Copy, PartialEq)]
40#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
41pub enum TokenKind {
42    /// End-of-file token. Note that the [`Span`] associated with EOF is
43    /// out-of-bound, meaning if the span is used to look up source code, the
44    /// slice will be zero-length.
45    Eof,
46
47    /// Integer, i.e. numbers without decimal places.
48    Int(u64),
49
50    /// Floating-point number, i.e. real numbers that are not integers.
51    Flt(f64),
52
53    /// Operator.
54    Op(Operator),
55
56    /// Function.
57    Func(Function),
58
59    /// Left parenthesis.
60    LeftParen,
61
62    /// Right parenthesis.
63    RightParen,
64}
65
/// Functions recognised by the lexer.
///
/// All variants are field-less, so the type is `Copy` and implements full
/// equality and hashing; it can be used as a key in maps and sets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Function {
    /// Sine.
    Sin,

    /// Cosine.
    Cos,

    /// Tangent.
    Tan,

    /// Cosecant.
    Csc,

    /// Secant.
    Sec,

    /// Cotangent.
    Cot,

    /// Inverse sine.
    Asin,

    /// Inverse cosine.
    Acos,

    /// Inverse tangent.
    Atan,

    /// Inverse cosecant.
    Acsc,

    /// Inverse secant.
    Asec,

    /// Inverse cotangent.
    Acot,
}
106
/// Basic mathematical operators.
///
/// All variants are field-less, so the type is `Copy` and implements full
/// equality and hashing; it can be used as a key in maps and sets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum Operator {
    /// Operator `+`.
    Plus,

    /// Operator `-`.
    Minus,

    /// Operator `*`.
    Star,

    /// Operator `/`.
    Slash,

    /// Operator `^`.
    Caret,
}
126
/// Spatial information of a [`Token`]: where it sits in the source code.
///
/// Both fields are plain indices, so the type is `Copy` and implements full
/// equality and hashing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct Span {
    /// Index of the first character of this [`Span`].
    pub start_index: usize,

    /// Index marking the end of this [`Span`].
    ///
    /// NOTE(review): `Index<Span> for Lexer` slices
    /// `start_index..end_index - 1`; confirm against the `token!` macro
    /// whether this bound is meant to be inclusive or exclusive.
    pub end_index: usize,
}
137
138impl Display for Token {
139    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
140        write!(f, "{:?} at index {}", self.kind, self.span.start_index)
141    }
142}
143
144impl From<char> for Operator {
145    fn from(value: char) -> Self {
146        match value {
147            '+' => Self::Plus,
148            '-' => Self::Minus,
149            '*' => Self::Star,
150            '/' => Self::Slash,
151            '^' => Self::Caret,
152
153            // This also guards against attempts to add new operators
154            // without implementing its conversion.
155            _ => unreachable!("Unknown operator conversion"),
156        }
157    }
158}
159
160impl TryFrom<&str> for Function {
161    type Error = ();
162    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
163        match value {
164            "sin" => Ok(Self::Sin),
165            "cos" => Ok(Self::Cos),
166            "tan" => Ok(Self::Tan),
167            "csc" => Ok(Self::Csc),
168            "sec" => Ok(Self::Sec),
169            "cot" => Ok(Self::Cot),
170            "asin" => Ok(Self::Asin),
171            "acos" => Ok(Self::Acos),
172            "atan" => Ok(Self::Atan),
173            "acsc" => Ok(Self::Acsc),
174            "asec" => Ok(Self::Asec),
175            "acot" => Ok(Self::Acot),
176            _ => Err(()),
177        }
178    }
179}
180
181impl Display for Function {
182    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
183        match self {
184            Function::Sin => write!(f, "Sin"),
185            Function::Cos => write!(f, "Cos"),
186            Function::Tan => write!(f, "Tan"),
187            Function::Csc => write!(f, "Csc"),
188            Function::Sec => write!(f, "Sec"),
189            Function::Cot => write!(f, "Cot"),
190            Function::Asin => write!(f, "Asin"),
191            Function::Acos => write!(f, "Acos"),
192            Function::Atan => write!(f, "Atan"),
193            Function::Acsc => write!(f, "Acsc"),
194            Function::Asec => write!(f, "Asec"),
195            Function::Acot => write!(f, "Acot"),
196        }
197    }
198}
199
200impl<Idx: SliceIndex<str>> Index<Idx> for Lexer {
201    type Output = Idx::Output;
202
203    fn index(&self, index: Idx) -> &Self::Output {
204        &self.source_code[index]
205    }
206}
207
208impl Index<Span> for Lexer {
209    type Output = str;
210
211    fn index(&self, index: Span) -> &Self::Output {
212        &self[index.start_index..(index.end_index - 1)]
213    }
214}
215
216impl Iterator for Lexer {
217    type Item = Token;
218
219    fn next(&mut self) -> Option<Self::Item> {
220        match self.lex() {
221            Ok(token) if token.kind != TokenKind::Eof => Some(token),
222            _ => None,
223        }
224    }
225}
226
227impl Lexer {
228    /// Creates a new [`Lexer`] from source code.
229    #[allow(unused)]
230    pub fn from_source_code<T: AsRef<str>>(source_code: T) -> Lexer {
231        Lexer {
232            source_code: Box::from(source_code.as_ref()),
233            current_index: 0,
234        }
235    }
236
237    /// Gets the next [`Token`] from source.
238    pub fn lex(&mut self) -> Result<Token> {
239        // Skip whitespaces.
240        for c in self.source_code.chars().skip(self.current_index) {
241            if c.is_whitespace() {
242                self.current_index += 1;
243            } else {
244                break;
245            }
246        }
247
248        // Check for EOF.
249        if self.current_index >= self.source_code.len() {
250            return Ok(eof!(self.current_index));
251        }
252
253        // Assign to handlers based on the next character.
254        match self
255            .source_code
256            .chars()
257            .nth(self.current_index)
258            .ok_or(LexError::InternalError(
259                "Unable to unwrap next character in source",
260                self.current_index,
261            ))? {
262            // Numbers (integers and reals)
263            // Can start with a dot or number
264            '.' | '0'..='9' => self.handle_number(),
265
266            // Operators.
267            '+' | '-' | '*' | '/' | '^' => self.handle_operator(),
268
269            // Parentheses.
270            // These are short so they are handled in-place.
271            '(' => {
272                self.current_index += 1;
273                Ok(token!(TokenKind::LeftParen, self.current_index - 1, 1))
274            }
275            ')' => {
276                self.current_index += 1;
277                Ok(token!(TokenKind::RightParen, self.current_index - 1, 1))
278            }
279
280            // Functions.
281            c if c.is_ascii_alphabetic() => self.handle_function(),
282
283            // Any other characters.
284            c => Err(LexError::UnrecognisedCharacter(c, self.current_index)),
285        }
286    }
287
288    pub fn handle_number(&mut self) -> Result<Token> {
289        // Keep track of the original index for later.
290        let original_index = self.current_index;
291
292        // Tracker for decimal place.
293        let mut seen_dot = false;
294
295        // Reserve enough space for a 100-char string.
296        // Most numbers (hopefully) are within this limit. However, we still
297        // need to cover the potential cases of more than 100 digits.
298        let mut result = String::with_capacity(100);
299        for c in self.source_code[self.current_index..].chars() {
300            match c {
301                '.' => {
302                    if !seen_dot {
303                        // Dot (if not seen)
304                        self.current_index += 1;
305                        seen_dot = true;
306                        result.push('.');
307                    } else {
308                        // Dot (if already seen)
309                        return Err(LexError::UnrecognisedCharacter(c, self.current_index));
310                    }
311                }
312
313                // Digit
314                c if c.is_ascii_digit() => {
315                    result.push(c);
316                    self.current_index += 1;
317                }
318
319                // Anything else
320                _ => break,
321            }
322        }
323
324        // Convert string to integer or float based on seen_dot.
325        if seen_dot {
326            // Float
327            let num = result
328                .parse::<f64>()
329                .map_err(|_| LexError::InternalError("Parse float failed", self.current_index))?;
330
331            Ok(token!(TokenKind::Flt(num), original_index, result.len()))
332        } else {
333            // Integer
334            let num = result
335                .parse::<u64>()
336                .map_err(|_| LexError::InternalError("Parse integer failed", self.current_index))?;
337
338            Ok(token!(TokenKind::Int(num), original_index, result.len()))
339        }
340    }
341
342    pub fn handle_operator(&mut self) -> Result<Token> {
343        // Operator has only one char so it should be trivial.
344        let op =
345            self.source_code
346                .chars()
347                .nth(self.current_index)
348                .ok_or(LexError::InternalError(
349                    "Unable to unwrap operator",
350                    self.current_index,
351                ))?;
352
353        // The parent match operator should have narrowed down the valid ones,
354        // but I think it is still important to check here, just in case I mess
355        // up somewhere else. Resources are cheap anyway :)
356        match op {
357            '+' | '-' | '*' | '/' | '^' => {
358                self.current_index += 1;
359                Ok(token!(TokenKind::Op(op.into()), self.current_index - 1, 1))
360            }
361            _ => Err(LexError::InternalError(
362                "Invalid operator inside operator handler",
363                self.current_index,
364            )),
365        }
366    }
367
368    pub fn handle_function(&mut self) -> Result<Token> {
369        // Keep track of the original index for later.
370        let original_index = self.current_index;
371
372        // Trigos only contain letters.
373        let name = self.source_code[self.current_index..]
374            .chars()
375            .take_while(|c| c.is_ascii_alphabetic())
376            .collect::<String>();
377
378        // Convert string to trigonometric function.
379        let trigo = name
380            .as_str()
381            .try_into()
382            .map_err(|_| LexError::UnrecognisedFunction(name.clone(), self.current_index))?;
383
384        // Update current index.
385        self.current_index += name.len();
386
387        Ok(token!(TokenKind::Func(trigo), original_index, name.len()))
388    }
389
390    /// Reverts this [`Lexer`] to its original state.
391    #[allow(unused)]
392    pub fn reset(&mut self) {
393        // Simply set the index to 0 to reset.
394        self.current_index = 0;
395    }
396}