rant/compiler/
lexer.rs

1use logos::*;
2use crate::InternalString;
3
// Module keywords
pub const KW_REQUIRE: &str = "require";

// Control flow keywords
pub const KW_RETURN: &str = "return";
pub const KW_BREAK: &str = "break";
pub const KW_CONTINUE: &str = "continue";
pub const KW_WEIGHT: &str = "weight";
pub const KW_IF: &str = "if";
pub const KW_ELSEIF: &str = "elseif";
pub const KW_ELSE: &str = "else";

// Value constant keywords
pub const KW_TRUE: &str = "true";
pub const KW_FALSE: &str = "false";

// Hinting keywords
pub const KW_TEXT: &str = "text";

// Output modifier keywords
pub const KW_EDIT: &str = "edit";

// Infix operator keywords
pub const KW_NEG: &str = "neg";
pub const KW_NOT: &str = "not";
pub const KW_EQ: &str = "eq";
pub const KW_NEQ: &str = "neq";
pub const KW_GT: &str = "gt";
pub const KW_GE: &str = "ge";
pub const KW_LT: &str = "lt";
pub const KW_LE: &str = "le";

/// Returns `true` if `kw_name` (without the leading `@`) is a recognized keyword.
///
/// The comparison is case-sensitive; the lexer passes the keyword text verbatim.
pub fn is_valid_keyword_name(kw_name: &str) -> bool {
  [
    KW_REQUIRE,
    KW_RETURN, KW_BREAK, KW_CONTINUE, KW_WEIGHT, KW_IF, KW_ELSEIF, KW_ELSE,
    KW_TRUE, KW_FALSE, KW_TEXT, KW_EDIT,
    KW_NEG, KW_NOT,
    KW_EQ, KW_NEQ, KW_GT, KW_GE, KW_LT, KW_LE,
  ]
  .contains(&kw_name)
}
45
/// Lexed metadata for a keyword token (`@name`).
#[derive(Debug, PartialEq)]
pub struct KeywordInfo {
  // Keyword text with the leading `@` stripped.
  pub name: InternalString,
  // Whether `name` is in the set accepted by `is_valid_keyword_name`.
  pub is_valid: bool,
}
51
/// Represents the contents of a positive float literal token.
#[derive(Debug, PartialEq)]
pub enum PositiveFloatToken {
  // Successfully parsed value.
  Value(f64),
  // The literal could not be represented as an `f64`.
  OutOfRange,
}
58
/// Represents the contents of a positive integer literal token.
#[derive(Debug, PartialEq)]
pub enum PositiveIntegerToken {
  // Successfully parsed value.
  Value(u64),
  // The literal exceeds the range of `u64`.
  OutOfRange,
}
65
/// Represents an escape sequence output.
#[derive(Debug, PartialEq)]
pub enum ParsedEscape {
  // Escape resolved to this character.
  Char(char),
  // The character following `\` is not a recognized escape.
  InvalidChar(char),
  // The hex payload of a `\x`/`\u`/`\U` escape did not form a valid code point;
  // carries the offending hex text.
  InvalidUnicode(String),
}
73
/// All token types produced by the Rant lexer.
///
/// Matching precedence between overlapping rules is controlled by the explicit
/// `priority` arguments on the `logos` attributes; rules without one use the
/// default priority `logos` derives from the pattern.
#[derive(Logos, Debug, PartialEq)]
pub enum RantToken {
  /// Sequence of printable non-whitespace characters that isn't a number
  /// This regex is so crazy because simply doing [\w\-_]+ would accidentally capture negative numbers
  // Also doubles as the `logos` error/fallback variant via `#[error]`.
  #[error]
  #[regex(r"([0-9]+(\.[0-9]+([Ee][+\-]?\d+)?|[Ee][+\-]?\d+)?[\p{L}\-_]|[\w_][\p{L}\-_]|\-[\p{L}\-_])[\w\-_]*", priority = 1)]
  Fragment,

  /// Sequence of printable whitespace characters
  // `filter_bs` suppresses the token when it sits at the very start of the source.
  #[regex(r"\s+", filter_bs, priority = 2)]
  Whitespace,
  
  /// Sequence of non-printable whitespace characters
  // Whitespace runs containing a line break are skipped entirely (no token emitted).
  #[regex(r"[\r\n]+\s*|\s*[\r\n]+", logos::skip, priority = 3)]
  IgnoredWhitespace,

  /// `-`
  #[token("-", priority = 10)]
  Minus,

  /// `-=`
  #[token("-=", priority = 11)]
  MinusEquals,
  
  /// `{`
  #[token("{")]
  LeftBrace,
  
  /// `|`
  #[token("|")]
  VertBar,

  /// `|=`
  #[token("|=")]
  VertBarEquals,
  
  /// `}`
  #[token("}")]
  RightBrace,

  /// `|>`
  #[token("|>")]
  PipeOp,

  /// `[]`
  #[token("[]")]
  PipeValue,
  
  /// `[`
  #[token("[")]
  LeftBracket,
  
  /// `]`
  #[token("]")]
  RightBracket,

  /// `(`
  #[token("(")]
  LeftParen,

  /// `)`
  #[token(")")]
  RightParen,

  /// `<>`
  #[token("<>")]
  NothingLiteral,

  /// `<`
  #[token("<")]
  LeftAngle,

  /// `>`
  #[token(">")]
  RightAngle,
  
  /// `:`
  #[token(":")]
  Colon,

  /// `::`
  #[token("::")]
  DoubleColon,

  /// `..`
  #[token("..")]
  DoubleDot,

  /// `**`
  #[token("**")]
  DoubleStar,

  /// `**=`
  #[token("**=")]
  DoubleStarEquals,
  
  /// Labeled temporal operator, e.g. `*a*`
  #[regex(r"\*[\w\-_][\w\d\-_]*\*", parse_temporal_spread_label)]
  TemporalLabeled(InternalString),

  /// `*`
  #[token("*")]
  Star,

  /// `*=`
  #[token("*=")]
  StarEquals,
  
  /// `+`
  #[token("+")]
  Plus,

  /// `+=`
  #[token("+=")]
  PlusEquals,

  /// `=`
  #[token("=")]
  Equals,
  
  /// `?`
  #[token("?")]
  Question,
  
  /// `;`
  #[token(";")]
  Semicolon,
  
  /// `@`
  // Lower priority than the `Keyword` regex so a bare `@` only matches when
  // it is not followed by a keyword-shaped identifier.
  #[token("@", priority = 1)]
  At,

  /// Keyword, e.g. `@return`
  // NOTE(review): the regex matches case-insensitively (`ignore(case)`), but
  // `is_valid_keyword_name` compares case-sensitively, so e.g. `@RETURN` lexes
  // as a keyword token flagged invalid — confirm this asymmetry is intended.
  #[regex(r"@[a-z0-9_-]+", parse_keyword, priority = 2, ignore(case))]
  Keyword(KeywordInfo),
  
  /// `/`
  #[token("/")]
  Slash,

  /// `/=`
  #[token("/=")]
  SlashEquals,

  /// `^`
  #[token("^")]
  Caret,

  /// `^=`
  #[token("^=")]
  CaretEquals,
  
  /// `$`
  #[token("$")]
  Dollar,

  /// `%`
  #[token("%")]
  Percent,

  /// `%=`
  #[token("%=")]
  PercentEquals,
  
  /// <code>`</code>
  #[token("`")]
  Hint,
  
  /// `~`
  #[token("~")]
  Sink,

  /// `&`
  #[token("&")]
  And,

  /// `&=`
  #[token("&=")]
  AndEquals,
  
  /// Unsigned integer literal
  #[regex(r"[0-9]+", parse_integer, priority = 2)]
  IntegerPositive(PositiveIntegerToken),
  
  /// Unsigned floating-point literal
  // Higher priority than the integer rule so `1.5`/`1e3` don't lex as `1` + rest.
  #[regex(r"[0-9]+(\.[0-9]+([Ee][+\-]?\d+)?|[Ee][+\-]?\d+)", parse_float, priority = 3)]
  FloatPositive(PositiveFloatToken),
  
  /// Represents inline and multi-line comments
  // First rule: `##`-delimited multi-line comments; second: `#` line comments.
  // Both are skipped (no token emitted).
  #[regex(r"\s*##([^#]|#[^#])*(##\s*)?", logos::skip, priority = 6)]
  #[regex(r"\s*#([^#][^\r\n]*)?\n?", logos::skip, priority = 5)]
  Comment,
  
  /// Represents any escape sequence
  // The sized hex escapes (`\x`, `\u`, `\U`) outrank the generic `\<char>`
  // rule via priority; `\U(...)` outranks the fixed-width `\U` form.
  #[regex(r"\\\S", parse_escape, priority = 10)]
  #[regex(r"\\x\S\S", parse_byte_escape, priority = 11)]
  #[regex(r"\\u\S\S\S\S", parse_unicode_escape, priority = 11)]
  #[regex(r"\\U\S\S\S\S\S\S\S\S", parse_unicode_escape, priority = 11)]
  #[regex(r"\\U\(\S+\)", parse_unicode_unsized_escape, priority = 12)]
  Escape(ParsedEscape),
  
  /// Represents a verbatim string literal, e.g. `"hello world"`
  // Embedded quotes are escaped by doubling (`""`).
  #[regex(r#""(""|[^"])*""#, parse_string_literal)]
  StringLiteral(InternalString),
  
  /// Error token indicating an unterminated string literal, e.g. `"foo`
  #[regex(r#""(""|[^"])*"#)]
  UnterminatedStringLiteral,
}
283
284fn parse_temporal_spread_label(lex: &mut Lexer<RantToken>) -> InternalString {
285  let slice = lex.slice();
286  InternalString::from(&slice[1 .. slice.len() - 1])
287}
288
289fn parse_string_literal(lex: &mut Lexer<RantToken>) -> InternalString {
290  let literal = lex.slice();
291  let literal_content = &literal[1..literal.len() - 1];
292  let mut string_content = InternalString::new();
293  let mut prev_quote = false;
294  for c in literal_content.chars() {
295    match c {
296      '"' => {
297        if prev_quote {
298          prev_quote = false;
299          string_content.push('"');
300        } else {
301          prev_quote = true;
302        }
303      },
304      c => {
305        string_content.push(c)
306      }
307    }
308  }
309  string_content
310}
311
312fn parse_keyword(lex: &mut Lexer<RantToken>) -> KeywordInfo {
313  let kwd_literal = lex.slice();
314  let kwd_content = &kwd_literal[1..];
315  KeywordInfo {
316    is_valid: is_valid_keyword_name(kwd_content),
317    name: InternalString::from(kwd_content),
318  }
319}
320
321/// Filter function for whitespace lexer rule to exclude whitespace at start of source
322fn filter_bs(lex: &mut Lexer<RantToken>) -> Filter<()> {
323  if lex.span().start > 0 {
324    return Filter::Emit(())
325  }
326  Filter::Skip
327}
328
329fn parse_escape(lex: &mut Lexer<RantToken>) -> ParsedEscape {
330  let slice = lex.slice();
331  ParsedEscape::Char(match slice.chars().nth(1).unwrap() {
332    'r' => '\r',
333    'n' => '\n',
334    't' => '\t',
335    '0' => '\0',
336    's' => ' ',
337    c @ (
338      '(' | ')' | '[' | ']' | '{' | '}' | '<' | '>' | 
339      '\\' | '@' | ':' | ';' | '|' | '"' |
340      '+' | '-' | '*' | '/' | '$' | '%' | '`' | '~' | '^'
341    ) => c,
342    c => return ParsedEscape::InvalidChar(c)
343  })
344}
345
346fn parse_byte_escape(lex: &mut Lexer<RantToken>) -> ParsedEscape {
347  let slice = &lex.slice()[2..];
348  let c = u8::from_str_radix(slice, 16).ok().map(char::from);
349  match c {
350    Some(c) => ParsedEscape::Char(c),
351    None => ParsedEscape::InvalidUnicode(slice.to_owned()),
352  }
353}
354
355fn parse_unicode_escape(lex: &mut Lexer<RantToken>) -> ParsedEscape {
356  let slice = &lex.slice()[2..];
357  let c = u32::from_str_radix(slice, 16).ok().and_then(char::from_u32);
358  match c {
359    Some(c) => ParsedEscape::Char(c),
360    None => ParsedEscape::InvalidUnicode(slice.to_owned()),
361  }
362}
363
364fn parse_unicode_unsized_escape(lex: &mut Lexer<RantToken>) -> ParsedEscape {
365  let len = lex.slice().len();
366  let codepoint_len = len - 4;
367  let slice = &lex.slice()[3..(len - 1)];
368  if codepoint_len > 8 { return ParsedEscape::InvalidUnicode(slice.to_owned()) }
369  let c = u32::from_str_radix(slice, 16).ok().and_then(char::from_u32);
370  match c {
371    Some(c) => ParsedEscape::Char(c),
372    None => ParsedEscape::InvalidUnicode(slice.to_owned()),
373  }
374}
375
376fn parse_float(lex: &mut Lexer<RantToken>) -> PositiveFloatToken {
377  let slice = lex.slice();
378  match slice.parse() {
379    Ok(f) => PositiveFloatToken::Value(f),
380    Err(_) => PositiveFloatToken::OutOfRange,
381  }
382}
383
384fn parse_integer(lex: &mut Lexer<RantToken>) -> PositiveIntegerToken {
385  let slice = lex.slice();
386  match slice.parse() {
387    Ok(i) => PositiveIntegerToken::Value(i),
388    Err(_) => PositiveIntegerToken::OutOfRange,
389  }
390}