// nickel_lang_parser/lexer.rs
//! The lexer, transforming an input string to a stream of tokens.
//!
//! A modal lexer is implemented on top of two standard
//! [logos](https://github.com/maciejhirsz/logos) lexers in order to support arbitrary interpolated
//! expressions, which is not possible using LALRPOP's generated lexer. To see why, consider the
//! following string:
//!
//! ```text
//! "hello, I have 1 + %{ {a = "40"}.a } + 1 bananas."
//! ```
//!
//! Once the `%{` token is encountered, the lexer has to switch back to lexing expressions as
//! usual. But at the end of the interpolated expression, `+ 1 bananas.` needs to be parsed as a
//! string again, and not as normal program tokens. Since the interpolated expression is arbitrary,
//! it can contain nested `{` and `}` (as here, with records) and strings which themselves have
//! interpolated expressions, and so on.
//!
//! This is typically not lexable using only regular expressions. To handle this, we use a *modal*
//! lexer. As hinted by the name, a modal lexer has several modes in which the same tokens can be
//! parsed differently. Ours can be in *normal* mode or in *string* mode.
//!
//! It also maintains a stack of brace counters, required inside an interpolated expression to
//! decide if a closing brace `}` belongs to the expression or is actually the closing brace of the
//! interpolated expression, indicating that we should switch back to string mode.
//!
//! When entering a string, the `Str` mode is entered. When a `%{` is encountered in a string,
//! starting an interpolated expression, the normal mode is pushed. At each starting `{` in normal
//! mode, the brace counter is incremented. At each closing `}`, it is decremented. When it reaches
//! `0`, this is the end of the current interpolated expression, and we leave the normal mode and
//! go back to string mode. In our example, this is the second `}`: at this point, the lexer knows
//! that the coming characters must be lexed as string tokens, and not as normal tokens.
32use crate::{
33    ast::Number,
34    error::LexicalError,
35    utils::{parse_number_base, parse_number_sci},
36};
37
38use logos::Logos;
39use std::ops::Range;
40
41fn symbolic_string_prefix_and_length<'input>(
42    lex: &mut logos::Lexer<'input, NormalToken<'input>>,
43) -> SymbolicStringStart<'input> {
44    let slice = lex.slice();
45    let (prefix, postfix) = slice
46        .rsplit_once('-')
47        .expect("The logos regexp ensures this succeeds");
48    SymbolicStringStart {
49        prefix,
50        length: postfix.len(),
51    }
52}
53
54// **IMPORTANT**
55// When adding or removing tokens that might be parsed as identifiers,
56// please update the [KEYWORDS] array
57/// The tokens in normal mode.
58#[derive(Logos, Debug, PartialEq, Clone)]
59#[logos(skip "((\r\n)+|[ \t\n]+)")]
60pub enum NormalToken<'input> {
61    // multiline strings cannot be used as enum tags, so we explicitly
62    // disallow that pattern.
63    #[regex("'m(%)+\"")]
64    // We forbid lone carriage returns for sanity
65    #[regex("\r[^\n]")]
66    Error,
67
68    // **IMPORTANT**
69    // This regex should be kept in sync with the one for RawEnumTag below, and
70    // also with the identifer regex in `std.package.Manifest`
71    #[regex("_*[a-zA-Z][_a-zA-Z0-9-']*")]
72    Identifier(&'input str),
73    #[regex("[0-9]*\\.?[0-9]+([eE][+\\-]?[0-9]+)?", |lex| parse_number_sci(lex.slice()).ok())]
74    DecNumLiteral(Number),
75    #[regex("0x[A-Fa-f0-9]+", |lex| parse_number_base(16, &lex.slice()[2..]).ok())]
76    HexNumLiteral(Number),
77    #[regex("0o[0-7]+", |lex| parse_number_base(8, &lex.slice()[2..]).ok())]
78    OctNumLiteral(Number),
79    #[regex("0b[01]+", |lex| parse_number_base(2, &lex.slice()[2..]).ok())]
80    BinNumLiteral(Number),
81
82    // **IMPORTANT**
83    // This regex should be kept in sync with the one for Identifier above.
84    #[regex("'_*[a-zA-Z][_a-zA-Z0-9-']*", |lex| lex.slice().split_at(1).1)]
85    RawEnumTag(&'input str),
86    #[token("'\"")]
87    StrEnumTagBegin,
88
89    #[token("Dyn")]
90    Dyn,
91    #[token("Number")]
92    Number,
93    #[token("Bool")]
94    Bool,
95    #[token("String")]
96    String,
97    #[token("Array")]
98    Array,
99
100    #[token("if")]
101    If,
102    #[token("then")]
103    Then,
104    #[token("else")]
105    Else,
106    #[token("forall")]
107    Forall,
108    #[token("in")]
109    In,
110    #[token("let")]
111    Let,
112    #[token("rec")]
113    Rec,
114    #[token("match")]
115    Match,
116
117    #[token("null")]
118    Null,
119    #[token("true")]
120    True,
121    #[token("false")]
122    False,
123    /// Or isn't a reserved keyword. It is a contextual keyword (a keyword that can be used as an
124    /// identifier because it's not ambiguous) within patterns.
125    #[token("or")]
126    Or,
127    /// As isn't a reserved keyword. It is a contextual keyword (a keyword that can be used as an
128    /// identifier because it's not ambiguous) within the `import xxx as yyy` construct.
129    #[token("as")]
130    As,
131    /// Include isn't a reserved keyword either. It is a contextual keyword (a keyword that can be
132    /// used as an identifier because it's not ambiguous) within a record literal.
133    #[token("include")]
134    Include,
135
136    #[token("?")]
137    QuestionMark,
138    #[token(",")]
139    Comma,
140    #[token(";")]
141    Semicolon,
142    #[token(":")]
143    Colon,
144    #[token("$")]
145    Dollar,
146    #[token("=")]
147    Equals,
148    #[token("!=")]
149    NotEquals,
150    #[token("&")]
151    Ampersand,
152    #[token(".")]
153    Dot,
154    #[token("\"")]
155    DoubleQuote,
156
157    #[token("+")]
158    Plus,
159    #[token("-")]
160    Minus,
161    #[token("*")]
162    Times,
163    #[token("/")]
164    Div,
165    #[token("%")]
166    Percent,
167    #[token("++")]
168    DoublePlus,
169    #[token("==")]
170    DoubleEq,
171    #[token("@")]
172    At,
173    #[token("&&")]
174    DoubleAnd,
175    #[token("||")]
176    DoublePipe,
177    #[token("!")]
178    Bang,
179    #[token("..")]
180    Ellipsis,
181
182    #[token("fun")]
183    Fun,
184    #[token("import")]
185    Import,
186    #[token("|")]
187    Pipe,
188    #[token("|>")]
189    RightPipe,
190    #[token("->")]
191    SimpleArrow,
192    #[token("=>")]
193    DoubleArrow,
194    #[token("_")]
195    Underscore,
196    #[regex("m(%+)\"", |lex| lex.slice().len())]
197    MultiStringStart(usize),
198    #[regex("[a-zA-Z][_a-zA-Z0-9-']*-s(%+)\"", symbolic_string_prefix_and_length)]
199    SymbolicStringStart(SymbolicStringStart<'input>),
200
201    #[token("%typeof%")]
202    Typeof,
203    #[token("%cast%")]
204    Cast,
205
206    #[token("%contract/apply%")]
207    ContractApply,
208    #[token("%contract/check%")]
209    ContractCheck,
210    #[token("%contract/array_lazy_apply%")]
211    ContractArrayLazyApp,
212    #[token("%contract/record_lazy_apply%")]
213    ContractRecordLazyApp,
214    #[token("%contract/custom%")]
215    ContractCustom,
216    #[token("%blame%")]
217    Blame,
218    #[token("%label/flip_polarity%")]
219    LabelFlipPol,
220    #[token("%label/polarity%")]
221    LabelPol,
222    #[token("%label/go_dom%")]
223    LabelGoDom,
224    #[token("%label/go_codom%")]
225    LabelGoCodom,
226    #[token("%label/go_field%")]
227    LabelGoField,
228    #[token("%label/go_array%")]
229    LabelGoArray,
230    #[token("%label/go_dict%")]
231    LabelGoDict,
232    #[token("%label/insert_type_variable%")]
233    LabelInsertTypeVar,
234    #[token("%label/lookup_type_variable%")]
235    LabelLookupTypeVar,
236
237    #[token("%seal%")]
238    Seal,
239    #[token("%unseal%")]
240    Unseal,
241    #[token("%enum/embed%")]
242    EnumEmbed,
243    #[token("%record/map%")]
244    RecordMap,
245    #[token("%record/insert%")]
246    RecordInsert,
247    #[token("%record/insert_with_opts%")]
248    RecordInsertWithOpts,
249    #[token("%record/remove%")]
250    RecordRemove,
251    #[token("%record/remove_with_opts%")]
252    RecordRemoveWithOpts,
253    #[token("%record/empty_with_tail%")]
254    RecordEmptyWithTail,
255    #[token("%record/seal_tail%")]
256    RecordSealTail,
257    #[token("%record/unseal_tail%")]
258    RecordUnsealTail,
259    #[token("%seq%")]
260    Seq,
261    #[token("%deep_seq%")]
262    DeepSeq,
263    #[token("%force%")]
264    OpForce,
265    #[token("%array/length%")]
266    ArrayLength,
267    #[token("%record/fields%")]
268    RecordFields,
269    #[token("%record/fields_with_opts%")]
270    RecordFieldsWithOpts,
271    #[token("%record/values%")]
272    RecordValues,
273
274    #[token("%number/arccos%")]
275    NumberArcCos,
276    #[token("%number/arcsin%")]
277    NumberArcSin,
278    #[token("%number/arctan%")]
279    NumberArcTan,
280    #[token("%number/arctan2%")]
281    NumberArcTan2,
282    #[token("%number/cos%")]
283    NumberCos,
284    #[token("%number/sin%")]
285    NumberSin,
286    #[token("%number/tan%")]
287    NumberTan,
288    #[token("%number/log%")]
289    NumberLog,
290    #[token("%pow%")]
291    Pow,
292    #[token("%trace%")]
293    Trace,
294
295    #[token("%record/has_field%")]
296    RecordHasField,
297    #[token("%record/has_field_with_opts%")]
298    RecordHasFieldWithOpts,
299    #[token("%array/map%")]
300    ArrayMap,
301    #[token("%array/at%")]
302    ArrayAt,
303    #[token("%array/generate%")]
304    ArrayGen,
305    #[token("%rec_force%")]
306    OpRecForce,
307    #[token("%rec_default%")]
308    OpRecDefault,
309    #[token("%record/field_is_defined%")]
310    RecordFieldIsDefined,
311    #[token("%record/field_is_defined_with_opts%")]
312    RecordFieldIsDefinedWithOpts,
313    #[token("%record/split_pair%")]
314    RecordSplitPair,
315    #[token("%record/disjoint_merge%")]
316    RecordDisjointMerge,
317    #[token("%record/merge_contract%")]
318    RecordMergeContract,
319    #[token("%record/freeze%")]
320    RecordFreeze,
321
322    #[token("default")]
323    Default,
324    #[token("doc")]
325    Doc,
326    #[token("optional")]
327    Optional,
328    #[token("priority")]
329    Priority,
330    #[token("force")]
331    Force,
332    #[token("not_exported")]
333    NotExported,
334
335    #[token("%hash%")]
336    OpHash,
337    #[token("%serialize%")]
338    Serialize,
339    #[token("%deserialize%")]
340    Deserialize,
341    #[token("%string/split%")]
342    StringSplit,
343    #[token("%string/trim%")]
344    StringTrim,
345    #[token("%string/chars%")]
346    StringChars,
347    #[token("%string/uppercase%")]
348    StringUppercase,
349    #[token("%string/lowercase%")]
350    StringLowercase,
351    #[token("%string/contains%")]
352    StringContains,
353    #[token("%string/compare%")]
354    StringCompare,
355    #[token("%string/replace%")]
356    StringReplace,
357    #[token("%string/replace_regex%")]
358    StringReplaceRegex,
359    #[token("%string/is_match%")]
360    StringIsMatch,
361    #[token("%string/find%")]
362    StringFind,
363    #[token("%string/find_all%")]
364    StringFindAll,
365    #[token("%string/length%")]
366    StringLength,
367    #[token("%string/substr%")]
368    StringSubstr,
369    #[token("%string/base64_encode%")]
370    StringBase64Encode,
371    #[token("%string/base64_decode%")]
372    StringBase64Decode,
373    #[token("%to_string%")]
374    ToString,
375    #[token("%number/from_string%")]
376    NumberFromString,
377    #[token("%enum/from_string%")]
378    EnumFromString,
379    #[token("%enum/get_arg%")]
380    EnumGetArg,
381    #[token("%enum/make_variant%")]
382    EnumMakeVariant,
383    #[token("%enum/is_variant%")]
384    EnumIsVariant,
385    #[token("%enum/get_tag%")]
386    EnumGetTag,
387
388    #[token("%label/with_message%")]
389    LabelWithMessage,
390    #[token("%label/with_notes%")]
391    LabelWithNotes,
392    #[token("%label/append_note%")]
393    LabelAppendNote,
394    #[token("%label/push_diag%")]
395    LabelPushDiag,
396    #[token("%array/slice%")]
397    ArraySlice,
398    #[token("%eval_nix%")]
399    EvalNix,
400
401    #[token("{")]
402    LBrace,
403    #[token("}")]
404    RBrace,
405    #[token("[")]
406    LBracket,
407    #[token("]")]
408    RBracket,
409    #[token("(")]
410    LParen,
411    #[token(")")]
412    RParen,
413    #[token("<")]
414    LAngleBracket,
415    #[token("<=")]
416    LessOrEq,
417    #[token(">")]
418    RAngleBracket,
419    #[token(">=")]
420    GreaterOrEq,
421    #[token("[|")]
422    EnumOpen,
423    #[token("|]")]
424    EnumClose,
425    #[regex("#[^\n]*", allow_greedy = true)]
426    LineComment,
427}
428
/// Names that lex as dedicated tokens rather than as [NormalToken::Identifier].
///
/// **IMPORTANT**: keep this list in sync with the keyword-like tokens declared
/// on [NormalToken] (see the note above that enum).
// NOTE(review): "merge" has no corresponding `#[token]` in the visible
// `NormalToken` enum — confirm it is intentionally reserved.
pub const KEYWORDS: &[&str] = &[
    "Dyn",
    "Number",
    "Bool",
    "String",
    "Array",
    "if",
    "then",
    "else",
    "forall",
    "in",
    "let",
    "rec",
    "match",
    "null",
    "true",
    "false",
    "fun",
    "import",
    "merge",
    "default",
    "doc",
    "optional",
    "priority",
    "force",
    "not_exported",
];
456
/// Data extracted from the opening delimiter of a symbolic string, carried by
/// [NormalToken::SymbolicStringStart].
#[derive(Debug, Clone, PartialEq)]
pub struct SymbolicStringStart<'input> {
    /// The prefix for the symbolic string, e.g. `nix-s%""%` has prefix `"nix"`
    pub prefix: &'input str,
    /// The length of the string delimiter, excluding `prefix` and `-`. E.g. `nix-s%%""%` has
    /// `length` 4, the length of `s%%"`
    pub length: usize,
}
465
/// The tokens in string mode.
#[derive(Logos, Debug, PartialEq, Eq, Clone)]
pub enum StringToken<'input> {
    // We forbid lone carriage returns for sanity
    #[regex("\r[^\n]")]
    Error,

    /// A chunk of plain string content, with line endings normalized.
    #[regex("[^\"%\\\\]+", |lex| normalize_line_endings(lex.slice()))]
    // Has lower matching priority than `Interpolation` according to Logos' rules.
    #[token("%", |lex| String::from(lex.slice()))]
    Literal(String),

    /// The closing `"` of the string.
    #[token("\"")]
    DoubleQuote,
    /// Start of an interpolated expression; the modal lexer switches to normal
    /// mode when it sees this token.
    #[token("%{")]
    Interpolation,
    /// A simple escape sequence `\c`; carries the escaped character (still to
    /// be translated, see `handle_string_token`).
    #[regex("\\\\.", |lex| lex.slice().chars().nth(1))]
    EscapedChar(char),
    // Repetition range `{2}` was not supported at the time of writing this regex.
    /// An ASCII escape `\xNN`; carries the two hex digits.
    #[regex("\\\\x[A-Fa-f0-9][A-Fa-f0-9]", |lex| &lex.slice()[2..4])]
    EscapedAscii(&'input str),
    /// A Unicode escape `\u{NNNNNN}`; carries the 1-6 hex digits.
    #[regex("\\\\u\\{[A-Fa-f0-9]{1,6}\\}", |lex| {
        let len = lex.slice().len();
        &lex.slice()[3..(len - 1)]
    })]
    EscapedUnicode(&'input str),
}
493
/// The tokens in multiline string mode.
#[derive(Logos, Debug, PartialEq, Eq, Clone)]
pub enum MultiStringToken<'input> {
    // We forbid lone carriage returns for sanity
    #[regex("\r[^\n]")]
    Error,

    /// A chunk of plain string content, with line endings normalized.
    #[regex("[^\"%]+", |lex| normalize_line_endings(lex.slice()))]
    // A token that starts as a multiline end delimiter or an interpolation sequence but is not
    // one.  These ones should have lowest matching priority according to Logos' rules, and
    // CandidateEnd and CandidateInterpolation should be matched first.
    #[token("\"", |lex| String::from(lex.slice()))]
    #[regex("%+", |lex| String::from(lex.slice()))]
    Literal(String),

    /// A candidate end. A multiline string starting delimiter `MultiStringStart` can have a
    /// variable number of `%` character, so the lexer matches candidate end delimiter, compare the
    /// number of characters, and either emit the `End` token above, or turn the `CandidateEnd` to a
    /// `FalseEnd` otherwise
    #[regex("\"%+")]
    CandidateEnd(&'input str),

    /// Same as `CandidateEnd`, but for interpolation
    #[regex("%+\\{")]
    CandidateInterpolation(&'input str),

    /// Unfortunate consequence of Logos'
    /// [issue#200](https://github.com/maciejhirsz/logos/issues/200). The other rules should be
    /// sufficient to match this as a double quote followed by a `CandidateInterpolation`, but if we
    /// omit this token, the lexer can fail unexpectedly on valid inputs because of #200.
    #[regex("\"%+\\{")]
    QuotesCandidateInterpolation(&'input str),

    /// Token emitted by the modal lexer for the parser once it has decided that a `CandidateEnd` is
    /// an actual end token. Never produced by the logos lexer itself.
    End,

    /// Start of an interpolated expression. Like [Self::End], this is
    /// synthesized by the modal lexer, not matched directly.
    Interpolation,
}
533
/// The tokens of the modal lexer: the union of the tokens of each mode.
#[derive(Debug, PartialEq, Clone)]
pub enum Token<'input> {
    /// A token lexed in normal (expression) mode.
    Normal(NormalToken<'input>),
    /// A token lexed in string mode.
    Str(StringToken<'input>),
    /// A token lexed in multiline string mode.
    MultiStr(MultiStringToken<'input>),
}
541
/// A token together with the byte offsets of its start and end in the input.
pub type SpannedToken<'input> = (usize, Token<'input>, usize);
// Shorthands for the three underlying logos lexers.
type NormalLexer<'input> = logos::Lexer<'input, NormalToken<'input>>;
type StringLexer<'input> = logos::Lexer<'input, StringToken<'input>>;
type MultiStringLexer<'input> = logos::Lexer<'input, MultiStringToken<'input>>;
546
/// The underlying logos lexer for the current mode, together with the
/// mode-specific state it needs.
pub enum ModalLexer<'input> {
    /// Lexing a normal Nickel expression.
    Normal {
        mode_data: NormalData,
        logos_lexer: NormalLexer<'input>,
    },
    /// Lexing a double-quoted string. String mode carries no extra state.
    String {
        logos_lexer: StringLexer<'input>,
    },
    /// Lexing a multiline string.
    MultiString {
        mode_data: MultiStrData,
        /// A token that has been buffered and must be returned at the next call to `next()`.
        /// Related to lexing a possible interpolation sequence, such as `%%%{`, which requires to
        /// split a candidate interpolation token in two. In this case, we need to emit the first
        /// token on the spot, and bufferize the second one, to be emitted on the following call to
        /// `next()`.
        buffer: Option<(MultiStringToken<'input>, Range<usize>)>,
        logos_lexer: MultiStringLexer<'input>,
    },
}
566
567// Wrap the `next()` function of the underlying lexer.
568impl<'input> Iterator for ModalLexer<'input> {
569    type Item = Result<Token<'input>, ()>;
570
571    fn next(&mut self) -> Option<Self::Item> {
572        match self {
573            ModalLexer::Normal { logos_lexer, .. } => Some(logos_lexer.next()?.map(Token::Normal)),
574            ModalLexer::String { logos_lexer } => Some(logos_lexer.next()?.map(Token::Str)),
575            ModalLexer::MultiString { logos_lexer, .. } => {
576                Some(logos_lexer.next()?.map(Token::MultiStr))
577            }
578        }
579    }
580}
581
/// State associated to the lexer in multiline string mode.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct MultiStrData {
    /// The number of characters of the starting delimiter, required to correctly detect the end of
    /// multiline strings.
    percent_count: usize,
    /// The position of the opening delimiter of the current multiline string. Used for error
    /// reporting.
    opening_delimiter: Range<usize>,
}
592
/// State associated to the lexer in normal mode.
#[derive(Clone, PartialEq, Eq, Debug, Default)]
pub struct NormalData {
    /// The current brace counter to determine if a closing brace is the end of
    /// an interpolated expression. Incremented on `{`, decremented on `}`; a
    /// `}` seen at zero closes the enclosing interpolated expression.
    brace_count: usize,
}
600
601impl NormalData {
602    pub fn new() -> Self {
603        Default::default()
604    }
605}
606
/// Possible lexer modes together with their associated state. `Mode` values are pushed on a stack
/// when entering a new mode and popped when a mode is exited. The associated mode data are
/// restored when restoring a previous mode.
#[derive(Clone, PartialEq, Eq, Debug)]
pub enum Mode {
    /// When lexing a normal (double quotes) string.
    String,
    /// When lexing a multiline string.
    MultiString(MultiStrData),
    /// When lexing a normal Nickel expression.
    Normal(NormalData),
}
619
/// The modal lexer: the current-mode lexer plus a stack of suspended modes.
pub struct Lexer<'input> {
    // We are forced to use an `Option` in order to be able to switch mode without cloning the
    // underlying lexer. Logos offers a `morph()` function for a in-place conversion between
    // lexers, that we want to use to transform a normal mode lexer to a string mode lexer. But
    // Rust's borrowing system won't let us take ownership of the underlying lexer without
    // replacing it first by something else, whence the `Option`. `lexer` should never be none
    // excepted in an non observable intermediate state during mode switching.
    /// The modal lexer.
    pub lexer: Option<ModalLexer<'input>>,
    /// The mode stack. Whenever a new mode is entered (starting to lex a string for example), the
    /// previous mode together with its associated state is pushed on this stack. It can be then
    /// restored once the current mode is exited (in the string example, when the string ends).
    pub modes: Vec<Mode>,
}
634
635impl<'input> Lexer<'input> {
636    pub fn new(s: &'input str) -> Self {
637        Lexer {
638            lexer: Some(ModalLexer::Normal {
639                mode_data: NormalData { brace_count: 0 },
640                logos_lexer: NormalToken::lexer(s),
641            }),
642            modes: Vec::new(),
643        }
644    }
645
646    fn enter_strlike<F>(&mut self, morph: F)
647    where
648        F: FnOnce(NormalLexer<'input>) -> ModalLexer<'input>,
649    {
650        match self.lexer.take() {
651            // Cannot transition from a string mode to another one, so the current mode must be
652            // `Normal`
653            Some(ModalLexer::Normal {
654                mode_data,
655                logos_lexer,
656            }) => {
657                self.modes.push(Mode::Normal(mode_data));
658                self.lexer = Some(morph(logos_lexer));
659            }
660            _ => panic!("lexer::enter_strlike"),
661        }
662    }
663
664    fn enter_str(&mut self) {
665        self.enter_strlike(|lexer| ModalLexer::String {
666            logos_lexer: lexer.morph(),
667        });
668    }
669
670    fn enter_indstr(&mut self, percent_count: usize, opening_delimiter: Range<usize>) {
671        self.enter_strlike(|lexer| ModalLexer::MultiString {
672            mode_data: MultiStrData {
673                percent_count,
674                opening_delimiter,
675            },
676            buffer: None,
677            logos_lexer: lexer.morph(),
678        });
679    }
680
681    fn enter_normal(&mut self) {
682        match self.lexer.take() {
683            Some(ModalLexer::String { logos_lexer }) => {
684                self.lexer = Some(ModalLexer::Normal {
685                    mode_data: NormalData::new(),
686                    logos_lexer: logos_lexer.morph(),
687                });
688
689                self.modes.push(Mode::String);
690            }
691            Some(ModalLexer::MultiString {
692                mode_data,
693                logos_lexer,
694                buffer: _,
695            }) => {
696                self.lexer = Some(ModalLexer::Normal {
697                    mode_data: NormalData::new(),
698                    logos_lexer: logos_lexer.morph(),
699                });
700
701                self.modes.push(Mode::MultiString(mode_data));
702            }
703            _ => panic!("lexer::enter_normal"),
704        }
705    }
706
707    fn leave_str(&mut self) {
708        match self.lexer.take() {
709            Some(ModalLexer::String { logos_lexer }) => {
710                // We can only enter string mode from normal mode
711                let Some(Mode::Normal(mode_data)) = self.modes.pop() else {
712                    panic!("lexer::leave_str (popped wrong mode)");
713                };
714
715                self.lexer = Some(ModalLexer::Normal {
716                    mode_data,
717                    logos_lexer: logos_lexer.morph(),
718                });
719            }
720            _ => panic!("lexer::leave_str"),
721        }
722    }
723
724    fn leave_indstr(&mut self) {
725        match self.lexer.take() {
726            Some(ModalLexer::MultiString { logos_lexer, .. }) => {
727                // We can only enter string mode from normal mode
728                let Some(Mode::Normal(data)) = self.modes.pop() else {
729                    panic!("lexer::leave_str (popped wrong mode)");
730                };
731
732                self.lexer = Some(ModalLexer::Normal {
733                    mode_data: data,
734                    logos_lexer: logos_lexer.morph(),
735                });
736            }
737            _ => panic!("lexer::leave_str"),
738        }
739    }
740
741    fn leave_normal(&mut self) {
742        match self.lexer.take() {
743            Some(ModalLexer::Normal { logos_lexer, .. }) => {
744                match self.modes.pop() {
745                    Some(Mode::String) => {
746                        self.lexer = Some(ModalLexer::String {
747                            logos_lexer: logos_lexer.morph(),
748                        })
749                    }
750                    Some(Mode::MultiString(data)) => {
751                        self.lexer = Some(ModalLexer::MultiString {
752                            mode_data: data,
753                            buffer: None,
754                            logos_lexer: logos_lexer.morph(),
755                        })
756                    }
757                    mode => panic!("lexer::leave_normal (popped mode {mode:?})"),
758                };
759            }
760            _ => panic!("lexer::leave_normal"),
761        }
762    }
763
764    /// Split a candidate interpolation token into a string literal and an interpolation token. Put
765    /// the interpolation token in the buffer to be popped later, and return the literal as the
766    /// next token.
767    ///
768    /// # Precondition
769    ///
770    /// - this function requires `s.len() >= self.count`, or will panic.
771    fn split_candidate_interp(
772        &mut self,
773        s: &'input str,
774        span: Range<usize>,
775        percent_count: usize,
776    ) -> (Token<'input>, Range<usize>) {
777        let split_at = s.len() - percent_count;
778        let next_token = MultiStringToken::Interpolation;
779        let next_span = Range {
780            start: span.start + split_at,
781            end: span.end,
782        };
783        self.bufferize(next_token, next_span);
784
785        let token = Token::MultiStr(MultiStringToken::Literal(s[0..split_at].to_owned()));
786        let span = Range {
787            start: span.start,
788            end: span.start + split_at,
789        };
790
791        (token, span)
792    }
793
    // Handle a normal token, updating the mode data if necessary.
    //
    // Mode transitions performed here:
    // - `"` or `'"` enters string mode;
    // - a multiline or symbolic string opener enters multiline string mode;
    // - `{` / `}` maintain the brace counter, and a `}` at depth zero closes
    //   the enclosing interpolated expression (or is an error at top level).
    fn handle_normal_token(
        &mut self,
        span: Range<usize>,
        token: NormalToken<'input>,
    ) -> Option<Result<SpannedToken<'input>, LexicalError>> {
        match token {
            NormalToken::DoubleQuote | NormalToken::StrEnumTagBegin => self.enter_str(),
            NormalToken::MultiStringStart(delim_size)
            | NormalToken::SymbolicStringStart(SymbolicStringStart {
                length: delim_size, ..
            }) => {
                // for interpolation & closing delimiters we only care about
                // the number of `%`s (plus the opening `"` or `{`) so we
                // drop the "kind marker" size here (i.e. the `m` character).
                let size_without_kind_marker = delim_size - 1;
                self.enter_indstr(size_without_kind_marker, span.clone())
            }
            NormalToken::LBrace => {
                self.normal_mode_data_mut().brace_count += 1;
            }
            NormalToken::RBrace => {
                let data = self.normal_mode_data_mut();
                if data.brace_count == 0 {
                    // Depth zero: this `}` doesn't match any `{` of the current
                    // expression. Either it closes an interpolated expression
                    // (there is a saved mode to resume) or it is unmatched.
                    if self.modes.is_empty() {
                        return Some(Err(LexicalError::UnmatchedCloseBrace(span.start)));
                    }

                    self.leave_normal();
                } else {
                    data.brace_count -= 1;
                }
            }
            // Ignore comment
            NormalToken::LineComment => return self.next(),
            NormalToken::Error => {
                return Some(Err(LexicalError::Generic(span)));
            }
            _ => (),
        };

        Some(Ok((span.start, Token::Normal(token), span.end)))
    }
837
838    // Handle a string token. This method currently doesn't have any side effect, as in string
839    // mode, there's no state to update.
840    fn handle_string_token(
841        &mut self,
842        span: Range<usize>,
843        token: StringToken<'input>,
844    ) -> Option<Result<SpannedToken<'input>, LexicalError>> {
845        let result = match token {
846            StringToken::DoubleQuote => {
847                self.leave_str();
848                // To make things simpler on the parser side, we only return one variant for
849                // `DoubleQuote`, namely the the normal one.
850                Token::Normal(NormalToken::DoubleQuote)
851            }
852            tok @ StringToken::Interpolation => {
853                self.enter_normal();
854                Token::Str(tok)
855            }
856            // Convert escape sequences to the corresponding character.
857            StringToken::EscapedChar(c) => {
858                if let Some(esc) = escape_char(c) {
859                    Token::Str(StringToken::EscapedChar(esc))
860                } else {
861                    return Some(Err(LexicalError::InvalidEscapeSequence(span.start + 1)));
862                }
863            }
864            StringToken::EscapedAscii(code) => {
865                if let Some(esc) = escape_ascii(code) {
866                    Token::Str(StringToken::EscapedChar(esc))
867                } else {
868                    return Some(Err(LexicalError::InvalidAsciiEscapeCode(span.start + 2)));
869                }
870            }
871            StringToken::EscapedUnicode(code) => {
872                if let Some(esc) = escape_unicode(code) {
873                    Token::Str(StringToken::EscapedChar(esc))
874                } else {
875                    let start = span.start + 3;
876                    let end = start + code.len();
877                    return Some(Err(LexicalError::InvalidUnicodeEscapeCode(start..end)));
878                }
879            }
880            StringToken::Error => {
881                return Some(Err(LexicalError::Generic(span)));
882            }
883            token => Token::Str(token),
884        };
885
886        Some(Ok((span.start, result, span.end)))
887    }
888
    // Handle a multistring token. Might push a token inside the buffer, to be returned by the
    // next call to `next()` before pulling anything new from the underlying lexer.
    fn handle_multistr_token(
        &mut self,
        mut span: Range<usize>,
        token: MultiStringToken<'input>,
    ) -> Option<Result<SpannedToken<'input>, LexicalError>> {
        // Panics if the lexer isn't currently in multistring mode (caller's invariant).
        let data = self.multistring_mode_data();

        let result = match token {
            // If we encounter a `CandidateInterpolation` token with the right number of
            // characters, this is an interpolation sequence.
            //
            // Note that the number of characters may be greater than `count`: in `m%" %%%{foo} "%`,
            // the lexer will process `%%%{` as a candidate interpolation with 4 characters, while
            // `count` is 2. In that case, we must emit a `%%` literal and put an interpolation
            // token in the buffer.
            MultiStringToken::CandidateInterpolation(s) if s.len() >= data.percent_count => {
                if s.len() == data.percent_count {
                    self.enter_normal();
                    Token::MultiStr(MultiStringToken::Interpolation)
                } else {
                    // Emit the leading literal part now; its (shorter) span replaces the
                    // original one.
                    let (token_fst, span_fst) =
                        self.split_candidate_interp(s, span, data.percent_count);
                    span = span_fst;
                    token_fst
                }
            }
            // We never lex something as a `MultiStringToken::Interpolation` directly, but rather
            // generate it in this very function from other tokens. However, such a token could
            // have still been buffered in the previous iteration, and can thus be matched here,
            // which is why we need the case below.
            tok @ MultiStringToken::Interpolation => {
                self.enter_normal();
                Token::MultiStr(tok)
            }
            // If we encounter a `QuotesCandidateInterpolation` token with as many `%` characters
            // as the current count or more, we need to split it into two tokens:
            //
            // - a string literal corresponding to the `"` followed by `(s.len() - self.count)`
            // `%`s
            // - an interpolation token
            //
            // The interpolation token is put in the buffer, to be returned next time.
            //
            // For example, in `m%""%%{exp}"%`, the `"%%{` is a `QuotesCandidateInterpolation`
            // which is split as a `"%` literal followed by an interpolation token.
            MultiStringToken::QuotesCandidateInterpolation(s) if s.len() > data.percent_count => {
                let (token_fst, span_fst) =
                    self.split_candidate_interp(s, span, data.percent_count);
                span = span_fst;
                token_fst
            }
            // Otherwise (fewer `%`s than the opening delimiter), it is just part of the string,
            // so we transform the token into a `Literal` one
            MultiStringToken::CandidateInterpolation(s)
            | MultiStringToken::QuotesCandidateInterpolation(s) => {
                Token::MultiStr(MultiStringToken::Literal(s.to_owned()))
            }
            // Strictly speaking, a candidate end delimiter with more than the required count of
            // `%` should be split between multistring end token, plus a variable number of `%`
            // tokens. This is annoying because we only buffer one token currently. We could use a
            // stack instead of a 1-length buffer, but in practice a string such as `m%" "%%` is
            // almost surely meaningless: there's no meaningful way of interpreting it
            // (although according to the grammar, it might be valid as a string followed by a
            // modulo operator `%` - which will fail anyway at runtime with a type error).
            // Thus, we prefer to emit a proper error right here.
            MultiStringToken::CandidateEnd(s) if s.len() > data.percent_count => {
                return Some(Err(LexicalError::StringDelimiterMismatch {
                    opening_delimiter: data.opening_delimiter.clone(),
                    closing_delimiter: span,
                }));
            }
            // If we encounter a `CandidateEnd` token with the same number of `%`s as the
            // starting token then it is the end of a multiline string
            MultiStringToken::CandidateEnd(s) if s.len() == data.percent_count => {
                self.leave_indstr();
                Token::MultiStr(MultiStringToken::End)
            }
            // Otherwise (fewer `%`s than required), it is just part of the string, so we
            // transform the token into a `Literal` one
            MultiStringToken::CandidateEnd(s) => {
                Token::MultiStr(MultiStringToken::Literal(s.to_owned()))
            }
            // Early report errors for now. This could change in the future
            MultiStringToken::Error => {
                return Some(Err(LexicalError::Generic(span)));
            }
            // Any other token passes through unchanged, wrapped in the multistring variant.
            token => Token::MultiStr(token),
        };

        Some(Ok((span.start, result, span.end)))
    }
981
982    // WARNING: this method expects the lexer to be in normal mode. Panics otherwise.
983    // Ideally, we wouldn't have to match on `self.lexer` again and have this (hopefully)
984    // unreachable `panic!`. In practice, the fact that `handle_normal_token` might both mutate
985    // `mode_data` or switch mode (and thus get rid of the current lexer, which holds mode_data)
986    // altogether makes it hard to do something that is both ergonomic and satisfies the borrow
987    // checker.
988    // We initially tried to thread `data` through `handle_normal_token`, but this not only
989    // requires to clone the data to avoid multiple mutable borrows to `self`, but also had a
990    // subtly wrong behavior because when reaching a comment, we call `self.next()`, and threading
991    // data properly becomes non trivial.
992    fn normal_mode_data_mut(&mut self) -> &mut NormalData {
993        match self.lexer {
994            Some(ModalLexer::Normal {
995                ref mut mode_data, ..
996            }) => mode_data,
997            _ => panic!("lexer: normal_mode_data() called while not in normal mode"),
998        }
999    }
1000
1001    fn multistring_mode_data(&self) -> &MultiStrData {
1002        match self.lexer {
1003            Some(ModalLexer::MultiString { ref mode_data, .. }) => mode_data,
1004            _ => panic!("lexer: multistring_mode_data() called while not in multistring mode"),
1005        }
1006    }
1007
1008    // WARNING: this method expects the lexer to be in multistring mode. Panics otherwise.
1009    fn bufferize(&mut self, token: MultiStringToken<'input>, span: Range<usize>) {
1010        match self.lexer {
1011            Some(ModalLexer::MultiString { ref mut buffer, .. }) => *buffer = Some((token, span)),
1012            _ => panic!("lexer: bufferize() called while not in normal mode"),
1013        }
1014    }
1015}
1016
1017impl<'input> Iterator for Lexer<'input> {
1018    type Item = Result<SpannedToken<'input>, LexicalError>;
1019
1020    fn next(&mut self) -> Option<Self::Item> {
1021        match self.lexer.as_mut().unwrap() {
1022            ModalLexer::Normal { logos_lexer, .. } => {
1023                let normal_token = logos_lexer.next()?.unwrap_or(NormalToken::Error);
1024                let span = logos_lexer.span();
1025                self.handle_normal_token(span, normal_token)
1026            }
1027            ModalLexer::String { logos_lexer } => {
1028                let string_token = logos_lexer.next()?.unwrap_or(StringToken::Error);
1029                let span = logos_lexer.span();
1030                self.handle_string_token(span, string_token)
1031            }
1032            ModalLexer::MultiString {
1033                buffer,
1034                logos_lexer,
1035                ..
1036            } => {
1037                let (multistr_token, span) = buffer.take().or_else(|| {
1038                    Some((
1039                        logos_lexer.next()?.unwrap_or(MultiStringToken::Error),
1040                        logos_lexer.span(),
1041                    ))
1042                })?;
1043
1044                self.handle_multistr_token(span, multistr_token)
1045            }
1046        }
1047    }
1048}
1049
/// Lexer that offsets all the byte indices by a given constant. This is useful when reparsing a
/// slice of the original input while keeping positions relative to the entire original input.
pub struct OffsetLexer<'input> {
    /// The underlying lexer, operating on the sliced input.
    lexer: Lexer<'input>,
    /// Constant byte offset added to both ends of every span produced by `lexer`.
    offset: usize,
}
1056
1057impl<'input> OffsetLexer<'input> {
1058    pub fn new(s: &'input str, offset: usize) -> Self {
1059        OffsetLexer {
1060            lexer: Lexer::new(s),
1061            offset,
1062        }
1063    }
1064}
1065
1066impl<'input> Iterator for OffsetLexer<'input> {
1067    type Item = Result<SpannedToken<'input>, LexicalError>;
1068
1069    fn next(&mut self) -> Option<Self::Item> {
1070        self.lexer.next().map(|result| {
1071            result.map(|(start, tok, end)| (start + self.offset, tok, end + self.offset))
1072        })
1073    }
1074}
1075
/// Generate the character corresponding to an escape char.
///
/// Returns `None` when `chr` is not a recognized escape character.
fn escape_char(chr: char) -> Option<char> {
    let escaped = match chr {
        // These characters escape to themselves.
        '\'' | '"' | '\\' | '%' => chr,
        'n' => '\n',
        'r' => '\r',
        't' => '\t',
        _ => return None,
    };
    Some(escaped)
}
1089
/// Generate the character corresponding to an ASCII escape sequence.
///
/// # Arguments
/// - `code`: a string representation of the ASCII code in hexadecimal
///
/// Returns `None` when `code` isn't valid hexadecimal or exceeds the ASCII range (`0x7F`).
fn escape_ascii(code: &str) -> Option<char> {
    let value = u8::from_str_radix(code, 16).ok()?;
    // Reject bytes above 0x7F: only genuine ASCII is accepted.
    value.is_ascii().then(|| value as char)
}
1102
/// Generate the character corresponding to a Unicode escape sequence.
///
/// `code` is the code point in hexadecimal. Returns `None` when `code` isn't valid
/// hexadecimal or doesn't denote a valid Unicode scalar value (e.g. a surrogate).
fn escape_unicode(code: &str) -> Option<char> {
    let codepoint = u32::from_str_radix(code, 16).ok()?;
    char::from_u32(codepoint)
}
1106
/// Normalize the line endings in `s` to only `\n` and, in debug mode, check
/// for lone `\r` without an accompanying `\n`.
///
/// Only `\r\n` pairs are rewritten; a lone carriage return is left in place
/// (and trips the debug assertion below).
pub fn normalize_line_endings(s: impl AsRef<str>) -> String {
    let normalized = s.as_ref().replace("\r\n", "\n");
    debug_assert!(
        !normalized.contains('\r'),
        "The lexer throws an error when it finds a lone carriage return"
    );
    normalized
}