nickel_lang_core/parser/utils.rs

1//! Various helpers and companion code for the parser are put here to keep the grammar definition
2//! uncluttered.
3use std::{
4    ffi::OsString,
5    iter,
6    {collections::HashSet, fmt::Debug},
7};
8
9use super::error::ParseError;
10
11use crate::{
12    app,
13    bytecode::ast::{
14        pattern::bindings::Bindings as _,
15        record::{FieldDef, FieldMetadata, Include},
16        *,
17    },
18    cache::InputFormat,
19    combine::CombineAlloc,
20    eval::merge::merge_doc,
21    files::FileId,
22    fun,
23    identifier::LocIdent,
24    position::{RawSpan, TermPos},
25    primop_app,
26};
27
28use malachite::{
29    base::num::conversion::traits::{FromSciString, FromStringBase},
30    Integer,
31};
32
/// Error raised when a numeric literal can't be parsed as a `Number`.
pub struct ParseNumberError;
34
35pub fn parse_number_sci(slice: &str) -> Result<Number, ParseNumberError> {
36    Number::from_sci_string(slice).ok_or(ParseNumberError)
37}
38
39pub fn parse_number_base(base: u8, slice: &str) -> Result<Number, ParseNumberError> {
40    Ok(Number::from(
41        Integer::from_string_base(base, slice).ok_or(ParseNumberError)?,
42    ))
43}
44
/// Distinguish between the standard string opening delimiter `"`, the multi-line string
/// opening delimiter `m%"`, and the symbolic string opening delimiter `s%"`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum StringStartDelimiter<'input> {
    /// The standard string opening delimiter `"`.
    Standard,
    /// The multi-line string opening delimiter `m%"`.
    Multiline,
    /// A symbolic string opening delimiter, carrying its prefix.
    Symbolic(&'input str),
}
53
54impl StringStartDelimiter<'_> {
55    pub fn is_closed_by(&self, close: &StringEndDelimiter) -> bool {
56        matches!(
57            (self, close),
58            (StringStartDelimiter::Standard, StringEndDelimiter::Standard)
59                | (StringStartDelimiter::Multiline, StringEndDelimiter::Special)
60                | (
61                    StringStartDelimiter::Symbolic(_),
62                    StringEndDelimiter::Special
63                )
64        )
65    }
66
67    pub fn needs_strip_indent(&self) -> bool {
68        match self {
69            StringStartDelimiter::Standard => false,
70            StringStartDelimiter::Multiline | StringStartDelimiter::Symbolic(_) => true,
71        }
72    }
73}
74
/// Distinguish between the standard string closing delimiter `"` and the "special" string
/// closing delimiter `"%`.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum StringEndDelimiter {
    /// The standard string closing delimiter `"`.
    Standard,
    /// The special closing delimiter `"%`, which closes multi-line and symbolic strings.
    Special,
}
82
/// A string chunk literal atom, being either a string or a single char.
///
/// Because of the way the lexer handles escaping and interpolation, a contiguous static string
/// `"Some \\ \%{escaped} string"` will be lexed as a sequence of such atoms.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ChunkLiteralPart {
    /// A contiguous run of literal characters.
    Str(String),
    /// A single character (typically the result of an escape sequence).
    Char(char),
}
92
/// The last field of a record, that can either be a normal field declaration or an ellipsis.
#[allow(
    clippy::large_enum_variant,
    reason = "the large variant is the common one"
)]
#[derive(Clone, Debug)]
pub enum LastField<'ast> {
    /// A normal field declaration.
    FieldDecl(FieldDecl<'ast>),
    /// A trailing ellipsis `..`.
    Ellipsis,
}
103
/// A record field declaration, that can be either a standard field definition or an `include`
/// expression.
#[derive(Clone, Debug)]
pub enum FieldDecl<'ast> {
    /// A standard field definition.
    Def(FieldDef<'ast>),
    /// A single `include` expression.
    Include(Include<'ast>),
    /// A list of `include` expressions.
    IncludeList(Vec<Include<'ast>>),
}
112
/// The last match in a data structure pattern. This can either be a normal match, or an ellipsis
/// which can capture the rest of the data structure. The type parameter `P` is the type of the
/// pattern of the data structure (ellipses are supported for both array and record patterns).
///
/// # Example
///
/// - In `{foo={}, bar}`, the last match is a normal match.
/// - In `{foo={}, bar, ..}`, the last match is a non-capturing ellipsis.
/// - In `{foo={}, bar, ..rest}`, the last match is a capturing ellipsis.
#[derive(Debug, PartialEq, Clone)]
pub enum LastPattern<P> {
    /// The last field is a normal match. In this case the pattern is "closed", so every record
    /// field must be matched.
    Normal(P),
    /// The pattern is "open" `, ..}`. Optionally you can bind a record containing the remaining
    /// fields to an `Identifier` using the syntax `, ..y}`.
    Ellipsis(Option<LocIdent>),
}
131
/// Trait for operators that can be eta-expanded to a function.
pub(super) trait EtaExpand {
    /// Eta-expand an operator. This wraps an operator, for example `==`, as a function `fun x1 x2
    /// => x1 == x2`. Propagates the position of the curried operator to the generated primop
    /// applications for better error reporting.
    fn eta_expand(self, alloc: &AstAlloc, pos: TermPos) -> Node<'_>;
}
139
/// An infix operator that is not applied. Used for the curried operator syntax (e.g. `(==)`).
pub(super) struct InfixOp(pub(super) primop::PrimOp);
142
143impl EtaExpand for InfixOp {
144    fn eta_expand(self, alloc: &AstAlloc, pos: TermPos) -> Node<'_> {
145        // We could use `LocIdent::fresh` for the  newly introduced function parameters. However,
146        // it has the issue that pretty printing them doesn't result in valid Nickel anymore. This
147        // is why we prefer normal identifier like `x` or `y`.
148        match self {
149            // We treat `UnaryOp::BoolAnd` and `UnaryOp::BoolOr` separately.
150            //
151            // They are unary operators taking a second lazy argument, but the current mainine
152            // evaluator expects that they are always fully applied (including to their argument).
153            // That is, Nickel currently doesn't support a partial application like `%bool_or%
154            // <arg1>` (which is fine, because the latter isn't actually representable in the
155            // source language: `BoolOr` is only expressible through the infix syntax `<arg1> ||
156            // <arg2>`). Thus, instead of eta-expanding to `fun x => <op> x` as we would for other
157            // unary operators, we eta-expand to `fun x1 x2 => <op> x1 x2`.
158            InfixOp(op @ primop::PrimOp::BoolAnd) | InfixOp(op @ primop::PrimOp::BoolOr) => {
159                let fst_arg = LocIdent::from("x");
160                let snd_arg = LocIdent::from("y");
161
162                fun!(
163                    alloc,
164                    fst_arg,
165                    snd_arg,
166                    app!(
167                        alloc,
168                        primop_app!(alloc, op, builder::var(fst_arg)),
169                        builder::var(snd_arg),
170                    )
171                    .with_pos(pos),
172                )
173                .node
174            }
175            // `RecordGet field record` corresponds to `record."%{field}"`. Using the curried
176            // version `(.)` has thus reversed argument corresponding to the `RecordGet` primop, so
177            // we need to flip them.
178            InfixOp(op @ primop::PrimOp::RecordGet) => {
179                let fst_arg = LocIdent::new("x");
180                let snd_arg = LocIdent::new("y");
181
182                fun!(
183                    alloc,
184                    fst_arg,
185                    snd_arg,
186                    primop_app!(alloc, op, builder::var(snd_arg), builder::var(fst_arg))
187                        .with_pos(pos),
188                )
189                .node
190            }
191            InfixOp(op) => {
192                let vars: Vec<_> = (0..op.arity())
193                    .map(|i| LocIdent::from(format!("x{i}")))
194                    .collect();
195                let fun_args: Vec<_> = vars.iter().map(|arg| pattern::Pattern::any(*arg)).collect();
196                let args: Vec<_> = vars.into_iter().map(builder::var).collect();
197
198                alloc.fun(fun_args, alloc.prim_op(op, args).spanned(pos))
199            }
200        }
201    }
202}
203
/// Additional infix operators that aren't proper primitive operations in the Nickel AST but are
/// still available in the surface syntax (and desugared at parsing time). They can still be used
/// in a curried form, so they need a wrapper and an `EtaExpand` implementation.
pub(super) enum ExtendedInfixOp {
    /// The reverse application (or pipe) operator `|>`.
    ReverseApp,
    /// The inequality operator `!=`.
    NotEqual,
}
213
impl EtaExpand for ExtendedInfixOp {
    fn eta_expand(self, alloc: &AstAlloc, pos: TermPos) -> Node<'_> {
        match self {
            // `|>` isn't a primop: `x |> f` desugars to the application `f x`, so the
            // eta-expansion is `fun x y => y x` (note the swapped arguments).
            ExtendedInfixOp::ReverseApp => {
                let fst_arg = LocIdent::from("x");
                let snd_arg = LocIdent::from("y");

                fun!(
                    alloc,
                    fst_arg,
                    snd_arg,
                    app!(alloc, builder::var(snd_arg), builder::var(fst_arg)).with_pos(pos),
                )
                .node
            }
            // `!=` isn't a primop either: `x != y` desugars to `!(x == y)`, so the
            // eta-expansion is `fun x y => !(x == y)`.
            ExtendedInfixOp::NotEqual => {
                let fst_arg = LocIdent::from("x");
                let snd_arg = LocIdent::from("y");

                fun!(
                    alloc,
                    fst_arg,
                    snd_arg,
                    primop_app!(
                        alloc,
                        primop::PrimOp::BoolNot,
                        primop_app!(
                            alloc,
                            primop::PrimOp::Eq,
                            builder::var(fst_arg),
                            builder::var(snd_arg),
                        )
                        .with_pos(pos),
                    )
                    .with_pos(pos),
                )
                .node
            }
        }
    }
}
255
/// Trait for structures representing annotations which can be combined with a term to build
/// another term, or another structure holding a term, such as a field. `T` is the said target
/// structure.
pub trait AttachToAst<'ast, T> {
    /// Attaches `self` to `ast`, building a value of the target structure `T`.
    fn attach_to_ast(self, alloc: &'ast AstAlloc, ast: Ast<'ast>) -> T;
}
262
263impl<'ast> CombineAlloc<'ast> for FieldMetadata<'ast> {
264    /// Combine two field metadata into one. If data that can't be combined (typically, the
265    /// documentation or the type annotation) are set by both, the left one's are kept.
266    fn combine(alloc: &'ast AstAlloc, left: Self, right: Self) -> Self {
267        let priority = match (left.priority, right.priority) {
268            // Neutral corresponds to the case where no priority was specified. In that case, the
269            // other priority takes precedence.
270            (MergePriority::Neutral, p) | (p, MergePriority::Neutral) => p,
271            // Otherwise, we keep the maximum of both priorities, as we would do when merging
272            // values.
273            (p1, p2) => std::cmp::max(p1, p2),
274        };
275
276        FieldMetadata {
277            doc: merge_doc(left.doc, right.doc),
278            annotation: CombineAlloc::combine(alloc, left.annotation, right.annotation),
279            opt: left.opt || right.opt,
280            // The resulting field will be suppressed from serialization if either of the fields to be merged is.
281            not_exported: left.not_exported || right.not_exported,
282            priority,
283        }
284    }
285}
286
287impl<'ast> CombineAlloc<'ast> for LetMetadata<'ast> {
288    /// Combine two let metadata into one. Same as `FieldMetadata::combine` but restricted to the
289    /// metadata that can be associated to a let block.
290    fn combine(alloc: &'ast AstAlloc, left: Self, right: Self) -> Self {
291        LetMetadata {
292            doc: merge_doc(left.doc, right.doc),
293            annotation: CombineAlloc::combine(alloc, left.annotation, right.annotation),
294        }
295    }
296}
297
298impl<'ast> CombineAlloc<'ast> for Annotation<'ast> {
299    /// Combine two annotations. If both have `types` set, the final type
300    /// is the one of the left annotation, while the right one's type is put
301    /// inside the final `contracts`.
302    ///
303    /// Contracts are combined from left to right; the left one's are put first,
304    /// then maybe the right one's type annotation and then the right one's
305    /// contracts.
306    fn combine(alloc: &'ast AstAlloc, left: Self, right: Self) -> Self {
307        let (typ, leftover) = match (left.typ, right.typ) {
308            (left_ty @ Some(_), right_ty @ Some(_)) => (left_ty, right_ty),
309            (left_ty, right_ty) => (left_ty.or(right_ty), None),
310        };
311
312        let contracts: Vec<_> = left
313            .contracts
314            .iter()
315            .cloned()
316            .chain(leftover)
317            .chain(right.contracts.iter().cloned())
318            .collect();
319
320        alloc.annotation(typ, contracts)
321    }
322}
323
324impl<'ast> AttachToAst<'ast, Ast<'ast>> for Annotation<'ast> {
325    fn attach_to_ast(self, alloc: &'ast AstAlloc, ast: Ast<'ast>) -> Ast<'ast> {
326        if self.is_empty() {
327            return ast;
328        }
329
330        let pos = ast.pos;
331        Ast {
332            node: alloc.annotated(self, ast),
333            pos,
334        }
335    }
336}
337
338/// Takes a record access written as `foo."<access>"`, and either turn it into a static access
339/// whenever possible (when `<access>` is a static string without interpolation), or into a dynamic
340/// `%record/get%` access otherwise.
341pub fn mk_access<'ast>(alloc: &'ast AstAlloc, access: Ast<'ast>, root: Ast<'ast>) -> Node<'ast> {
342    if let Some(label) = access.node.try_str_chunk_as_static_str() {
343        alloc.prim_op(
344            primop::PrimOp::RecordStatAccess(LocIdent::new_with_pos(label, access.pos)),
345            iter::once(root),
346        )
347    } else {
348        alloc.prim_op(primop::PrimOp::RecordGet, [access, root])
349    }
350}
351
/// Make a span from parser byte offsets.
///
/// NOTE(review): the offsets are narrowed with `as u32`, so they are assumed to fit in `u32`
/// (offsets beyond 4GiB would silently wrap) — presumably guaranteed upstream by the lexer.
pub fn mk_span(src_id: FileId, l: usize, r: usize) -> RawSpan {
    RawSpan {
        src_id,
        start: (l as u32).into(),
        end: (r as u32).into(),
    }
}
360
/// Make an original position (see [TermPos::Original]) from parser byte offsets.
pub fn mk_pos(src_id: FileId, l: usize, r: usize) -> TermPos {
    TermPos::Original(mk_span(src_id, l, r))
}
364
365/// Checks that there are no duplicate bindings in a let block (when bindins are simple, that is
366/// they aren't pattern), and builds the corresponding let block node if the check passes.
367pub fn mk_let<'ast>(
368    alloc: &'ast AstAlloc,
369    rec: bool,
370    bindings: Vec<LetBinding<'ast>>,
371    body: Ast<'ast>,
372) -> Result<Node<'ast>, ParseError> {
373    // Check for duplicate names across the different bindings. We
374    // don't check for duplicate names within a single binding because
375    // there are backwards-compatibility constraints (e.g., see
376    // `RecordPattern::check_dup`).
377    let mut seen_bindings: HashSet<LocIdent> = HashSet::new();
378
379    for b in &bindings {
380        let new_bindings = b.pattern.bindings();
381        for binding in &new_bindings {
382            if let Some(old) = seen_bindings.get(&binding.id) {
383                return Err(ParseError::DuplicateIdentInLetBlock {
384                    ident: binding.id,
385                    prev_ident: *old,
386                });
387            }
388        }
389
390        seen_bindings.extend(new_bindings.into_iter().map(|binding| binding.id));
391    }
392
393    Ok(alloc.let_block(bindings, body, rec))
394}
395
396pub fn mk_import_based_on_filename(
397    alloc: &AstAlloc,
398    path: String,
399    _span: RawSpan,
400) -> Result<Node<'_>, ParseError> {
401    let path = OsString::from(path);
402    let format: Option<InputFormat> = InputFormat::from_path(&path);
403
404    // Fall back to InputFormat::Nickel in case of unknown filename extension for backwards compatiblilty.
405    let format = format.unwrap_or_default();
406
407    Ok(alloc.import_path(path, format))
408}
409
410pub fn mk_import_explicit(
411    alloc: &AstAlloc,
412    path: String,
413    format: LocIdent,
414    span: RawSpan,
415) -> Result<Node<'_>, ParseError> {
416    let path = OsString::from(path);
417    let Ok(format) = format.label().parse::<InputFormat>() else {
418        return Err(ParseError::InvalidImportFormat { span });
419    };
420
421    Ok(alloc.import_path(path, format))
422}
423
424/// Determine the minimal level of indentation of a multi-line string.
425///
426/// The result is determined by computing the minimum indentation level among all lines, where the
427/// indentation level of a line is the number of consecutive whitespace characters, which are
428/// either a space or a tab, counted from the beginning of the line. If a line is empty or consist
429/// only of whitespace characters, it is ignored.
430pub fn min_indent(chunks: &[StringChunk<Ast<'_>>]) -> usize {
431    let mut min: usize = usize::MAX;
432    let mut current = 0;
433    let mut start_line = true;
434
435    for chunk in chunks.iter() {
436        match chunk {
437            StringChunk::Expr(_, _) if start_line => {
438                if current < min {
439                    min = current;
440                }
441                start_line = false;
442            }
443            StringChunk::Expr(_, _) => (),
444            StringChunk::Literal(s) => {
445                for c in s.chars() {
446                    match c {
447                        ' ' | '\t' if start_line => current += 1,
448                        '\n' => {
449                            current = 0;
450                            start_line = true;
451                        }
452                        _ if start_line => {
453                            if current < min {
454                                min = current;
455                            }
456                            start_line = false;
457                        }
458                        _ => (),
459                    }
460                }
461            }
462        }
463    }
464
465    min
466}
467
/// Strip the common indentation prefix from a multi-line string.
///
/// Determine the minimum indentation level of a multi-line string via [`min_indent`], and strip an
/// equal number of whitespace characters (` ` or `\t`) from the beginning of each line. If the last
/// line is empty or consists only of whitespace characters, it is filtered out.
///
/// The indentation of interpolated expressions in a multi-line string follows these rules:
/// - if an interpolated expression is alone on a line with whitespaces, its indentation -- minus
///   the common minimal indentation -- is stored and when the expression will be evaluated, each
///   new line will be prepended with this indentation level.
/// - if there are other non whitespace characters or interpolated expressions on the line, then it
///   is just replaced by its content. The common indentation is still stripped before the start of
///   this expression, but newlines inside it won't be affected.
///
/// Examples:
///
/// ```text
/// let x = "I\nam\nindented" in
/// m%"
///   baseline
///     ${x}
///   end
/// "%
/// ```
///
/// gives
///
/// ```text
///"baseline
///  I
///  am
///  indented
/// end"
/// ```
///
/// While
///
/// ```text
/// let x = "I\nam\nnot" in
/// m%"
///   baseline
///     ${x} sth
///   end
/// "%
/// ```
///
/// gives
///
/// ```text
///"baseline
///  I
///am
///not sth
/// end"
/// ```
pub fn strip_indent(chunks: &mut [StringChunk<Ast<'_>>]) {
    if chunks.is_empty() {
        return;
    }

    let min = min_indent(chunks);
    let mut current = 0;
    let mut start_line = true;
    let chunks_len = chunks.len();

    // When processing a line with an indented interpolated expression, as in:
    //
    // ```
    // m%"
    //  some
    //    ${x} ${y}
    //    ${x}
    //  string
    // "%
    // ```
    //
    // We don't know at the time we process the expression `${x}` if it will have to be
    // re-indented, as it depends on the rest of the line being only whitespace or not, according
    // to the indentation rule. Here, the first occurrence should not, while the second one
    // should. We can only know this once we process the next chunks, here when arriving at `${y}`.
    // To handle this, we set all indentation levels as if expressions were alone on their line
    // during the main loop, but also store the index of such chunks which indentation level must
    // be revisited once the information becomes available. Then, their indentation level is
    // erased in a last pass.
    let mut unindent: Vec<usize> = Vec::new();
    let mut expr_on_line: Option<usize> = None;

    for (index, chunk) in chunks.iter_mut().enumerate() {
        match chunk {
            StringChunk::Literal(ref mut s) => {
                let mut buffer = String::new();
                for c in s.chars() {
                    match c {
                        // Still inside the common indentation prefix: drop the character.
                        ' ' | '\t' if start_line && current < min => current += 1,
                        // Leading whitespace beyond the common prefix is kept.
                        ' ' | '\t' if start_line => {
                            current += 1;
                            buffer.push(c);
                        }
                        '\n' => {
                            current = 0;
                            start_line = true;
                            expr_on_line = None;
                            buffer.push(c);
                        }
                        // First non-whitespace character of the line.
                        c if start_line => {
                            start_line = false;
                            buffer.push(c);
                        }
                        c => buffer.push(c),
                    }
                }

                // Strip the first line, if it is only whitespace characters
                if index == 0 {
                    if let Some(first_index) = buffer.find('\n') {
                        if first_index == 0
                            || buffer.as_bytes()[..first_index]
                                .iter()
                                .all(|c| *c == b' ' || *c == b'\t')
                        {
                            buffer = String::from(&buffer[(first_index + 1)..]);
                        }
                    }
                }

                // Strip the last line, if it is only whitespace characters.
                if index == chunks_len - 1 {
                    if let Some(last_index) = buffer.rfind('\n') {
                        if last_index == buffer.len() - 1
                            || buffer.as_bytes()[(last_index + 1)..]
                                .iter()
                                .all(|c| *c == b' ' || *c == b'\t')
                        {
                            buffer.truncate(last_index);
                        }
                    }
                }

                *s = buffer;
            }
            StringChunk::Expr(_, ref mut indent) => {
                if start_line {
                    debug_assert!(current >= min);
                    debug_assert!(expr_on_line.is_none());
                    // Record the indentation relative to the common prefix. It may be reset to 0
                    // in the final pass below if this expression turns out not to be alone on its
                    // line.
                    *indent = current - min;
                    start_line = false;
                    expr_on_line = Some(index);
                } else if let Some(expr_index) = expr_on_line.take() {
                    // A second expression on the same line: the previous one must not be
                    // re-indented after all.
                    unindent.push(expr_index);
                }
            }
        }
    }

    for index in unindent.into_iter() {
        match chunks.get_mut(index) {
            Some(StringChunk::Expr(_, ref mut indent)) => *indent = 0,
            _ => unreachable!(
                "all elements in `unindent` should be expressions, but found a literal"
            ),
        }
    }
}
631
#[cfg(test)]
mod tests {
    use crate::{
        combine::Combine,
        label::Label,
        term::{LabeledType, TypeAnnotation},
        typ::{Type, TypeF},
    };

    /// Combining two annotations must concatenate their contracts from left to right.
    #[test]
    fn contract_annotation_order() {
        let num_contract = LabeledType {
            typ: TypeF::Number.into(),
            label: Label::dummy(),
        };
        let bool_contract = LabeledType {
            typ: TypeF::Bool.into(),
            label: Label::dummy(),
        };

        let left = TypeAnnotation {
            typ: None,
            contracts: vec![num_contract.clone()],
        };
        let right = TypeAnnotation {
            typ: None,
            contracts: vec![bool_contract.clone()],
        };

        assert_eq!(
            Combine::combine(left, right).contracts,
            vec![num_contract, bool_contract]
        )
    }

    /// Regression test for issue [#548](https://github.com/tweag/nickel/issues/548): combining
    /// with an empty outer annotation must not drop the inner type annotation.
    #[test]
    fn type_annotation_combine() {
        let inner = TypeAnnotation {
            typ: Some(LabeledType {
                typ: Type::from(TypeF::Number),
                label: Label::dummy(),
            }),
            ..Default::default()
        };

        let combined = TypeAnnotation::combine(TypeAnnotation::default(), inner);
        assert_ne!(combined.typ, None);
    }
}