nickel_lang_core/parser/utils.rs
1//! Various helpers and companion code for the parser are put here to keep the grammar definition
2//! uncluttered.
3use std::{
4 ffi::OsString,
5 iter,
6 {collections::HashSet, fmt::Debug},
7};
8
9use super::error::ParseError;
10
11use crate::{
12 app,
13 bytecode::ast::{
14 pattern::bindings::Bindings as _,
15 record::{FieldDef, FieldMetadata, Include},
16 *,
17 },
18 cache::InputFormat,
19 combine::CombineAlloc,
20 eval::merge::merge_doc,
21 files::FileId,
22 fun,
23 identifier::LocIdent,
24 position::{RawSpan, TermPos},
25 primop_app,
26};
27
28use malachite::{
29 base::num::conversion::traits::{FromSciString, FromStringBase},
30 Integer,
31};
32
/// Error returned when a numeric literal can't be parsed as a Nickel number, either in scientific
/// notation ([`parse_number_sci`]) or in an explicit base ([`parse_number_base`]).
pub struct ParseNumberError;
34
35pub fn parse_number_sci(slice: &str) -> Result<Number, ParseNumberError> {
36 Number::from_sci_string(slice).ok_or(ParseNumberError)
37}
38
39pub fn parse_number_base(base: u8, slice: &str) -> Result<Number, ParseNumberError> {
40 Ok(Number::from(
41 Integer::from_string_base(base, slice).ok_or(ParseNumberError)?,
42 ))
43}
44
/// Distinguish between the standard string opening delimiter `"`, the multi-line string
/// opening delimiter `m%"`, and the symbolic string opening delimiter `s%"`.
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum StringStartDelimiter<'input> {
    /// The standard opening delimiter `"`.
    Standard,
    /// The multi-line opening delimiter `m%"`.
    Multiline,
    /// A symbolic string opening delimiter, carrying its prefix (the part before `%"`).
    Symbolic(&'input str),
}
53
54impl StringStartDelimiter<'_> {
55 pub fn is_closed_by(&self, close: &StringEndDelimiter) -> bool {
56 matches!(
57 (self, close),
58 (StringStartDelimiter::Standard, StringEndDelimiter::Standard)
59 | (StringStartDelimiter::Multiline, StringEndDelimiter::Special)
60 | (
61 StringStartDelimiter::Symbolic(_),
62 StringEndDelimiter::Special
63 )
64 )
65 }
66
67 pub fn needs_strip_indent(&self) -> bool {
68 match self {
69 StringStartDelimiter::Standard => false,
70 StringStartDelimiter::Multiline | StringStartDelimiter::Symbolic(_) => true,
71 }
72 }
73}
74
/// Distinguish between the standard string closing delimiter `"` and the "special" string
/// closing delimiter `"%`.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum StringEndDelimiter {
    /// The standard closing delimiter `"`.
    Standard,
    /// The special closing delimiter `"%`, closing multi-line and symbolic strings.
    Special,
}
82
/// A string chunk literal atom, being either a string or a single char.
///
/// Because of the way the lexer handles escaping and interpolation, a contiguous static string
/// `"Some \\ \%{escaped} string"` will be lexed as a sequence of such atoms.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ChunkLiteralPart {
    /// A contiguous run of literal characters.
    Str(String),
    /// A single character (e.g. produced by an escape sequence).
    Char(char),
}
92
/// The last field of a record, that can either be a normal field declaration or an ellipsis.
#[allow(
    clippy::large_enum_variant,
    reason = "the large variant is the common one"
)]
#[derive(Clone, Debug)]
pub enum LastField<'ast> {
    /// A normal field declaration.
    FieldDecl(FieldDecl<'ast>),
    /// A trailing ellipsis `..`.
    Ellipsis,
}
103
/// A record field declaration, that can be either a standard field definition or an `include`
/// expression.
#[derive(Clone, Debug)]
pub enum FieldDecl<'ast> {
    /// A standard field definition.
    Def(FieldDef<'ast>),
    /// A single included field.
    Include(Include<'ast>),
    /// A list of included fields.
    IncludeList(Vec<Include<'ast>>),
}
112
/// The last match in a data structure pattern. This can either be a normal match, or an ellipsis
/// which can capture the rest of the data structure. The type parameter `P` is the type of the
/// pattern of the data structure (ellipsis are supported for both array and record patterns).
///
/// # Example
///
/// - In `{foo={}, bar}`, the last match is a normal match.
/// - In `{foo={}, bar, ..}`, the last match is a non-capturing ellipsis.
/// - In `{foo={}, bar, ..rest}`, the last match is a capturing ellipsis.
#[derive(Debug, PartialEq, Clone)]
pub enum LastPattern<P> {
    /// The last field is a normal match. In this case the pattern is "closed" so every record
    /// field should be matched.
    Normal(P),
    /// The pattern is "open" `, ..}`. Optionally you can bind a record containing the remaining
    /// fields to an `Identifier` using the syntax `, ..y}`.
    Ellipsis(Option<LocIdent>),
}
131
/// Trait for operators that can be eta-expanded to a function.
pub(super) trait EtaExpand {
    /// Eta-expand an operator. This wraps an operator, for example `==`, as a function `fun x1 x2
    /// => x1 == x2`. Propagate the position of the curried operator to the generated primop apps
    /// for better error reporting.
    ///
    /// `alloc` is the arena in which the resulting node is allocated; `pos` is the position of the
    /// curried operator in the source.
    fn eta_expand(self, alloc: &AstAlloc, pos: TermPos) -> Node<'_>;
}
139
/// An infix operator that is not applied. Used for the curried operator syntax (e.g `(==)`).
pub(super) struct InfixOp(pub(super) primop::PrimOp);
142
impl EtaExpand for InfixOp {
    fn eta_expand(self, alloc: &AstAlloc, pos: TermPos) -> Node<'_> {
        // We could use `LocIdent::fresh` for the newly introduced function parameters. However,
        // it has the issue that pretty printing them doesn't result in valid Nickel anymore. This
        // is why we prefer normal identifier like `x` or `y`.
        match self {
            // We treat `UnaryOp::BoolAnd` and `UnaryOp::BoolOr` separately.
            //
            // They are unary operators taking a second lazy argument, but the current mainline
            // evaluator expects that they are always fully applied (including to their argument).
            // That is, Nickel currently doesn't support a partial application like `%bool_or%
            // <arg1>` (which is fine, because the latter isn't actually representable in the
            // source language: `BoolOr` is only expressible through the infix syntax `<arg1> ||
            // <arg2>`). Thus, instead of eta-expanding to `fun x => <op> x` as we would for other
            // unary operators, we eta-expand to `fun x1 x2 => <op> x1 x2`.
            InfixOp(op @ primop::PrimOp::BoolAnd) | InfixOp(op @ primop::PrimOp::BoolOr) => {
                let fst_arg = LocIdent::from("x");
                let snd_arg = LocIdent::from("y");

                fun!(
                    alloc,
                    fst_arg,
                    snd_arg,
                    app!(
                        alloc,
                        primop_app!(alloc, op, builder::var(fst_arg)),
                        builder::var(snd_arg),
                    )
                    .with_pos(pos),
                )
                .node
            }
            // `RecordGet field record` corresponds to `record."%{field}"`. Using the curried
            // version `(.)` has thus reversed argument corresponding to the `RecordGet` primop, so
            // we need to flip them.
            InfixOp(op @ primop::PrimOp::RecordGet) => {
                // NOTE(review): this arm uses `LocIdent::new` while the other arms use
                // `LocIdent::from` — presumably equivalent constructors; confirm.
                let fst_arg = LocIdent::new("x");
                let snd_arg = LocIdent::new("y");

                fun!(
                    alloc,
                    fst_arg,
                    snd_arg,
                    // Arguments flipped: the record (second curried argument) comes first for
                    // the primop.
                    primop_app!(alloc, op, builder::var(snd_arg), builder::var(fst_arg))
                        .with_pos(pos),
                )
                .node
            }
            // Generic case: eta-expand an operator of arity `n` to
            // `fun x0 .. x(n-1) => <op> x0 .. x(n-1)`.
            InfixOp(op) => {
                let vars: Vec<_> = (0..op.arity())
                    .map(|i| LocIdent::from(format!("x{i}")))
                    .collect();
                let fun_args: Vec<_> = vars.iter().map(|arg| pattern::Pattern::any(*arg)).collect();
                let args: Vec<_> = vars.into_iter().map(builder::var).collect();

                alloc.fun(fun_args, alloc.prim_op(op, args).spanned(pos))
            }
        }
    }
}
203
/// Additional infix operators that aren't proper primitive operations in the Nickel AST but are
/// still available in the surface syntax (and desugared at parsing time). They can still be used
/// in a curried form so they need a wrapper and an `EtaExpand` implementation.
pub(super) enum ExtendedInfixOp {
    /// The reverse application operation or pipe operator `|>`.
    ReverseApp,
    /// The inequality operator `!=`.
    NotEqual,
}
213
impl EtaExpand for ExtendedInfixOp {
    fn eta_expand(self, alloc: &AstAlloc, pos: TermPos) -> Node<'_> {
        match self {
            // `x |> f` is reverse application, so `(|>)` expands to `fun x y => y x`.
            ExtendedInfixOp::ReverseApp => {
                let fst_arg = LocIdent::from("x");
                let snd_arg = LocIdent::from("y");

                fun!(
                    alloc,
                    fst_arg,
                    snd_arg,
                    app!(alloc, builder::var(snd_arg), builder::var(fst_arg)).with_pos(pos),
                )
                .node
            }
            // `x != y` is desugared as a negated equality, so `(!=)` expands to
            // `fun x y => !(x == y)`.
            ExtendedInfixOp::NotEqual => {
                let fst_arg = LocIdent::from("x");
                let snd_arg = LocIdent::from("y");

                fun!(
                    alloc,
                    fst_arg,
                    snd_arg,
                    primop_app!(
                        alloc,
                        primop::PrimOp::BoolNot,
                        primop_app!(
                            alloc,
                            primop::PrimOp::Eq,
                            builder::var(fst_arg),
                            builder::var(snd_arg),
                        )
                        .with_pos(pos),
                    )
                    .with_pos(pos),
                )
                .node
            }
        }
    }
}
255
/// Trait for structures representing annotations which can be combined with a term to build
/// another term, or another structure holding a term, such as a field. `T` is the said target
/// structure.
pub trait AttachToAst<'ast, T> {
    /// Attaches this annotation to `ast`, producing a value of the target type `T`.
    fn attach_to_ast(self, alloc: &'ast AstAlloc, ast: Ast<'ast>) -> T;
}
262
impl<'ast> CombineAlloc<'ast> for FieldMetadata<'ast> {
    /// Combine two field metadata into one. If data that can't be combined (typically, the
    /// documentation or the type annotation) are set by both, the left one's are kept.
    fn combine(alloc: &'ast AstAlloc, left: Self, right: Self) -> Self {
        let priority = match (left.priority, right.priority) {
            // Neutral corresponds to the case where no priority was specified. In that case, the
            // other priority takes precedence.
            (MergePriority::Neutral, p) | (p, MergePriority::Neutral) => p,
            // Otherwise, we keep the maximum of both priorities, as we would do when merging
            // values.
            (p1, p2) => std::cmp::max(p1, p2),
        };

        FieldMetadata {
            doc: merge_doc(left.doc, right.doc),
            annotation: CombineAlloc::combine(alloc, left.annotation, right.annotation),
            // A field is optional if either side marks it as optional.
            opt: left.opt || right.opt,
            // The resulting field will be suppressed from serialization if either of the fields to be merged is.
            not_exported: left.not_exported || right.not_exported,
            priority,
        }
    }
}
286
287impl<'ast> CombineAlloc<'ast> for LetMetadata<'ast> {
288 /// Combine two let metadata into one. Same as `FieldMetadata::combine` but restricted to the
289 /// metadata that can be associated to a let block.
290 fn combine(alloc: &'ast AstAlloc, left: Self, right: Self) -> Self {
291 LetMetadata {
292 doc: merge_doc(left.doc, right.doc),
293 annotation: CombineAlloc::combine(alloc, left.annotation, right.annotation),
294 }
295 }
296}
297
298impl<'ast> CombineAlloc<'ast> for Annotation<'ast> {
299 /// Combine two annotations. If both have `types` set, the final type
300 /// is the one of the left annotation, while the right one's type is put
301 /// inside the final `contracts`.
302 ///
303 /// Contracts are combined from left to right; the left one's are put first,
304 /// then maybe the right one's type annotation and then the right one's
305 /// contracts.
306 fn combine(alloc: &'ast AstAlloc, left: Self, right: Self) -> Self {
307 let (typ, leftover) = match (left.typ, right.typ) {
308 (left_ty @ Some(_), right_ty @ Some(_)) => (left_ty, right_ty),
309 (left_ty, right_ty) => (left_ty.or(right_ty), None),
310 };
311
312 let contracts: Vec<_> = left
313 .contracts
314 .iter()
315 .cloned()
316 .chain(leftover)
317 .chain(right.contracts.iter().cloned())
318 .collect();
319
320 alloc.annotation(typ, contracts)
321 }
322}
323
324impl<'ast> AttachToAst<'ast, Ast<'ast>> for Annotation<'ast> {
325 fn attach_to_ast(self, alloc: &'ast AstAlloc, ast: Ast<'ast>) -> Ast<'ast> {
326 if self.is_empty() {
327 return ast;
328 }
329
330 let pos = ast.pos;
331 Ast {
332 node: alloc.annotated(self, ast),
333 pos,
334 }
335 }
336}
337
338/// Takes a record access written as `foo."<access>"`, and either turn it into a static access
339/// whenever possible (when `<access>` is a static string without interpolation), or into a dynamic
340/// `%record/get%` access otherwise.
341pub fn mk_access<'ast>(alloc: &'ast AstAlloc, access: Ast<'ast>, root: Ast<'ast>) -> Node<'ast> {
342 if let Some(label) = access.node.try_str_chunk_as_static_str() {
343 alloc.prim_op(
344 primop::PrimOp::RecordStatAccess(LocIdent::new_with_pos(label, access.pos)),
345 iter::once(root),
346 )
347 } else {
348 alloc.prim_op(primop::PrimOp::RecordGet, [access, root])
349 }
350}
351
352/// Make a span from parser byte offsets.
353pub fn mk_span(src_id: FileId, l: usize, r: usize) -> RawSpan {
354 RawSpan {
355 src_id,
356 start: (l as u32).into(),
357 end: (r as u32).into(),
358 }
359}
360
361pub fn mk_pos(src_id: FileId, l: usize, r: usize) -> TermPos {
362 TermPos::Original(mk_span(src_id, l, r))
363}
364
365/// Checks that there are no duplicate bindings in a let block (when bindins are simple, that is
366/// they aren't pattern), and builds the corresponding let block node if the check passes.
367pub fn mk_let<'ast>(
368 alloc: &'ast AstAlloc,
369 rec: bool,
370 bindings: Vec<LetBinding<'ast>>,
371 body: Ast<'ast>,
372) -> Result<Node<'ast>, ParseError> {
373 // Check for duplicate names across the different bindings. We
374 // don't check for duplicate names within a single binding because
375 // there are backwards-compatibility constraints (e.g., see
376 // `RecordPattern::check_dup`).
377 let mut seen_bindings: HashSet<LocIdent> = HashSet::new();
378
379 for b in &bindings {
380 let new_bindings = b.pattern.bindings();
381 for binding in &new_bindings {
382 if let Some(old) = seen_bindings.get(&binding.id) {
383 return Err(ParseError::DuplicateIdentInLetBlock {
384 ident: binding.id,
385 prev_ident: *old,
386 });
387 }
388 }
389
390 seen_bindings.extend(new_bindings.into_iter().map(|binding| binding.id));
391 }
392
393 Ok(alloc.let_block(bindings, body, rec))
394}
395
396pub fn mk_import_based_on_filename(
397 alloc: &AstAlloc,
398 path: String,
399 _span: RawSpan,
400) -> Result<Node<'_>, ParseError> {
401 let path = OsString::from(path);
402 let format: Option<InputFormat> = InputFormat::from_path(&path);
403
404 // Fall back to InputFormat::Nickel in case of unknown filename extension for backwards compatiblilty.
405 let format = format.unwrap_or_default();
406
407 Ok(alloc.import_path(path, format))
408}
409
410pub fn mk_import_explicit(
411 alloc: &AstAlloc,
412 path: String,
413 format: LocIdent,
414 span: RawSpan,
415) -> Result<Node<'_>, ParseError> {
416 let path = OsString::from(path);
417 let Ok(format) = format.label().parse::<InputFormat>() else {
418 return Err(ParseError::InvalidImportFormat { span });
419 };
420
421 Ok(alloc.import_path(path, format))
422}
423
424/// Determine the minimal level of indentation of a multi-line string.
425///
426/// The result is determined by computing the minimum indentation level among all lines, where the
427/// indentation level of a line is the number of consecutive whitespace characters, which are
428/// either a space or a tab, counted from the beginning of the line. If a line is empty or consist
429/// only of whitespace characters, it is ignored.
430pub fn min_indent(chunks: &[StringChunk<Ast<'_>>]) -> usize {
431 let mut min: usize = usize::MAX;
432 let mut current = 0;
433 let mut start_line = true;
434
435 for chunk in chunks.iter() {
436 match chunk {
437 StringChunk::Expr(_, _) if start_line => {
438 if current < min {
439 min = current;
440 }
441 start_line = false;
442 }
443 StringChunk::Expr(_, _) => (),
444 StringChunk::Literal(s) => {
445 for c in s.chars() {
446 match c {
447 ' ' | '\t' if start_line => current += 1,
448 '\n' => {
449 current = 0;
450 start_line = true;
451 }
452 _ if start_line => {
453 if current < min {
454 min = current;
455 }
456 start_line = false;
457 }
458 _ => (),
459 }
460 }
461 }
462 }
463 }
464
465 min
466}
467
/// Strip the common indentation prefix from a multi-line string.
///
/// Determine the minimum indentation level of a multi-line string via [`min_indent`], and strip an
/// equal number of whitespace characters (` ` or `\t`) from the beginning of each line. If the last
/// line is empty or consists only of whitespace characters, it is filtered out.
///
/// The indentation of interpolated expressions in a multi-line string follows these rules:
/// - if an interpolated expression is alone on a line with whitespaces, its indentation -- minus
///   the common minimal indentation -- is stored and when the expression will be evaluated, each
///   new line will be prepended with this indentation level.
/// - if there are other non whitespace characters or interpolated expressions on the line, then it
///   is just replaced by its content. The common indentation is still stripped before the start of
///   this expression, but newlines inside it won't be affected.
///
/// Examples:
///
/// ```text
/// let x = "I\nam\nindented" in
/// m%"
///   baseline
///     ${x}
///   end
/// "%
/// ```
///
/// gives
///
/// ```text
///"baseline
///   I
///   am
///   indented
/// end"
/// ```
///
/// While
///
/// ```text
/// let x = "I\nam\nnot" in
/// m%"
///   baseline
///     ${x} sth
///   end
/// "%
/// ```
///
/// gives
///
/// ```text
///"baseline
///   I
///am
///not sth
/// end"
/// ```
pub fn strip_indent(chunks: &mut [StringChunk<Ast<'_>>]) {
    if chunks.is_empty() {
        return;
    }

    let min = min_indent(chunks);
    let mut current = 0;
    let mut start_line = true;
    let chunks_len = chunks.len();

    // When processing a line with an indented interpolated expression, as in:
    //
    // ```
    // m%"
    //   some
    //     ${x} ${y}
    //     ${x}
    //   string
    // "%
    // ```
    //
    // We don't know at the time we process the expression `${x}` if it will have to be
    // re-indented, as it depends on the rest of the line being only whitespace or not, according
    // to the indentation rule. Here, the first occurrence should not, while the second one
    // should. We can only know this once we process the next chunks, here when arriving at `${y}`.
    // To handle this, we set all indentation levels as if expressions were alone on their line
    // during the main loop, but also store the index of such chunks which indentation level must
    // be revisited once the information becomes available. Then, their indentation level is
    // erased in a last pass.
    let mut unindent: Vec<usize> = Vec::new();
    let mut expr_on_line: Option<usize> = None;

    for (index, chunk) in chunks.iter_mut().enumerate() {
        match chunk {
            StringChunk::Literal(ref mut s) => {
                let mut buffer = String::new();
                for c in s.chars() {
                    match c {
                        // Leading whitespace below the common indentation level is stripped
                        // (not pushed to the buffer).
                        ' ' | '\t' if start_line && current < min => current += 1,
                        // Leading whitespace beyond the common indentation level is kept.
                        ' ' | '\t' if start_line => {
                            current += 1;
                            buffer.push(c);
                        }
                        '\n' => {
                            current = 0;
                            start_line = true;
                            expr_on_line = None;
                            buffer.push(c);
                        }
                        // First non-whitespace character: the rest of the line is copied as-is.
                        c if start_line => {
                            start_line = false;
                            buffer.push(c);
                        }
                        c => buffer.push(c),
                    }
                }

                // Strip the first line, if it is only whitespace characters
                if index == 0 {
                    if let Some(first_index) = buffer.find('\n') {
                        if first_index == 0
                            || buffer.as_bytes()[..first_index]
                                .iter()
                                .all(|c| *c == b' ' || *c == b'\t')
                        {
                            buffer = String::from(&buffer[(first_index + 1)..]);
                        }
                    }
                }

                // Strip the last line, if it is only whitespace characters.
                if index == chunks_len - 1 {
                    if let Some(last_index) = buffer.rfind('\n') {
                        if last_index == buffer.len() - 1
                            || buffer.as_bytes()[(last_index + 1)..]
                                .iter()
                                .all(|c| *c == b' ' || *c == b'\t')
                        {
                            buffer.truncate(last_index);
                        }
                    }
                }

                *s = buffer;
            }
            StringChunk::Expr(_, ref mut indent) => {
                if start_line {
                    debug_assert!(current >= min);
                    debug_assert!(expr_on_line.is_none());
                    // The expression is (so far) alone at the start of its line: record its
                    // indentation relative to the common minimum, pending confirmation.
                    *indent = current - min;
                    start_line = false;
                    expr_on_line = Some(index);
                } else if let Some(expr_index) = expr_on_line.take() {
                    // Something else appeared on the same line: the previously recorded
                    // expression must not be re-indented after all.
                    unindent.push(expr_index);
                }
            }
        }
    }

    // Last pass: erase the indentation of expressions which turned out not to be alone on their
    // line.
    for index in unindent.into_iter() {
        match chunks.get_mut(index) {
            Some(StringChunk::Expr(_, ref mut indent)) => *indent = 0,
            _ => unreachable!(
                "all elements in `unindent` should be expressions, but found a literal"
            ),
        }
    }
}
631
#[cfg(test)]
mod tests {
    use crate::{
        combine::Combine,
        label::Label,
        term::{LabeledType, TypeAnnotation},
        typ::{Type, TypeF},
    };

    /// Combining two annotations that each carry one contract must keep the contracts in
    /// left-to-right order.
    #[test]
    fn contract_annotation_order() {
        let ty1 = LabeledType {
            typ: TypeF::Number.into(),
            label: Label::dummy(),
        };
        let annot1 = TypeAnnotation {
            typ: None,
            contracts: vec![ty1.clone()],
        };

        let ty2 = LabeledType {
            typ: TypeF::Bool.into(),
            label: Label::dummy(),
        };
        let annot2 = TypeAnnotation {
            typ: None,
            contracts: vec![ty2.clone()],
        };

        assert_eq!(Combine::combine(annot1, annot2).contracts, vec![ty1, ty2])
    }

    /// Regression test for issue [#548](https://github.com/tweag/nickel/issues/548)
    #[test]
    fn type_annotation_combine() {
        let inner = TypeAnnotation {
            typ: Some(LabeledType {
                typ: Type::from(TypeF::Number),
                label: Label::dummy(),
            }),
            ..Default::default()
        };
        let outer = TypeAnnotation::default();
        // The inner type annotation must survive combination with an empty outer annotation.
        let res = TypeAnnotation::combine(outer, inner);
        assert_ne!(res.typ, None);
    }
}