mech_syntax/
lib.rs

1// # Syntax
2
3#![cfg_attr(feature = "no-std", no_std)]
4#![cfg_attr(feature = "no-std", alloc)]
5#![feature(extract_if)]
6#![feature(get_mut_unchecked)]
7#![allow(dead_code)]
8#![allow(warnings)]
9#![feature(step_trait)]
10
11extern crate mech_core;
12#[cfg(feature="no-std")] #[macro_use] extern crate alloc;
13#[cfg(not(feature = "no-std"))] extern crate core;
14extern crate hashbrown;
15extern crate nom;
16extern crate nom_unicode;
17extern crate tabled;
18
19use mech_core::*;
20use mech_core::nodes::*;
21use std::cell::RefCell;
22use std::rc::Rc;
23
24#[cfg(not(feature = "no-std"))] use core::fmt;
25#[cfg(feature = "no-std")] use alloc::fmt;
26#[cfg(feature = "no-std")] use alloc::string::String;
27#[cfg(feature = "no-std")] use alloc::vec::Vec;
28use nom::{
29  IResult,
30  branch::alt,
31  sequence::tuple,
32  combinator::{opt, eof},
33  multi::{many1, many_till, many0, separated_list1},
34  Err,
35};
36
37use std::collections::HashMap;
38use colored::*;
39
40pub mod mechdown;
41pub mod expressions;
42pub mod statements;
43pub mod structures;
44pub mod base;
45pub mod parser;
46pub mod formatter;
47pub mod grammar;
48pub mod literals;
49pub mod state_machines;
50pub mod functions;
51pub mod repl;
52
53pub use crate::parser::*;
54pub use crate::mechdown::*;
55pub use crate::expressions::*;
56pub use crate::statements::*;
57pub use crate::structures::*;
58pub use crate::base::*;
59pub use crate::formatter::*;
60pub use crate::grammar::*;
61pub use crate::literals::*;
62pub use crate::state_machines::*;
63pub use crate::functions::*;
64pub use crate::repl::*;
65
66
67/// Unicode grapheme group utilities.
68/// Current implementation does not guarantee correct behavior for
69/// all possible unicode characters.
70pub mod graphemes {
71  use unicode_segmentation::UnicodeSegmentation;
72
73  /// Obtain unicode grapheme groups from input source, then make sure
74  /// it ends with new_line.  Many functions in the parser assume input
75  /// ends with new_line.
76  pub fn init_source(text: &str) -> Vec<&str> {
77    let mut graphemes = UnicodeSegmentation::graphemes(text, true).collect::<Vec<&str>>();
78    graphemes.push("\n");
79    graphemes
80  }
81
82  pub fn init_tag(tag: &str) -> Vec<&str> {
83    UnicodeSegmentation::graphemes(tag, true).collect::<Vec<&str>>()
84  }
85
86  pub fn is_new_line(grapheme: &str) -> bool {
87    match grapheme {
88      "\r" | "\n" | "\r\n" => true,
89      _ => false,
90    }
91  }
92
93  pub fn is_numeric(grapheme: &str) -> bool {
94    grapheme.chars().next().unwrap().is_numeric()
95  }
96
97  pub fn is_alpha(grapheme: &str) -> bool {
98    grapheme.chars().next().unwrap().is_alphabetic()
99  }
100
101  pub fn is_emoji(grapheme: &str) -> bool {
102    let ch = grapheme.chars().next().unwrap();
103    !(ch.is_alphanumeric() || ch.is_ascii())
104  }
105
106  pub fn width(grapheme: &str) -> usize {
107    // TODO: uniode width?
108    let ch = grapheme.chars().next().unwrap();
109    if ch == '\t' {
110      1
111    } else if ch.is_control() {
112      0
113    } else {
114      1
115    }
116  }
117}
118
/// Shorthand for a nom [`IResult`] whose input is a [`ParseString`], whose
/// output is `O`, and whose error is a [`ParseError`].
pub type ParseResult<'a, O> = IResult<ParseString<'a>, O, ParseError<'a>>;
121
/// The input type for nom parsers. Instead of holding the actual input
/// string, this struct only holds a reference to the list of graphemes so
/// that it can be cloned at much lower cost.
///
/// Note that `error_log` is an owned `Vec`, so a clone still copies every
/// error logged so far.
#[derive(Clone, Debug)]
pub struct ParseString<'a> {
  /// Source code, as a borrowed list of graphemes
  pub graphemes: &'a Vec<&'a str>,
  /// Error report, a list of (error_location, error_context)
  pub error_log: Vec<(SourceRange, ParseErrorDetail)>,
  /// Index into `graphemes` of the next grapheme to consume
  pub cursor: usize,
  /// Location of the grapheme pointed to by `cursor`
  pub location: SourceLocation,
}
136
137impl<'a> ParseString<'a> {
138  /// Must always point a an actual string
139  pub fn new(graphemes: &'a Vec<&'a str>) -> Self {
140    ParseString {
141      graphemes,
142      error_log: vec![],
143      cursor: 0,
144      location: SourceLocation { row: 1, col: 1 },
145    }
146  }
147
148  pub fn rest(&self) -> String {
149    // Return the rest of the string from current cursor
150    let mut s = String::new();
151    for i in self.cursor..self.graphemes.len() {
152      s.push_str(self.graphemes[i]);
153    }
154    s
155  }
156
157  pub fn peek(&self, n: usize) -> Option<&str> {
158    self.graphemes.get(self.cursor + n).copied()
159  }
160
161  pub fn current(&self) -> Option<&str> {
162    self.graphemes.get(self.cursor).copied()
163  }
164
165  pub fn next(&self) -> Option<&str> {
166    self.graphemes.get(self.cursor + 1).copied()
167  }
168
169  /// If current location matches the tag, consume the matched string.
170  fn consume_tag(&mut self, tag: &str) -> Option<String> {
171    if self.is_empty() {
172      return None;
173    }
174    let current = self.graphemes[self.cursor];
175
176    let gs = graphemes::init_tag(tag); 
177    let gs_len = gs.len();
178
179    // Must have enough remaining characters
180    if self.len() < gs_len {
181      return None;
182    }
183
184    // Try to match the tag
185    let mut tmp_location = self.location;
186    for i in 0..gs_len {
187      let c = self.cursor + i;
188      let g = self.graphemes[c];
189      if g != gs[i] {
190        return None;
191      }
192      if graphemes::is_new_line(g) {
193        if !self.is_last_grapheme(c) {
194          tmp_location.row += 1;
195          tmp_location.col = 1;
196        }
197      } else {
198        tmp_location.col += graphemes::width(g);
199      }
200    }
201    // Tag matched, commit change
202    self.cursor += gs_len;
203    self.location = tmp_location;
204    Some(tag.to_string())
205  }
206
207  /// Mutate self by consuming one grapheme
208  fn consume_one(&mut self) -> Option<String> {
209    if self.is_empty() {
210      return None;
211    }
212    let g = self.graphemes[self.cursor];
213    if graphemes::is_new_line(g) {
214      if !self.is_last_grapheme(self.cursor) {
215        self.location.row += 1;
216        self.location.col = 1;
217      }
218    } else {
219      self.location.col += graphemes::width(g);
220    }
221    self.cursor += 1;
222    Some(g.to_string())
223  }
224
225
226  /// If current location matches any emoji, consume the matched string.
227  fn consume_emoji(&mut self) -> Option<String> {
228    if self.is_empty() {
229      return None;
230    }
231    let g = self.graphemes[self.cursor];
232    
233    if graphemes::is_emoji(g) {
234      self.cursor += 1;
235      self.location.col += graphemes::width(g);
236      Some(g.to_string())
237    } else {
238      None
239    }
240  }
241
242  /// If current location matches any alpha char, consume the matched string.
243  fn consume_alpha(&mut self) -> Option<String> {
244    if self.is_empty() {
245      return None;
246    }
247    let g = self.graphemes[self.cursor];
248    if graphemes::is_alpha(g) {
249      self.cursor += 1;
250      self.location.col += graphemes::width(g);
251      Some(g.to_string())
252    } else {
253      None
254    }
255  }
256
257  /// If current location matches any digit, consume the matched string.
258  fn consume_digit(&mut self) -> Option<String> {
259    if self.is_empty() {
260      return None;
261    }
262    let g = self.graphemes[self.cursor];
263    if graphemes::is_numeric(g) {
264      self.cursor += 1;
265      self.location.col += graphemes::width(g);
266      Some(g.to_string())
267    } else {
268      None
269    }
270  }
271
272  /// Get cursor's location in source code
273  fn loc(&self) -> SourceLocation {
274    self.location
275  }
276
277  /// Test whether the grapheme pointed by cursor is the last grapheme
278  fn is_last_grapheme(&self, c: usize) -> bool {
279    (self.graphemes.len() - 1 - c) == 0
280  }
281
282  /// Get remaining (unparsed) length
283  pub fn len(&self) -> usize {
284    self.graphemes.len() - self.cursor
285  }
286  
287  pub fn is_empty(&self) -> bool {
288    self.len() == 0
289  }
290
291  /// For debug purpose
292  fn output(&self) {
293              
294    println!("───────────────────{}", self.len());
295    for i in self.cursor..self.graphemes.len() {
296      print!("{}", self.graphemes[i]);
297    }
298    println!();
299    println!("───────────────────");
300  }
301}
302
303/// Required by nom
304impl<'a> nom::InputLength for ParseString<'a> {
305  fn input_len(&self) -> usize {
306    self.len()
307  }
308}
309
/// The part of an error's context that is independent of its cause location.
#[derive(Clone, Debug)]
pub struct ParseErrorDetail {
  /// Static description of the error
  pub message: &'static str,
  /// Additional source ranges attached to the error as annotations
  pub annotation_rngs: Vec<SourceRange>,
}
316
/// The error type for the nom parser, which handles full error context
/// (location + detail) and ownership of the input ParseString.
///
/// Eventually error context will be logged and ownership will be moved out.
#[derive(Clone, Debug)]
pub struct ParseError<'a> {
  /// Cause range is defined as [start, end), where `start` points at the first
  /// character that's caught by a label, and `end` points one past the
  /// character that didn't match.
  ///
  /// Example:
  ///   index:  1234567
  ///   input:  abcdefg
  ///   error:   ~~~^
  ///   range:   |   |
  ///           [2,  5)
  ///
  pub cause_range: SourceRange,
  /// Hold ownership of the input ParseString
  pub remaining_input: ParseString<'a>,
  /// Detailed information about this error
  pub error_detail: ParseErrorDetail,
}
340
341impl<'a> ParseError<'a> {
342  /// Create a new error at current location of the input, with given message
343  /// and empty annotations.  Ownership of the input is also passed into this
344  /// error object.
345  pub fn new(input: ParseString<'a>, msg: &'static str) -> Self {
346    let start = input.loc();
347    let mut end = start;
348    end.col += 1;
349    ParseError {
350      cause_range: SourceRange { start, end },
351      remaining_input: input,
352      error_detail: ParseErrorDetail {
353        message: msg,
354        annotation_rngs: vec![],
355      }
356    }
357  }
358
359  /// Add self to the error log of input string.
360  fn log(&mut self) {
361    self.remaining_input.error_log.push((self.cause_range, self.error_detail.clone()));
362  }
363}
364
365/// Required by nom
366impl<'a> nom::error::ParseError<ParseString<'a>> for ParseError<'a> {
367  /// Not used, unless we have logical error
368  fn from_error_kind(input: ParseString<'a>,
369                      _kind: nom::error::ErrorKind) -> Self {
370    ParseError::new(input, "Unexpected error")
371  }
372
373  /// Probably not used
374  fn append(_input: ParseString<'a>,
375            _kind: nom::error::ErrorKind,
376            other: Self) -> Self {
377    other
378  }
379
380  /// Barely used, but we do want to keep the error with larger depth.
381  fn or(self, other: Self) -> Self {
382    let self_start = self.cause_range.start;
383    let other_start = other.cause_range.start;
384    if self_start > other_start {
385      self
386    } else {
387      other
388    }
389  }
390}
391
/// This struct is responsible for analysing text, interpreting indices
/// and ranges, and producing formatted messages.
pub struct TextFormatter<'a> {
  /// Graphemes of the source text, as produced by `graphemes::init_source`
  graphemes: Vec<&'a str>,
  /// Index into `graphemes` of the first grapheme of each line
  /// (entry `i` describes line number `i + 1`)
  line_beginnings: Vec<usize>,
  /// Number of graphemes, i.e. one past the last valid index
  end_index: usize,
}
399
400impl<'a> TextFormatter<'a> {
401  pub fn new(text: &'a str) -> Self {
402    let graphemes = graphemes::init_source(text);
403    let mut line_beginnings = vec![0];
404    for i in 0..graphemes.len() {
405      if graphemes::is_new_line(graphemes[i]) {
406        line_beginnings.push(i + 1);
407      }
408    }
409    line_beginnings.pop();
410    TextFormatter {
411      end_index: graphemes.len(),
412      graphemes,
413      line_beginnings,
414    }
415  }
416
417  // Index interpreter
418
419  fn get_line_range(&self, linenum: usize) -> Option<(usize, usize)> {
420    let line_index = linenum - 1;
421    if line_index >= self.line_beginnings.len() {
422      return None;
423    }
424    if linenum == self.line_beginnings.len() {  // asking for the last line
425      return Some((self.line_beginnings[line_index], self.end_index));
426    }
427    Some((self.line_beginnings[line_index], self.line_beginnings[linenum]))
428  }
429
430  fn get_text_by_linenum(&self, linenum: usize) -> String {
431    let (start, end) = match self.get_line_range(linenum) {
432      Some(v) => v,
433      None => return "\n".to_string(),
434    };
435    let mut s = self.graphemes[start..end].iter().map(|s| *s).collect::<String>();
436    if !s.ends_with("\n") {
437      s.push('\n');
438    }
439    s
440  }
441
442  fn get_textlen_by_linenum(&self, linenum: usize) -> usize {
443    let (start, end) = match self.get_line_range(linenum) {
444      Some(v) => v,
445      None => return 1,
446    };
447    let mut len = 0;
448    for i in start..end {
449      len += graphemes::width(self.graphemes[i]);
450    }
451    len + 1
452  }
453
454  // FormattedString printer
455
456  fn heading_color(s: &str) -> String {
457    s.truecolor(246, 192, 78).bold().to_string()
458  }
459
460  fn location_color(s: &str) -> String {
461    s.truecolor(0,187,204).bold().to_string()
462  }
463
464  fn linenum_color(s: &str) -> String {
465    s.truecolor(0,187,204).bold().to_string()
466  }
467
468  fn text_color(s: &str) -> String {
469    s.to_string()
470  }
471
472  fn annotation_color(s: &str) -> String {
473    s.truecolor(102,51,153).bold().to_string()
474  }
475
476  fn error_color(s: &str) -> String {
477    s.truecolor(170,51,85).bold().to_string()
478  }
479
480  fn ending_color(s: &str) -> String {
481    s.truecolor(246, 192, 78).bold().to_string()
482  }
483
484  fn err_heading(index: usize) -> String {
485    let n = index + 1;
486    let d = "────────────────────────";
487    let s = format!("{} syntax error #{} {}\n", d, n, d);
488    Self::heading_color(&s)
489  }
490
491  fn err_location(&self, ctx: &ParserErrorContext) -> String {
492    let err_end = ctx.cause_rng.end;
493    // error range will not ends at first column, so `minus 1` here is safe
494    let (row, col) = (err_end.row, err_end.col - 1);
495    let s = format!("@location:{}:{}\n", row, col);
496    Self::location_color(&s)
497  }
498
499  fn err_context(&self, ctx: &ParserErrorContext) -> String {
500    let mut result = String::new();
501
502    let mut annotation_rngs = ctx.annotation_rngs.clone();
503    annotation_rngs.push(ctx.cause_rng);
504
505    // the lines to print (1-indexed)
506    let mut lines_to_print: Vec<usize> = vec![];
507    for rng in &annotation_rngs {
508      let r1 = rng.start.row;
509      // if range ends at first column, it doesn't reach that row
510      let r2 = if rng.end.col == 1 {
511        usize::max(rng.start.row, rng.end.row - 1)
512      } else {
513        rng.end.row
514      };
515      for i in r1..=r2 {
516        lines_to_print.push(i);
517      }
518    }
519    lines_to_print.sort();
520    lines_to_print.dedup();
521
522    // the annotations on each line
523    // <linenum, Vec<(start_col, rng_len, is_major, is_cause)>>
524    let mut range_table: HashMap<usize, Vec<(usize, usize, bool, bool)>> = HashMap::new();
525    for linenum in &lines_to_print {
526      range_table.insert(*linenum, vec![]);
527    }
528    let n = annotation_rngs.len() - 1;  // if i == n, it's the last rng, i.e. the cause rng
529    for (i, rng) in annotation_rngs.iter().enumerate() {
530      // c2 might be 0
531      let (r1, c1) = (rng.start.row, rng.start.col);
532      let (r2, c2) = (rng.end.row, rng.end.col - 1);
533      if r1 == r2 {  // the entire range is on one line
534        if c2 >= c1 {  // and the range has non-zero length
535          range_table.get_mut(&r1).unwrap().push((c1, c2 - c1 + 1, true, i == n));
536        }
537      } else {  // the range spans over multiple lines
538        range_table.get_mut(&r1).unwrap().push((c1, usize::MAX, i != n, i == n));
539        for r in r1+1..r2 {
540          range_table.get_mut(&r).unwrap().push((1, usize::MAX, false, i == n));
541        }
542        if c2 != 0 {  // only add the last line if it hfnas non-zero length
543          range_table.get_mut(&r2).unwrap().push((1, c2, i == n, i == n));
544        }
545      }
546    }
547
548    // other data for printing
549    let dots = "...";
550    let indentation = " ";
551    let vert_split1 = " │";
552    let vert_split2 = "  ";
553    let arrow = "^";
554    let tilde = "~";
555    let lines_str: Vec<String> = lines_to_print.iter().map(|i| i.to_string()).collect();
556    let row_str_len = usize::max(lines_str.last().unwrap().len(), dots.len());
557
558    // print source code
559    for i in 0..lines_to_print.len() {
560      // [... | ]
561      if i != 0 && (lines_to_print[i] - lines_to_print[i-1] != 1) {
562        result.push_str(indentation);
563        for _ in 3..row_str_len { result.push(' '); }
564        result.push_str(&Self::linenum_color(dots));
565        result.push_str(&Self::linenum_color(vert_split1));
566        result.push('\n');
567      }
568
569      // [    | ]
570      result.push_str(indentation);
571      for _ in 0..row_str_len { result.push(' '); }
572      result.push_str(&Self::linenum_color(vert_split1));
573      result.push('\n');
574
575      // [row |  program text...]
576      let text = self.get_text_by_linenum(lines_to_print[i]);
577      result.push_str(indentation);
578      for _ in 0..row_str_len-lines_str[i].len() { result.push(' '); }
579      result.push_str(&Self::linenum_color(&lines_str[i]));
580      result.push_str(&Self::linenum_color(vert_split1));
581      result.push_str(&Self::text_color(&text));
582
583      // [    |    ^~~~]
584      result.push_str(indentation);
585      for _ in 0..row_str_len { result.push(' '); }
586      result.push_str(&Self::linenum_color(vert_split1));
587      let mut curr_col = 1;
588      let line_len = self.get_textlen_by_linenum(lines_to_print[i]);
589      let rngs = range_table.get(&lines_to_print[i]).unwrap();
590      for (start, len, major, cause) in rngs {
591        let max_len = usize::max(1, usize::min(*len, line_len - curr_col + 1));
592        for _ in curr_col..*start { result.push(' '); }
593        if *cause {
594          for _ in 0..max_len-1 {
595            result.push_str(&Self::error_color(tilde));
596          }
597          if *major {
598            result.push_str(&Self::error_color(arrow));
599          } else {
600            result.push_str(&Self::error_color(tilde));
601          }
602        } else {
603          if *major {
604            result.push_str(&Self::annotation_color(arrow));
605          } else {
606            result.push_str(&Self::annotation_color(tilde));
607          }
608          for _ in 0..max_len-1 {
609            result.push_str(&Self::annotation_color(tilde));
610          }
611        }
612        curr_col = start + max_len;
613      }
614      result.push('\n');
615    }
616
617    // print error message;
618    // error range never ends at first column, so it's safe to `minus 1` here
619    let cause_col = ctx.cause_rng.end.col - 1;
620    result.push_str(indentation);
621    for _ in 0..row_str_len { result.push(' '); }
622    result.push_str(vert_split2);
623    for _ in 0..cause_col-1 { result.push(' '); }
624    result.push_str(&Self::error_color(&ctx.err_message));
625    result.push('\n');
626
627    result
628  }
629
630  fn err_ending(d: usize) -> String {
631    let s = format!("... and {} other error{} not shown\n", d, if d == 1 {""} else {"s"});
632    Self::heading_color(&s)
633  }
634
635  /// Get formatted error message.
636  pub fn format_error(&self, errors: &ParserErrorReport) -> String {
637    let n = usize::min(errors.len(), 10);
638    let mut result = String::new();
639    result.push('\n');
640    for i in 0..n {
641      let ctx = &errors[i];
642      result.push_str(&Self::err_heading(i));
643      result.push_str(&self.err_location(ctx));
644      result.push_str(&self.err_context(ctx));
645      result.push_str("\n\n");
646    }
647    let d = errors.len() - n;
648    if d != 0 {
649      result.push_str(&Self::err_ending(d));
650    }
651    result
652  }
653}