1use lazy_static::lazy_static;
10use pest::iterators::Pair;
11use pest::pratt_parser::{Op, PrattParser};
12use pest::Span;
13use std::borrow::Cow;
14use std::fmt::Display;
15use thiserror::Error;
16pub mod lexical;
17pub mod syntax;
18pub mod unicode;
19
20#[derive(thiserror::Error, Debug, Clone)]
21pub enum Error<'a> {
22 #[error("internal logic error: {0}")]
23 InternalLogicalError(Cow<'a, str>),
24 #[error("multiple definition for {0}")]
25 MultipleDefinition(&'a str, pest::Span<'a>),
26 #[error("lexical reference {0} is not allowed within lexical definitions")]
27 InvalidLexicalReference(&'a str),
28 #[error("multiple skip rule detected, previous definition is {0}")]
29 MultipleSkippingRule(&'a str),
30 #[error("nullable token {0} is prohibited")]
31 NullableToken(&'a str),
32 #[error("lexical {0} is undefined")]
33 UndefinedLexicalReference(&'a str),
34 #[error("parser rule {0} is undefined")]
35 UndefinedParserRuleReference(&'a str),
36}
37
/// Builds a one-element `Vec` containing a single spanned error.
///
/// Expands to `vec![WithSpan { span, node: Error::$ekind(...) }]`. The
/// caller must have `WithSpan` and `Error` in scope (the expansion uses the
/// bare names), and every parameter must be followed by a trailing comma.
#[macro_export]
macro_rules! span_errors {
    ($ekind:ident, $span:expr, $($params:expr,)*) => {
        vec![WithSpan {
            span: $span,
            node: Error::$ekind ($($params,)*)
        }]
    };
}
47
/// Constructs a [`GrammarDefinitionError::UnexpectedEOI`] describing what
/// was expected when the pair stream ran out.
macro_rules! unexpected_eoi {
    // Plain message: `unexpected_eoi!("lexer")`.
    ($expectation:literal) => {
        $crate::frontend::GrammarDefinitionError::UnexpectedEOI(
            $expectation.into(),
        )
    };
    // Formatted message: `unexpected_eoi!("rule {}", name)`.
    ($format:literal, $($arg:tt)+) => {
        $crate::frontend::GrammarDefinitionError::UnexpectedEOI(
            format!($format, $($arg)+).into(),
        )
    };
}
/// Constructs a [`GrammarDefinitionError::ParserLogicError`] — an internal
/// invariant violation in the lowering code, not a user mistake.
macro_rules! parser_logical_error {
    // Any expression convertible into `Cow<str>`.
    ($expectation:expr) => {
        $crate::frontend::GrammarDefinitionError::ParserLogicError(
            $expectation.into(),
        )
    };
    // Formatted message.
    ($format:literal, $($arg:tt)+) => {
        $crate::frontend::GrammarDefinitionError::ParserLogicError(
            format!($format, $($arg)+).into(),
        )
    };
}
72
/// Constructs a [`GrammarDefinitionError::FormatError`] attached to a span —
/// used when a literal (string/char/range) fails to unescape or parse.
macro_rules! format_error {
    // Plain message at `span`.
    ($span:expr, $message:expr) => {
        $crate::frontend::GrammarDefinitionError::FormatError {
            span: $span,
            message: $message.into(),
        }
    };
    // Formatted message at `span`.
    ($span:expr, $format:literal, $($arg:tt)+) => {
        $crate::frontend::GrammarDefinitionError::FormatError {
            span: $span,
            message: format!($format, $($arg)+),
        }
    };
}
87
/// Wrapper module for the pest-generated parser. The derive also generates
/// the `Rule` enum used throughout this file; both are re-exported below.
mod grammar {
    use pest_derive::Parser;

    /// Generated by `pest_derive` from the grammar file; the path is
    /// presumably resolved relative to the crate root — see pest_derive docs.
    #[derive(Parser)]
    #[grammar = "src/frontend/grammar.pest"]
    pub struct Parser;
}
95
96use crate::utilities::unreachable_branch;
97pub use grammar::Parser as GrammarParser;
98pub use grammar::Rule;
99
/// Errors produced while parsing the grammar definition text itself
/// (as opposed to the semantic validation errors in [`Error`]).
#[derive(Debug, Error, Clone)]
pub enum GrammarDefinitionError<'a> {
    /// Raw pest parse failure. Boxed so this variant — and every `Result`
    /// carrying it — stays small.
    #[error("grammar definition error: {0}")]
    SyntaxError(#[from] Box<pest::error::Error<Rule>>),
    /// A literal at `span` could not be converted (e.g. a bad escape).
    #[error("failed to parse {}: {message}", span.as_str())]
    FormatError { span: Span<'a>, message: String },
    /// Internal invariant violation in the lowering code.
    #[error("{0}")]
    ParserLogicError(Cow<'a, str>),
    /// The pair stream ended while a sub-rule was still expected.
    #[error("unexpected end of input, expecting {0}")]
    UnexpectedEOI(Cow<'a, str>),
}
111
lazy_static! {
    /// Pratt parser shared by every `parse_surface_syntax` invocation.
    ///
    /// Precedence levels are registered from loosest to tightest binding:
    /// alternatives bind loosest, then sequencing, then the unary
    /// postfix/prefix operators. Lexical and parser operators form two
    /// independent ladders with the same relative ordering.
    static ref PRATT_PARSER: PrattParser<Rule> = {
        use pest::pratt_parser::Assoc::*;

        PrattParser::new()
            .op(Op::infix(Rule::lexical_alternative, Left))
            .op(Op::infix(Rule::lexical_sequence, Left))
            .op(Op::postfix(Rule::lexical_star)
                | Op::postfix(Rule::lexical_plus)
                | Op::postfix(Rule::lexical_optional)
                | Op::prefix(Rule::lexical_not))
            .op(Op::infix(Rule::parser_alternative, Left))
            .op(Op::infix(Rule::parser_sequence, Left))
            .op(Op::postfix(Rule::parser_star)
                | Op::postfix(Rule::parser_plus)
                | Op::postfix(Rule::parser_optional))
    };
}
130
131fn unescape_qouted(string: &str) -> Option<String> {
132 unescape(&string[1..string.len() - 1])
133}
134
135fn unescape(string: &str) -> Option<String> {
137 let mut result = String::new();
138 let mut chars = string.chars();
139
140 loop {
141 match chars.next() {
142 Some('\\') => match chars.next()? {
143 '"' => result.push('"'),
144 '\\' => result.push('\\'),
145 'r' => result.push('\r'),
146 'n' => result.push('\n'),
147 't' => result.push('\t'),
148 '0' => result.push('\0'),
149 '\'' => result.push('\''),
150 'x' => {
151 let string: String = chars.clone().take(2).collect();
152
153 if string.len() != 2 {
154 return None;
155 }
156
157 for _ in 0..string.len() {
158 chars.next()?;
159 }
160
161 let value = u8::from_str_radix(&string, 16).ok()?;
162
163 result.push(char::from(value));
164 }
165 'u' => {
166 if chars.next()? != '{' {
167 return None;
168 }
169
170 let string: String = chars.clone().take_while(|c| *c != '}').collect();
171
172 if string.len() < 2 || 6 < string.len() {
173 return None;
174 }
175
176 for _ in 0..string.len() + 1 {
177 chars.next()?;
178 }
179
180 let value = u32::from_str_radix(&string, 16).ok()?;
181
182 result.push(char::from_u32(value)?);
183 }
184 _ => return None,
185 },
186 Some(c) => result.push(c),
187 None => return Some(result),
188 };
189 }
190}
191
/// A syntax node paired with the source span it was parsed from.
#[derive(Debug, Clone)]
pub struct WithSpan<'a, T> {
    /// Location of `node` in the original input.
    pub span: Span<'a>,
    /// The wrapped payload.
    pub node: T,
}

/// Display delegates to the wrapped node; the span is intentionally omitted.
impl<'a, T: Display> Display for WithSpan<'a, T> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        self.node.fmt(f)
    }
}
/// Boxed spanned node, used for recursive tree children.
pub type SpanBox<'a, T> = Box<WithSpan<'a, T>>;
204
/// Untyped surface AST produced directly from the pest parse tree, before
/// any semantic analysis. Rule/token names are stored as spans only
/// (`WithSpan<'a, ()>`); their text is recovered from the span when needed.
#[derive(Debug)]
pub enum SurfaceSyntaxTree<'a> {
    /// Whole grammar: a lexer section plus a parser section.
    Grammar {
        lexer: SpanBox<'a, Self>,
        parser: SpanBox<'a, Self>,
    },
    /// Parser section: the entrypoint rule name and the rule list.
    Parser {
        entrypoint: WithSpan<'a, ()>,
        rules: Vec<WithSpan<'a, Self>>,
    },
    /// Lexer section: lexical definitions and token rules.
    Lexer {
        rules: Vec<WithSpan<'a, Self>>,
    },
    /// `lhs | rhs` in a lexical expression.
    LexicalAlternative {
        lhs: SpanBox<'a, Self>,
        rhs: SpanBox<'a, Self>,
    },
    /// `lhs` followed by `rhs` in a lexical expression.
    LexicalSequence {
        lhs: SpanBox<'a, Self>,
        rhs: SpanBox<'a, Self>,
    },
    /// Zero-or-more repetition (`*`) of a lexical expression.
    LexicalStar {
        inner: SpanBox<'a, Self>,
    },
    /// One-or-more repetition (`+`) of a lexical expression.
    LexicalPlus {
        inner: SpanBox<'a, Self>,
    },
    /// Optional (`?`) lexical expression.
    LexicalOptional {
        inner: SpanBox<'a, Self>,
    },
    /// Negation (prefix `!`) of a lexical expression.
    LexicalNot {
        inner: SpanBox<'a, Self>,
    },
    /// `lhs | rhs` in a parser expression.
    ParserAlternative {
        lhs: SpanBox<'a, Self>,
        rhs: SpanBox<'a, Self>,
    },
    /// `lhs` followed by `rhs` in a parser expression.
    ParserSequence {
        lhs: SpanBox<'a, Self>,
        rhs: SpanBox<'a, Self>,
    },
    /// Zero-or-more repetition of a parser expression.
    ParserStar {
        inner: SpanBox<'a, Self>,
    },
    /// One-or-more repetition of a parser expression.
    ParserPlus {
        inner: SpanBox<'a, Self>,
    },
    /// Optional parser expression.
    ParserOptional {
        inner: SpanBox<'a, Self>,
    },
    /// A named lexical definition (`name = expr`), referenceable by tokens.
    LexicalDefinition {
        name: WithSpan<'a, ()>,
        expr: SpanBox<'a, Self>,
    },
    /// A token rule; `active` distinguishes active from silent tokens.
    LexicalToken {
        active: bool,
        name: WithSpan<'a, ()>,
        expr: SpanBox<'a, Self>,
    },
    /// An inclusive character range literal (`'a'..'z'`-style).
    Range {
        start: char,
        end: char,
    },
    /// A string literal, already unescaped.
    String(String),
    /// The failing expression — presumably matches nothing; from `Rule::bottom`.
    Bottom,
    /// The empty expression — presumably matches the empty string; from `Rule::empty`.
    Empty,
    /// A single character literal, already unescaped.
    Char {
        value: WithSpan<'a, char>,
    },
    /// A named parser rule definition; `active` mirrors LexicalToken.
    ParserDefinition {
        active: bool,
        name: WithSpan<'a, ()>,
        expr: SpanBox<'a, Self>,
    },
    /// A fixpoint (self-recursive) parser rule definition.
    ParserFixpoint {
        active: bool,
        name: WithSpan<'a, ()>,
        expr: SpanBox<'a, Self>,
    },
    /// Reference to a parser rule by name.
    ParserRuleRef {
        name: WithSpan<'a, ()>,
    },
    /// Reference to a lexical rule/token by name.
    LexicalRuleRef {
        name: WithSpan<'a, ()>,
    },
}
291
292fn parse_surface_syntax<'a, I: Iterator<Item = Pair<'a, Rule>>>(
293 pairs: I,
294 pratt: &PrattParser<Rule>,
295 src: &'a str,
296) -> Result<WithSpan<'a, SurfaceSyntaxTree<'a>>, GrammarDefinitionError<'a>> {
297 pratt
298 .map_primary(|primary| {
299 let span = primary.as_span();
300 match primary.as_rule() {
301 Rule::grammar => {
302 let mut grammar = primary.into_inner();
303 let lexer = grammar.next().ok_or_else(|| unexpected_eoi!("lexer"))?;
304 let parser = grammar.next().ok_or_else(|| unexpected_eoi!("parser"))?;
305 let lexer = parse_surface_syntax([lexer].into_iter(), pratt, src)?;
306 let parser = parse_surface_syntax([parser].into_iter(), pratt, src)?;
307 Ok(WithSpan {
308 span,
309 node: SurfaceSyntaxTree::Grammar {
310 lexer: Box::new(lexer),
311 parser: Box::new(parser),
312 },
313 })
314 }
315 Rule::lexer_def => {
316 let lexer_rules = primary
317 .into_inner()
318 .next()
319 .ok_or_else(|| unexpected_eoi!("lexer rules"))?;
320 let rules = lexer_rules.into_inner().fold(Ok(Vec::new()), |acc, rule| {
321 acc.and_then(|vec| {
322 parse_surface_syntax([rule].into_iter(), pratt, src).map(|rule| {
323 let mut vec = vec;
324 vec.push(rule);
325 vec
326 })
327 })
328 })?;
329 Ok(WithSpan {
330 span,
331 node: SurfaceSyntaxTree::Lexer { rules },
332 })
333 }
334 Rule::lexical_definition => {
335 let mut definition = primary.into_inner();
336 let name = definition
337 .next()
338 .ok_or_else(|| unexpected_eoi!("name for lexical definition"))?;
339 let expr = definition
340 .next()
341 .ok_or_else(|| unexpected_eoi!("expr for lexical definition"))?;
342 let name = WithSpan {
343 span: name.as_span(),
344 node: (),
345 };
346 let expr = parse_surface_syntax(expr.into_inner(), pratt, src)?;
347 Ok(WithSpan {
348 span,
349 node: SurfaceSyntaxTree::LexicalDefinition {
350 name,
351 expr: Box::new(expr),
352 },
353 })
354 }
355 Rule::range => {
356 let mut primary = primary.into_inner();
357 let start = primary
358 .next()
359 .ok_or_else(|| unexpected_eoi!("start character for range"))?;
360 let end = primary
361 .next()
362 .ok_or_else(|| unexpected_eoi!("end character for range"))?;
363 let start = unescape_qouted(start.as_str())
364 .ok_or_else(|| format_error!(span, "failed to unescape character"))?
365 .parse()
366 .map_err(|e| format_error!(span, "{}", e))?;
367 let end = unescape_qouted(end.as_str())
368 .ok_or_else(|| format_error!(span, "failed to unescape character"))?
369 .parse()
370 .map_err(|e| format_error!(span, "{}", e))?;
371 Ok(WithSpan {
372 span,
373 node: SurfaceSyntaxTree::Range { start, end },
374 })
375 }
376 Rule::string => {
377 let value = unescape_qouted(primary.as_str())
378 .ok_or_else(|| format_error!(span, "failed to unescape string"))?;
379 Ok(WithSpan {
380 span,
381 node: SurfaceSyntaxTree::String(value),
382 })
383 }
384 Rule::lexical_expr => parse_surface_syntax(primary.into_inner(), pratt, src),
385 Rule::active_token | Rule::silent_token => {
386 let active = matches!(primary.as_rule(), Rule::active_token);
387 let mut token = primary.into_inner();
388 let name = token
389 .next()
390 .ok_or_else(|| unexpected_eoi!("name for token rule"))?;
391 let expr = token
392 .next()
393 .ok_or_else(|| unexpected_eoi!("expr for token rule"))?;
394 let name = WithSpan {
395 span: name.as_span(),
396 node: (),
397 };
398 let expr = parse_surface_syntax(expr.into_inner(), pratt, src)?;
399 Ok(WithSpan {
400 span,
401 node: SurfaceSyntaxTree::LexicalToken {
402 active,
403 name,
404 expr: Box::new(expr),
405 },
406 })
407 }
408 Rule::character => {
409 let character = unescape_qouted(primary.as_str())
410 .ok_or_else(|| format_error!(span, "failed to unescape character"))?
411 .parse()
412 .map_err(|e| format_error!(span, "{}", e))?;
413 Ok(WithSpan {
414 span,
415 node: SurfaceSyntaxTree::Char {
416 value: WithSpan {
417 span,
418 node: character,
419 },
420 },
421 })
422 }
423 Rule::token_id => {
424 Ok(WithSpan {
426 span,
427 node: SurfaceSyntaxTree::LexicalRuleRef {
428 name: WithSpan { span, node: () },
429 },
430 })
431 }
432 Rule::parser_id => {
433 Ok(WithSpan {
435 span,
436 node: SurfaceSyntaxTree::ParserRuleRef {
437 name: WithSpan { span, node: () },
438 },
439 })
440 }
441 Rule::bottom => {
442 Ok(WithSpan {
444 span,
445 node: SurfaceSyntaxTree::Bottom,
446 })
447 }
448 Rule::empty => {
449 Ok(WithSpan {
451 span,
452 node: SurfaceSyntaxTree::Empty,
453 })
454 }
455 Rule::parser_def => {
456 let mut parser_rules = primary.into_inner();
457 let entrypoint = parser_rules
458 .next()
459 .ok_or_else(|| unexpected_eoi!("entrypoint for parser"))?;
460 let entrypoint = WithSpan {
461 span: entrypoint.as_span(),
462 node: (),
463 };
464 let parser_rules = parser_rules
465 .next()
466 .ok_or_else(|| unexpected_eoi!("parser rules"))?;
467 let rules = parser_rules
468 .into_inner()
469 .fold(Ok(Vec::new()), |acc, rule| {
470 acc.and_then(|vec| {
471 parse_surface_syntax([rule].into_iter(), pratt, src).map(|rule| {
472 let mut vec = vec;
473 vec.push(rule);
474 vec
475 })
476 })
477 })?;
478 Ok(WithSpan {
479 span,
480 node: SurfaceSyntaxTree::Parser { entrypoint, rules },
481 })
482 }
483 Rule::active_parser_definition | Rule::silent_parser_definition => {
484 let active = matches!(primary.as_rule(), Rule::active_parser_definition);
485 let mut definition = primary.into_inner();
486 let name = definition
487 .next()
488 .ok_or_else(|| unexpected_eoi!("name for token rule"))?;
489 let expr = definition
490 .next()
491 .ok_or_else(|| unexpected_eoi!("expr for token rule"))?;
492 let name = WithSpan {
493 span: name.as_span(),
494 node: (),
495 };
496 let expr = parse_surface_syntax(expr.into_inner(), pratt, src)?;
497 Ok(WithSpan {
498 span,
499 node: SurfaceSyntaxTree::ParserDefinition {
500 active,
501 name,
502 expr: Box::new(expr),
503 },
504 })
505 }
506 Rule::active_parser_fixpoint | Rule::silent_parser_fixpoint => {
507 let active = matches!(primary.as_rule(), Rule::active_parser_fixpoint);
508 let mut fixpoint = primary.into_inner();
509 let name = fixpoint
510 .next()
511 .ok_or_else(|| unexpected_eoi!("name for token rule"))?;
512 let expr = fixpoint
513 .next()
514 .ok_or_else(|| unexpected_eoi!("expr for token rule"))?;
515 let name = WithSpan {
516 span: name.as_span(),
517 node: (),
518 };
519 let expr = parse_surface_syntax(expr.into_inner(), pratt, src)?;
520 Ok(WithSpan {
521 span,
522 node: SurfaceSyntaxTree::ParserFixpoint {
523 active,
524 name,
525 expr: Box::new(expr),
526 },
527 })
528 }
529 Rule::parser_expr => parse_surface_syntax(primary.into_inner(), pratt, src),
530 _ => {
531 unreachable_branch!("undefined primary rule: {:?}", primary.as_rule())
532 }
533 }
534 })
535 .map_prefix(|op, operand| {
536 let operand = operand?;
537 match op.as_rule() {
538 Rule::lexical_not => {
539 let total_span = Span::new(src, op.as_span().start(), operand.span.end())
540 .ok_or_else(|| parser_logical_error!("invalid span"))?;
541 Ok(WithSpan {
542 span: total_span,
543 node: SurfaceSyntaxTree::LexicalNot {
544 inner: Box::new(operand),
545 },
546 })
547 }
548 _ => unreachable_branch!("only lexical not is supported as a prefix operator"),
549 }
550 })
551 .map_infix(|lhs, op, rhs| {
552 let lhs = lhs?;
553 let rhs = rhs?;
554 let total_span = Span::new(src, lhs.span.start(), rhs.span.end())
555 .ok_or_else(|| parser_logical_error!("invalid span"))?;
556 match op.as_rule() {
557 Rule::lexical_alternative => Ok(WithSpan {
558 span: total_span,
559 node: SurfaceSyntaxTree::LexicalAlternative {
560 lhs: Box::new(lhs),
561 rhs: Box::new(rhs),
562 },
563 }),
564 Rule::lexical_sequence => Ok(WithSpan {
565 span: total_span,
566 node: SurfaceSyntaxTree::LexicalSequence {
567 lhs: Box::new(lhs),
568 rhs: Box::new(rhs),
569 },
570 }),
571 Rule::parser_alternative => Ok(WithSpan {
572 span: total_span,
573 node: SurfaceSyntaxTree::ParserAlternative {
574 lhs: Box::new(lhs),
575 rhs: Box::new(rhs),
576 },
577 }),
578 Rule::parser_sequence => Ok(WithSpan {
579 span: total_span,
580 node: SurfaceSyntaxTree::ParserSequence {
581 lhs: Box::new(lhs),
582 rhs: Box::new(rhs),
583 },
584 }),
585
586 _ => unreachable_branch!("Operator {} is not an infix operator", op.as_str()),
587 }
588 })
589 .map_postfix(|expr, op| {
590 let expr = expr?;
591 let op_span = op.as_span();
592 let total_span = Span::new(src, expr.span.start(), op_span.end())
593 .ok_or_else(|| unexpected_eoi!("invalid span"))?;
594 match op.as_rule() {
595 Rule::lexical_plus => Ok(WithSpan {
596 span: total_span,
597 node: SurfaceSyntaxTree::LexicalPlus {
598 inner: Box::new(expr),
599 },
600 }),
601 Rule::lexical_star => Ok(WithSpan {
602 span: total_span,
603 node: SurfaceSyntaxTree::LexicalStar {
604 inner: Box::new(expr),
605 },
606 }),
607 Rule::lexical_optional => Ok(WithSpan {
608 span: total_span,
609 node: SurfaceSyntaxTree::LexicalOptional {
610 inner: Box::new(expr),
611 },
612 }),
613 Rule::parser_plus => Ok(WithSpan {
614 span: total_span,
615 node: SurfaceSyntaxTree::ParserPlus {
616 inner: Box::new(expr),
617 },
618 }),
619 Rule::parser_star => Ok(WithSpan {
620 span: total_span,
621 node: SurfaceSyntaxTree::ParserStar {
622 inner: Box::new(expr),
623 },
624 }),
625 Rule::parser_optional => Ok(WithSpan {
626 span: total_span,
627 node: SurfaceSyntaxTree::ParserOptional {
628 inner: Box::new(expr),
629 },
630 }),
631 _ => unreachable_branch!("Operator {} is not a postfix operator", op.as_str()),
632 }
633 })
634 .parse(pairs)
635}
636
637pub fn parse(input: &str) -> Result<WithSpan<SurfaceSyntaxTree>, crate::Error> {
638 match <GrammarParser as pest::Parser<Rule>>::parse(Rule::grammar, input) {
639 Ok(pairs) => parse_surface_syntax(pairs, &PRATT_PARSER, input)
640 .map_err(crate::Error::GrammarDefinitionError),
641 Err(e) => Err(crate::Error::GrammarDefinitionError(
642 GrammarDefinitionError::SyntaxError(Box::new(e)),
643 )),
644 }
645}
646
#[cfg(test)]
mod test {
    use std::mem::size_of;

    use ariadne::Source;
    use pest::Parser;
    use typed_arena::Arena;

    use crate::{
        core_syntax::TermArena,
        frontend::lexical::LexerDatabase,
        fusion::fusion_parser,
        nf::{
            fully_normalize, merge_inactive_rules, remove_unreachable_rules, semi_normalize,
            NormalForm, NormalForms, Tag, TagAssigner,
        },
    };

    use super::{syntax::construct_parser, *};

    // Fixture grammar compiled into the test binary.
    const TEST: &str = include_str!("example.pag");

    /// End-to-end smoke test: parse the example grammar, build the lexer
    /// database and parser, type-check, then run the full normalization
    /// pipeline and fuse the result. Intermediate sizes/forms are printed
    /// for manual inspection; the only hard assertion is that type checking
    /// produces no reports.
    #[test]
    fn it_parses_lexical_expr() {
        // Total number of normal-form entries, used to observe shrinkage
        // across pipeline stages.
        let nfs_size = |nfs: &NormalForms| nfs.entries.values().map(|v| v.len()).sum::<usize>();

        dbg!(size_of::<NormalForm>());
        // Stage 1: pest parse + surface-syntax lowering.
        let pairs = GrammarParser::parse(Rule::grammar, TEST).unwrap();
        let tree = parse_surface_syntax(pairs, &PRATT_PARSER, TEST).unwrap();
        let SurfaceSyntaxTree::Grammar { lexer, parser } = &tree.node else { unreachable!() };

        // Stage 2: lexer database construction.
        let database = LexerDatabase::new(lexer).unwrap();
        for (i, rule) in database.entries.iter() {
            println!("{i} ::= {}, active = {}", rule.rule, rule.active)
        }
        println!("----");

        // Stage 3: parser construction and type checking.
        let arena = TermArena::new();
        let parser = construct_parser(&arena, database, parser).unwrap();
        for (i, rule) in parser.bindings.iter() {
            println!("{i} ::= {}, active = {}", rule.term, rule.active)
        }
        let errs = parser.type_check();

        // Render any type-check reports before asserting, so failures are
        // actually readable in the test output.
        let liberr = crate::Error::from(errs);
        let reports = liberr.to_reports("example.pag");
        let mut src = ("example.pag", Source::from(TEST));
        for i in reports.iter() {
            i.eprint(&mut src).unwrap();
        }
        assert!(reports.is_empty());
        println!("----");

        // Stage 4: normalization pipeline.
        let nf_arena = Arena::new();
        let mut nfs = NormalForms::new();
        let mut assigner = TagAssigner::new();

        for (i, rule) in parser.bindings.iter() {
            semi_normalize(
                &rule.term.node,
                Tag::new(*i),
                &nf_arena,
                &mut nfs,
                &mut assigner,
                &parser,
            );
        }
        dbg!(nfs_size(&nfs));
        println!("{}", nfs);
        println!("----");

        fully_normalize(&nf_arena, &mut nfs);
        dbg!(nfs_size(&nfs));
        println!("{}", nfs);
        println!("----");

        merge_inactive_rules(&mut nfs, &parser, &nf_arena);
        dbg!(nfs_size(&nfs));
        println!("{}", nfs);
        println!("----");

        remove_unreachable_rules(&mut nfs, &parser);
        dbg!(nfs_size(&nfs));
        println!("{}", nfs);
        println!("----");

        // Stage 5: fusion of the normalized rules into the final parser.
        let parser = fusion_parser(&nfs, &parser);
        println!("{}", parser);
    }
}