// pag_parser/frontend/lexical.rs

// Copyright (c) 2023 Paguroidea Developers
//
// Licensed under the Apache License, Version 2.0
// <LICENSE-APACHE or http://www.apache.org/licenses/LICENSE-2.0> or the MIT
// license <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. All files in the project carrying such notice may not be copied,
// modified, or distributed except according to those terms.
8
9use std::{collections::HashMap, rc::Rc};
10
11use pag_lexer::{normalization::normalize, regex_tree::RegexTree};
12
13use crate::frontend::unicode;
14use crate::span_errors;
15use crate::{utilities::unreachable_branch, utilities::Symbol};
16
17use super::{SurfaceSyntaxTree, WithSpan};
18
19use super::Error;
20
/// A regex tree paired with the source span of the expression it was built from.
type SpanRegexTree<'a> = WithSpan<'a, Rc<RegexTree>>;
22
/// A single translated lexer rule: the regex to match plus its active flag.
pub struct LexerRule<'a> {
    /// `false` marks the rule that was registered as the skipping rule
    /// (see `LexerDatabase::skip`).
    pub active: bool,
    /// The normalized regex tree, carrying the span of its defining expression.
    pub rule: SpanRegexTree<'a>,
}
27
/// The fully translated lexer section of a grammar.
pub struct LexerDatabase<'a> {
    /// Maps each token name to its symbol.
    pub symbol_table: HashMap<&'a str, Symbol<'a>>,
    /// Maps each symbol to its translated rule.
    pub entries: HashMap<Symbol<'a>, LexerRule<'a>>,
    /// Symbol of the skipping rule, if one was declared (an inactive token).
    pub skip: Option<Symbol<'a>>,
}
33
34impl<'a> LexerDatabase<'a> {
35    pub fn new(
36        sst: &WithSpan<'a, SurfaceSyntaxTree<'a>>,
37    ) -> Result<Self, Vec<WithSpan<'a, Error<'a>>>> {
38        TranslationContext::create_database(sst)
39    }
40    pub fn nullability_check(&self) -> Vec<WithSpan<'a, Error<'a>>> {
41        let mut errors = Vec::new();
42        for (sym, rule) in self.entries.iter() {
43            if rule.rule.node.is_nullable() {
44                errors.push(WithSpan {
45                    span: rule.rule.span,
46                    node: Error::NullableToken(sym.name()),
47                });
48            }
49        }
50        errors
51    }
52}
53
/// Mutable state threaded through lexer translation: the named definitions
/// collected so far and the database being built from them.
struct TranslationContext<'a> {
    definitions: HashMap<&'a str, SpanRegexTree<'a>>,
    database: LexerDatabase<'a>,
}
58
59fn construct_regex_tree<'a, F>(
60    sst: &WithSpan<'a, SurfaceSyntaxTree<'a>>,
61    reference_handler: &F,
62) -> Result<Rc<RegexTree>, Vec<WithSpan<'a, Error<'a>>>>
63where
64    F: Fn(WithSpan<'a, ()>) -> Result<Rc<RegexTree>, Vec<WithSpan<'a, Error<'a>>>>,
65{
66    match &sst.node {
67        SurfaceSyntaxTree::LexicalAlternative { lhs, rhs } => {
68            let lhs = construct_regex_tree(lhs, reference_handler);
69            let rhs = construct_regex_tree(rhs, reference_handler);
70            match (lhs, rhs) {
71                (Err(mut lhs), Err(rhs)) => {
72                    lhs.extend(rhs.into_iter());
73                    Err(lhs)
74                }
75                (Err(lhs), _) => Err(lhs),
76                (_, Err(rhs)) => Err(rhs),
77                (Ok(lhs), Ok(rhs)) => Ok(Rc::new(RegexTree::Union(lhs, rhs))),
78            }
79        }
80        SurfaceSyntaxTree::LexicalSequence { lhs, rhs } => {
81            let lhs = construct_regex_tree(lhs, reference_handler);
82            let rhs = construct_regex_tree(rhs, reference_handler);
83            match (lhs, rhs) {
84                (Err(mut lhs), Err(rhs)) => {
85                    lhs.extend(rhs.into_iter());
86                    Err(lhs)
87                }
88                (Err(lhs), _) => Err(lhs),
89                (_, Err(rhs)) => Err(rhs),
90                (Ok(lhs), Ok(rhs)) => Ok(Rc::new(RegexTree::Concat(lhs, rhs))),
91            }
92        }
93        SurfaceSyntaxTree::LexicalStar { inner } => {
94            let inner = construct_regex_tree(inner, reference_handler)?;
95            Ok(Rc::new(RegexTree::KleeneClosure(inner)))
96        }
97        SurfaceSyntaxTree::LexicalPlus { inner } => {
98            let inner = construct_regex_tree(inner, reference_handler)?;
99            Ok(Rc::new(RegexTree::Concat(
100                inner.clone(),
101                Rc::new(RegexTree::KleeneClosure(inner)),
102            )))
103        }
104        SurfaceSyntaxTree::LexicalOptional { inner } => {
105            let inner = construct_regex_tree(inner, reference_handler)?;
106            Ok(Rc::new(RegexTree::Union(
107                inner,
108                Rc::new(RegexTree::Epsilon),
109            )))
110        }
111        SurfaceSyntaxTree::LexicalNot { inner } => {
112            let inner = construct_regex_tree(inner, reference_handler)?;
113            Ok(Rc::new(RegexTree::Complement(inner)))
114        }
115        SurfaceSyntaxTree::Range { start, end } => Ok(unicode::encode_range(*start, *end)),
116        SurfaceSyntaxTree::String(x) => {
117            if x.is_empty() {
118                Ok(Rc::new(RegexTree::Epsilon))
119            } else {
120                let mut iter = x.bytes();
121                let fst = Rc::new(RegexTree::single(unsafe { iter.next().unwrap_unchecked() }));
122                Ok(iter.fold(fst, |acc, c| {
123                    Rc::new(RegexTree::Concat(acc, Rc::new(RegexTree::single(c))))
124                }))
125            }
126        }
127        SurfaceSyntaxTree::Bottom => Ok(Rc::new(RegexTree::Bottom)),
128        SurfaceSyntaxTree::Empty => Ok(Rc::new(RegexTree::Epsilon)),
129        SurfaceSyntaxTree::Char { value } => Ok(unicode::encode_char(value.node)),
130        SurfaceSyntaxTree::LexicalRuleRef { name } => reference_handler(name.clone()),
131        _ => unreachable_branch!(
132            "lexer translation is called with unsupported code: {}",
133            sst.span.as_str()
134        ),
135    }
136}
137
138fn construct_definition<'a>(
139    sst: &WithSpan<'a, SurfaceSyntaxTree<'a>>,
140) -> Result<(&'a str, SpanRegexTree<'a>), Vec<WithSpan<'a, Error<'a>>>> {
141    fn reference_handler(
142        name: WithSpan<'_, ()>,
143    ) -> Result<Rc<RegexTree>, Vec<WithSpan<'_, Error<'_>>>> {
144        Err(span_errors!(
145            InvalidLexicalReference,
146            name.span,
147            name.span.as_str(),
148        ))
149    }
150    match &sst.node {
151        SurfaceSyntaxTree::LexicalDefinition { name, expr } => {
152            let name = name.span.as_str();
153            let tree = construct_regex_tree(expr, &reference_handler)?;
154            Ok((
155                name,
156                WithSpan {
157                    span: expr.span,
158                    node: normalize(tree),
159                },
160            ))
161        }
162        _ => unreachable_branch!(
163            "lexer translation can only be called with lexical definition: {}",
164            sst.span.as_str()
165        ),
166    }
167}
168
169fn construct_lexical_rule<'a>(
170    definitions: &HashMap<&'a str, SpanRegexTree<'a>>,
171    sst: &WithSpan<'a, SurfaceSyntaxTree<'a>>,
172) -> Result<(Symbol<'a>, SpanRegexTree<'a>), Vec<WithSpan<'a, Error<'a>>>> {
173    let reference_handler = |name: WithSpan<'a, ()>| match definitions.get(name.span.as_str()) {
174        Some(x) => Ok(x.node.clone()),
175        _ => Err(span_errors!(
176            UndefinedLexicalReference,
177            name.span,
178            name.span.as_str(),
179        )),
180    };
181    match &sst.node {
182        SurfaceSyntaxTree::LexicalToken { name, expr, .. } => {
183            let name = name.span.as_str();
184            let tree = construct_regex_tree(expr, &reference_handler)?;
185            Ok((
186                Symbol::new(name),
187                WithSpan {
188                    span: expr.span,
189                    node: normalize(tree),
190                },
191            ))
192        }
193        _ => unreachable_branch!(
194            "lexer rule translation can only be called with lexical definition: {}",
195            sst.span.as_str()
196        ),
197    }
198}
199
200impl<'a> TranslationContext<'a> {
201    fn new() -> Self {
202        TranslationContext {
203            definitions: HashMap::new(),
204            database: LexerDatabase {
205                entries: HashMap::new(),
206                symbol_table: HashMap::new(),
207                skip: None,
208            },
209        }
210    }
211    fn populate_definitions(
212        &mut self,
213        sst: &WithSpan<'a, SurfaceSyntaxTree<'a>>,
214    ) -> Result<(), Vec<WithSpan<'a, Error<'a>>>> {
215        match &sst.node {
216            SurfaceSyntaxTree::Lexer { rules } => {
217                let mut error = Vec::new();
218                for i in rules
219                    .iter()
220                    .filter(|x| matches!(x.node, SurfaceSyntaxTree::LexicalDefinition { .. }))
221                    .map(construct_definition)
222                {
223                    match i {
224                        Err(e) => {
225                            error.extend(e.into_iter());
226                        }
227                        Ok((k, v)) => {
228                            let current_span = v.span;
229                            match self.definitions.insert(k, v) {
230                                None => (),
231                                Some(x) => {
232                                    error.push(WithSpan {
233                                        span: current_span,
234                                        node: Error::MultipleDefinition(k, x.span),
235                                    });
236                                }
237                            }
238                        }
239                    }
240                }
241
242                if error.is_empty() {
243                    Ok(())
244                } else {
245                    Err(error)
246                }
247            }
248            _ => Err(span_errors!(
249                InternalLogicalError,
250                sst.span,
251                "populating definitions from a non-lexer node".into(),
252            )),
253        }
254    }
255    fn create_database(
256        sst: &WithSpan<'a, SurfaceSyntaxTree<'a>>,
257    ) -> Result<LexerDatabase<'a>, Vec<WithSpan<'a, Error<'a>>>> {
258        let mut ctx = Self::new();
259        let mut errs = match ctx.populate_definitions(sst) {
260            Ok(_) => Vec::new(),
261            Err(errs) => errs,
262        };
263        match &sst.node {
264            SurfaceSyntaxTree::Lexer { rules } => {
265                for i in rules
266                    .iter()
267                    .filter_map(|x| match &x.node {
268                        SurfaceSyntaxTree::LexicalToken { active, .. } => Some((*active, x)),
269                        _ => None,
270                    })
271                    .map(|(active, sst)| (active, construct_lexical_rule(&ctx.definitions, sst)))
272                {
273                    match i.1 {
274                        Err(e) => {
275                            errs.extend(e.into_iter());
276                        }
277                        Ok((k, rule)) => {
278                            if !i.0 {
279                                if let Some(x) = ctx.database.skip.replace(k) {
280                                    errs.push(WithSpan {
281                                        span: rule.span,
282                                        node: Error::MultipleSkippingRule(x.name()),
283                                    });
284                                }
285                            }
286                            let current_span = rule.span;
287                            match ctx.database.symbol_table.insert(k.name(), k) {
288                                None => {
289                                    ctx.database
290                                        .entries
291                                        .insert(k, LexerRule { active: i.0, rule });
292                                }
293                                Some(x) => {
294                                    errs.push(WithSpan {
295                                        span: current_span,
296                                        node: Error::MultipleDefinition(k.name(), unsafe {
297                                            ctx.database
298                                                .entries
299                                                .get(&x)
300                                                .unwrap_unchecked()
301                                                .rule
302                                                .span
303                                        }),
304                                    });
305                                }
306                            }
307                        }
308                    }
309                }
310
311                if errs.is_empty() {
312                    Ok(ctx.database)
313                } else {
314                    Err(errs)
315                }
316            }
317            _ => Err(span_errors!(
318                InternalLogicalError,
319                sst.span,
320                "creating lexer rule database from a non-lexer node".into(),
321            )),
322        }
323    }
324}