fea_rs/parse/
context.rs

1//! parsing and resolving includes
2
3use std::{
4    collections::{HashMap, HashSet},
5    ops::Range,
6    path::{Path, PathBuf},
7    sync::Arc,
8};
9
10use super::source::{Source, SourceLoadError, SourceLoader, SourceResolver};
11use super::{FileId, ParseTree, Parser, SourceList, SourceMap};
12use crate::{
13    token_tree::{
14        typed::{self, AstNode as _},
15        AstSink,
16    },
17    Diagnostic, DiagnosticSet, GlyphMap, Node,
18};
19
20const MAX_INCLUDE_DEPTH: usize = 50;
21
22/// Oversees parsing, following, resolving and validating input statements.
23///
24/// Includes are annoying. Existing tools tend to handle them as they're
25/// encountered, pushing another parser onto the stack and building the tree
26/// in-place. This doesn't work for us, because we want to be able to preserve
27/// the original source locations for tokens so that we can provide good error
28/// messages.
29///
/// We handle this in a reasonably straightforward way: instead of parsing
31/// includes immediately, we return a list of the includes found in each
32/// source file. We use these to build a graph of include statements, and then
33/// we also add these files to a queue, skipping files that have been parsed
34/// already. Importantly, we don't worry about recursion or include depth
35/// at parse time; we just parse every file we find, and if there's a cycle
36/// we avoid it by keeping track of what we've already parsed.
37///
38/// Once parsing is finished, we use our `IncludeGraph` to validate that there
39/// are no cycles, and that the depth limit is not exceeded.
40///
41/// After parsing, you use [`generate_parse_tree`] to validate and assemble
42/// the parsed sources into a single parse tree. This is also where validation
43/// occurs; if there are any errors in include statements, those statements
44/// are ignored when the tree is built, and an error is recorded: however we will
45/// always attempt to construct *some* tree.
46///
47/// In the future, it should be possible to augment this type to allow for use by
48/// a language server or similar, where an edit could be applied to a particular
49/// source, and then only that source would need to be recompiled.
50///
51/// But we don't take advantage of that, yet, so this is currently more like
52/// an intermediate type for generating a `ParseTree`.
53///
54/// [`generate_parse_tree`]: ParseContext::generate_parse_tree
#[derive(Debug)]
pub(crate) struct ParseContext {
    // the id of the root source file, i.e. the path passed to `parse`
    root_id: FileId,
    // all loaded sources, shared with the eventual `ParseTree`
    sources: Arc<SourceList>,
    // for each parsed file: its syntax tree plus any diagnostics produced
    // while parsing it
    parsed_files: HashMap<FileId, (Node, Vec<Diagnostic>)>,
    // the graph of include statements, used for cycle/depth validation
    graph: IncludeGraph,
}
62
/// A simple graph of files and their includes.
///
/// We maintain this in order to validate that the input does not contain
/// any cyclical include statements, and does not exceed the maximum include
/// depth of 50.
#[derive(Clone, Debug, Default)]
struct IncludeGraph {
    // source file -> (destination file, span-in-source-for-error)
    //
    // edges are stored in statement order, so an index into this Vec is the
    // same `statement_idx` recorded in `IncludeError`
    nodes: HashMap<FileId, Vec<(FileId, Range<usize>)>>,
}
73
/// An include statement in a source file.
///
/// A newtype around the typed AST node, providing convenience accessors
/// for the include path and its source ranges.
pub struct IncludeStatement(pub(crate) typed::Include);
76
/// A problem (a cycle, or excessive depth) found while validating the
/// include graph.
struct IncludeError {
    /// the file containing the offending include statement
    file: FileId,
    /// the index of the problem statement, in the list of that file's includes
    statement_idx: usize,
    /// the span of the offending statement in `file`
    range: Range<usize>,
    /// what kind of problem this is
    kind: IncludeErrorKind,
}
84
/// The kinds of problems detected when validating the include graph.
enum IncludeErrorKind {
    /// a file (transitively) includes itself
    Cycle,
    /// the chain of includes exceeds `MAX_INCLUDE_DEPTH`
    // NOTE(review): variant name is misspelled (should be `TooDeep`);
    // renaming would touch every use site, so it is left as-is here.
    ToDeep,
}
89
impl IncludeStatement {
    /// The path part of the statement.
    ///
    /// For the statement `include(file.fea)`, this is `file.fea`.
    /// Resolving this path to an actual source is the job of the
    /// `SourceResolver` (via `SourceLoader::source_for_path`).
    fn path(&self) -> &str {
        &self.0.path().text
    }

    /// The range of the entire include statement.
    ///
    /// This is the span that is replaced with the included file's contents
    /// when the final tree is assembled.
    fn stmt_range(&self) -> Range<usize> {
        self.0.range()
    }

    /// The range of just the path text.
    ///
    /// Used when reporting errors about a path that failed to resolve.
    fn path_range(&self) -> Range<usize> {
        self.0.path().range()
    }
}
108
109impl ParseContext {
110    /// Attempt to parse the feature file at `path` and any includes.
111    ///
112    /// This will only error in unrecoverable cases, such as if `path` cannot
113    /// be read.
114    ///
115    /// After parsing, you can call [`generate_parse_tree`] in order to generate
116    /// a unified parse tree suitable for compilation.
117    ///
118    /// [`generate_parse_tree`]: ParseContext::generate_parse_tree
119    pub(crate) fn parse(
120        path: PathBuf,
121        glyph_map: Option<&GlyphMap>,
122        resolver: Box<dyn SourceResolver>,
123    ) -> Result<Self, SourceLoadError> {
124        let mut sources = SourceLoader::new(resolver);
125        let root_id = sources.source_for_path(&path, None)?;
126        let mut queue = vec![root_id];
127        let mut parsed_files = HashMap::new();
128        let mut includes = IncludeGraph::default();
129
130        while let Some(id) = queue.pop() {
131            // skip things we've already parsed.
132            if parsed_files.contains_key(&id) {
133                continue;
134            }
135            let source = sources.get(&id).unwrap();
136            let (node, mut errors, include_stmts) = parse_src(source, glyph_map);
137            errors.iter_mut().for_each(|e| e.message.file = id);
138
139            parsed_files.insert(source.id(), (node, errors));
140            if include_stmts.is_empty() {
141                continue;
142            }
143
144            // we need to drop `source` so we can mutate source_map below
145            let source_id = source.id();
146
147            for include in &include_stmts {
148                match sources.source_for_path(Path::new(include.path()), Some(source_id)) {
149                    Ok(included_id) => {
150                        includes.add_edge(id, (included_id, include.stmt_range()));
151                        queue.push(included_id);
152                    }
153                    Err(e) => {
154                        let range = include.path_range();
155                        parsed_files.get_mut(&id).unwrap().1.push(Diagnostic::error(
156                            id,
157                            range,
158                            e.to_string(),
159                        ));
160                    }
161                }
162            }
163        }
164
165        Ok(ParseContext {
166            root_id,
167            sources: sources.into_inner(),
168            parsed_files,
169            graph: includes,
170        })
171    }
172
173    pub(crate) fn root_id(&self) -> FileId {
174        self.root_id
175    }
176
177    /// Construct a `ParseTree`, and return any diagnostics.
178    ///
179    /// This method also performs validation of include statements.
180    pub(crate) fn generate_parse_tree(self) -> (ParseTree, DiagnosticSet) {
181        let mut all_errors = self
182            .parsed_files
183            .iter()
184            .flat_map(|(_, (_, errs))| errs.iter())
185            .cloned()
186            .collect::<Vec<_>>();
187        let include_errors = self.graph.validate(self.root_id());
188        // record any errors:
189        for IncludeError {
190            file, range, kind, ..
191        } in &include_errors
192        {
193            // find statement
194            let message = match kind {
195                IncludeErrorKind::Cycle => "cyclical include statement",
196                IncludeErrorKind::ToDeep => "exceded maximum include depth",
197            };
198            all_errors.push(Diagnostic::error(*file, range.clone(), message));
199        }
200
201        let mut map = SourceMap::default();
202        let mut root = self.generate_recurse(self.root_id(), &include_errors, &mut map, 0);
203        let needs_update_positions = self.parsed_files.len() > 1;
204        // we need to do this before updating positions, since it mutates and
205        // requires that there exist only one reference (via Arc) to the node
206        drop(self.parsed_files);
207        if needs_update_positions {
208            root.update_positions_from_root();
209        }
210
211        let diagnostics = DiagnosticSet {
212            messages: all_errors,
213            sources: self.sources.clone(),
214            max_to_print: usize::MAX,
215        };
216
217        (
218            ParseTree {
219                root,
220                map: Arc::new(map),
221                sources: self.sources,
222            },
223            diagnostics,
224        )
225    }
226
227    /// recursively construct the output tree.
228    fn generate_recurse(
229        &self,
230        id: FileId,
231        skip: &[IncludeError],
232        source_map: &mut SourceMap,
233        offset: usize,
234    ) -> Node {
235        let this_node = self.parsed_files[&id].0.clone();
236        let self_len = this_node.text_len();
237        let mut self_pos = 0;
238        let mut global_pos = offset;
239        let this_node = match self.graph.includes_for_file(id) {
240            Some(includes) => {
241                let mut edits = Vec::with_capacity(includes.len());
242
243                for (i, (child_id, stmt)) in includes.iter().enumerate() {
244                    if skip
245                        .iter()
246                        .any(|err| err.file == id && err.statement_idx == i)
247                    {
248                        continue;
249                    }
250                    // add everything up to this attach to the sourcemap
251                    let pre_len = stmt.start - self_pos;
252                    let pre_range = global_pos..global_pos + pre_len;
253                    source_map.add_entry(pre_range, (id, self_pos));
254                    self_pos = stmt.end;
255                    global_pos += pre_len;
256                    let child_node = self.generate_recurse(*child_id, skip, source_map, global_pos);
257                    global_pos += child_node.text_len();
258                    edits.push((stmt.clone(), child_node));
259                }
260                this_node.edit(edits, true)
261            }
262            None => this_node,
263        };
264        // now add any remaining contents to source_map
265        let remain_len = self_len - self_pos;
266        let remaining_range = global_pos..global_pos + remain_len;
267        source_map.add_entry(remaining_range, (id, self_pos));
268        this_node
269    }
270}
271
impl IncludeGraph {
    /// Record that `from` includes the file `to.0`, at span `to.1` in `from`.
    fn add_edge(&mut self, from: FileId, to: (FileId, Range<usize>)) {
        self.nodes.entry(from).or_default().push(to);
    }

    /// The includes recorded for `file`, in statement order, if any.
    fn includes_for_file(&self, file: FileId) -> Option<&[(FileId, Range<usize>)]> {
        self.nodes.get(&file).map(|f| f.as_slice())
    }

    /// Validate the graph of include statements, returning any problems.
    ///
    /// If the result is non-empty, each returned error should be converted
    /// to a diagnostic by the caller, and those statements should
    /// not be resolved when building the final tree.
    fn validate(&self, root: FileId) -> Vec<IncludeError> {
        let edges = match self.nodes.get(&root) {
            None => return Vec::new(),
            Some(edges) => edges,
        };

        // iterative depth-first traversal from the root; each stack entry is
        // (file, that file's include edges, index of the next edge to visit)
        let mut stack = vec![(root, edges, 0_usize)];
        let mut seen = HashSet::new();
        let mut bad_edges = Vec::new();

        while let Some((node, edges, cur_edge)) = stack.pop() {
            if let Some((child, stmt)) = edges.get(cur_edge) {
                // push parent, advancing idx
                stack.push((node, edges, cur_edge + 1));
                // the stack depth is the current include-chain depth
                // NOTE(review): the `- 1` makes this trip one level before
                // MAX_INCLUDE_DEPTH is actually reached — confirm intended.
                if stack.len() >= MAX_INCLUDE_DEPTH - 1 {
                    bad_edges.push(IncludeError {
                        file: node,
                        statement_idx: cur_edge,
                        range: stmt.clone(),
                        kind: IncludeErrorKind::ToDeep,
                    });
                    continue;
                }

                // only recurse if we haven't seen this node yet
                if seen.insert(*child) {
                    if let Some(child_edges) = self.nodes.get(child) {
                        stack.push((*child, child_edges, 0));
                    }
                } else if stack.iter().any(|(ancestor, _, _)| ancestor == child) {
                    // we have a cycle: the already-seen child is one of our
                    // own ancestors on the current traversal path
                    bad_edges.push(IncludeError {
                        file: node,
                        statement_idx: cur_edge,
                        range: stmt.clone(),
                        kind: IncludeErrorKind::Cycle,
                    });
                }
            }
        }
        bad_edges
    }
}
329
330/// Parse a single source file.
331pub(crate) fn parse_src(
332    src: &Source,
333    glyph_map: Option<&GlyphMap>,
334) -> (Node, Vec<Diagnostic>, Vec<IncludeStatement>) {
335    let mut sink = AstSink::new(src.text(), src.id(), glyph_map);
336    {
337        let mut parser = Parser::new(src.text(), &mut sink);
338        super::grammar::root(&mut parser);
339    }
340    sink.finish()
341}
342
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{
        token_tree::{typed, TreeBuilder},
        Kind,
    };

    /// Generate `N` fresh, distinct `FileId`s for use in tests.
    fn make_ids<const N: usize>() -> [FileId; N] {
        let mut result = [FileId::CURRENT_FILE; N];
        result.iter_mut().for_each(|id| *id = FileId::next());
        result
    }

    /// Ensure we error if there are cyclical includes
    #[test]
    fn cycle_detection() {
        let [a, b, c, d] = make_ids();
        // build a single `include(file.fea);` node to reuse for every edge;
        // only its range matters here
        let statement = {
            let mut builder = TreeBuilder::default();
            builder.start_node(Kind::IncludeNode);
            builder.token(Kind::IncludeKw, "include");
            builder.token(Kind::LParen, "(");
            builder.token(Kind::Path, "file.fea");
            // NOTE(review): this token is the closing paren but is tagged
            // `Kind::LParen` — looks like it should be an RParen kind; the
            // test only relies on text length, so behavior is unaffected.
            builder.token(Kind::LParen, ")");
            builder.token(Kind::Semi, ";");
            builder.finish_node(false, None);
            builder.finish()
        };
        let statement = typed::Include::cast(&statement.into()).unwrap();
        // a -> b -> c -> d -> b: a cycle back to b
        let mut graph = IncludeGraph::default();
        graph.add_edge(a, (b, statement.range()));
        graph.add_edge(b, (c, statement.range()));
        graph.add_edge(c, (d, statement.range()));
        graph.add_edge(d, (b, statement.range()));

        let result = graph.validate(a);
        // the error is reported against the edge that closes the cycle (d -> b)
        assert_eq!(result[0].file, d);
        // "include(file.fea);" is 18 bytes long
        assert_eq!(result[0].range, 0..18);
    }

    /// A cyclical include is reported as an error, and the offending
    /// statement is left unexpanded when the tree is built.
    #[test]
    fn skip_cycle_in_build() {
        let parse = ParseContext::parse(
            "a".into(),
            None,
            // an in-memory resolver: "a" includes "bb", which includes "a"
            Box::new(|path: &Path| match path.to_str().unwrap() {
                "a" => Ok("include(bb);".into()),
                "bb" => Ok("include(a);".into()),
                _ => Err(SourceLoadError::new(
                    path.to_owned(),
                    std::io::Error::new(std::io::ErrorKind::NotFound, "oh no"),
                )),
            }),
        )
        .unwrap();
        let (resolved, errs) = parse.generate_parse_tree();
        // exactly one error: the cycle
        assert_eq!(errs.len(), 1);
        // the cyclical `include(a);` in "bb" was not expanded, so the root's
        // text is "a" with "include(bb);" replaced by the contents of "bb",
        // which happens to have the same length as the original statement
        assert_eq!(resolved.root.text_len(), "include(bb);".len());
    }

    /// End-to-end: includes are spliced into the root and the source map
    /// resolves assembled positions back to the originating files.
    #[test]
    fn assembly_basic() {
        let file_a = "\
        include(b);\n\
        # hmm\n\
        include(c);";
        let file_b = "languagesystem dflt DFLT;\n";
        let file_c = "feature kern {\n pos a b 20;\n } kern;";

        let b_len = file_b.len();
        let c_len = file_c.len();

        let parse = ParseContext::parse(
            "file_a".into(),
            None,
            Box::new(|path: &Path| match path.to_str().unwrap() {
                "file_a" => Ok(file_a.into()),
                "b" => Ok(file_b.into()),
                "c" => Ok(file_c.into()),
                _ => Err(SourceLoadError::new(
                    path.into(),
                    std::io::Error::new(std::io::ErrorKind::NotFound, "oh no"),
                )),
            }),
        )
        .unwrap();

        let a_id = parse.sources.id_for_path("file_a").unwrap();
        let b_id = parse.sources.id_for_path("b").unwrap();
        let c_id = parse.sources.id_for_path("c").unwrap();

        let (resolved, errs) = parse.generate_parse_tree();
        assert!(errs.is_empty(), "{errs:?}");
        let top_level_nodes = resolved
            .root
            .iter_children()
            .filter_map(|n| n.as_node())
            .collect::<Vec<_>>();
        // the text between the two includes in file_a
        let inter_node_len = "\n# hmm\n".len();
        // one node each from files b and c (comments/trivia are not nodes)
        assert_eq!(top_level_nodes.len(), 2);
        assert_eq!(top_level_nodes[0].kind(), Kind::LanguageSystemNode);
        assert_eq!(top_level_nodes[0].range(), 0..b_len - 1); // ignore newline
        let node_2_start = b_len + inter_node_len;
        assert_eq!(
            top_level_nodes[1].range(),
            node_2_start..node_2_start + c_len,
        );
        assert_eq!(top_level_nodes[1].kind(), Kind::FeatureNode);

        //resolved.root.debug_print_structure(true);
        // positions inside spliced content map back to the included file,
        // positions between includes map back to file_a
        assert_eq!(resolved.map.resolve_range(10..15), (b_id, 10..15));
        assert_eq!(resolved.map.resolve_range(29..33), (a_id, 14..18));
        assert_eq!(resolved.map.resolve_range(49..52), (c_id, 16..19));
    }
}