fea_rs/
token_tree.rs

1use std::{fmt::Write, io::Write as _};
2
3use std::{ops::Range, sync::Arc};
4
5use smol_str::SmolStr;
6
7use crate::parse::{FileId, IncludeStatement};
8use crate::{diagnostic::Diagnostic, GlyphMap, Level};
9
10use self::cursor::Cursor;
11use typed::AstNode as _;
12
13mod cursor;
14mod edit;
15mod rewrite;
16mod stack;
17mod token;
18pub mod typed;
19
20use rewrite::ReparseCtx;
21pub use token::Kind;
22
/// A node in the token tree.
///
/// A node is tagged with a `Kind`, and includes any number of child nodes or tokens.
#[derive(PartialEq, Eq, Clone, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Node {
    /// The `Kind` of this node.
    kind: Kind,

    // NOTE: the absolute position within the tree is not known when the node
    // is created; this is updated (and correct) only when the node has been
    // accessed via a `Cursor`.
    abs_pos: u32,
    /// The length of the text spanned by this node
    text_len: u32,
    /// true if an error was encountered in this node.
    ///
    /// This is not recursive; it is only true for the direct parent of an error span.
    pub error: bool,
    //NOTE: children should not be accessed directly, but only via a cursor.
    // this ensures that their positions are updated correctly.
    children: Arc<Vec<NodeOrToken>>,
}
46
/// A token is a chunk of text, tagged with a `Kind`.
#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Token {
    /// The [`Kind`] of this token
    pub kind: Kind,
    /// The absolute position in the source where this token starts
    // NOTE: like `Node::abs_pos`, this is only accurate after positions have
    // been updated (e.g. when the token is reached via a cursor).
    abs_pos: u32,
    /// The token text
    pub text: SmolStr,
}
58
/// Either a [`Node`] or a [`Token`].
///
/// This is the element type of a node's child list: interior nodes and leaf
/// tokens can be freely interleaved.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum NodeOrToken {
    /// A node
    Node(Node),
    /// A token
    Token(Token),
}
68
/// Incrementally assembles the token tree during parsing.
#[derive(Clone, Debug, Default)]
pub(crate) struct TreeBuilder {
    //TODO: reuse tokens
    //token_cache: HashMap<Arc<Token>>,
    // the kind of the parent, and the index in children of the first child.
    parents: Vec<(Kind, usize)>,
    // a flat list of pending children; when a node is finished its children
    // are split off the end of this list.
    children: Vec<NodeOrToken>,
}
77
/// Consumes tokens during parsing, building up an AST.
pub(crate) struct AstSink<'a> {
    // the full source text being parsed
    text: &'a str,
    // the id of the file `text` came from; attached to diagnostics
    file_id: FileId,
    // byte offset into `text` of the next token
    text_pos: usize,
    builder: TreeBuilder,
    // reusable buffer for reparsing
    reparse_buf: Vec<NodeOrToken>,
    // used (when present) to disambiguate glyph names from glyph ranges
    glyph_map: Option<&'a GlyphMap>,
    errors: Vec<Diagnostic>,
    // how many `Kind::IncludeNode` nodes have been finished
    include_statement_count: usize,
    // set when a hard error is reported; cleared when the node is finished
    cur_node_contains_error: bool,
}
91
//NOTE: the inner type is option because we reuse this in the `typed` module,
//and there we want to be able to easily return an empty iterator when iter() is
//called on a token.
//
/// An iterator over the children of a node.
#[derive(Default)]
pub struct ChildIter<'a>(Option<Cursor<'a>>);
99
impl<'a> AstSink<'a> {
    /// Create a new sink for parsing `text`.
    ///
    /// If `glyph_map` is provided, it is used to disambiguate tokens that
    /// could be either a glyph name or a glyph range (see `validate_token`).
    pub fn new(text: &'a str, file_id: FileId, glyph_map: Option<&'a GlyphMap>) -> Self {
        AstSink {
            file_id,
            text,
            text_pos: 0,
            builder: TreeBuilder::default(),
            glyph_map,
            errors: Vec::new(),
            cur_node_contains_error: false,
            include_statement_count: 0,
            reparse_buf: Default::default(),
        }
    }

    /// Consume the next `len` bytes of the source text as a token of `kind`.
    pub(crate) fn token(&mut self, kind: Kind, len: usize) {
        let token_text = &self.text[self.text_pos..self.text_pos + len];
        let to_add = self.validate_token(kind, token_text);
        self.builder.push_raw(to_add);
        self.text_pos += len;
    }

    /// Open a new node of `kind`; subsequent tokens become its children.
    pub(crate) fn start_node(&mut self, kind: Kind) {
        self.builder.start_node(kind);
    }

    /// Close the currently open node.
    ///
    /// If `kind` is `Some` it replaces the kind the node was opened with;
    /// a rewrite pass (for contextual rules) may also replace the kind.
    pub(crate) fn finish_node(&mut self, kind: Option<Kind>) {
        let cur_kind = kind
            .or_else(|| self.builder.parents.last().map(|x| x.0))
            .unwrap();
        let kind = self.maybe_rewrite_current_node(cur_kind).or(kind);
        self.builder.finish_node(self.cur_node_contains_error, kind);
        self.cur_node_contains_error = false;
        // count include statements, so `finish` knows whether (and how many)
        // include nodes to collect.
        if self.builder.children.last().map(|n| n.kind()) == Some(Kind::IncludeNode) {
            self.include_statement_count += 1;
        }
    }

    /// `true` if a hard error has been reported inside the current node.
    pub(crate) fn current_node_has_error(&self) -> bool {
        self.cur_node_contains_error
    }

    /// Record a diagnostic, tagging it with this sink's file id.
    ///
    /// Only `Level::Error` diagnostics mark the current node as errored.
    pub(crate) fn error(&mut self, mut error: Diagnostic) {
        let is_hard_error = error.level == Level::Error;
        error.message.file = self.file_id;
        self.errors.push(error);
        self.cur_node_contains_error = is_hard_error;
    }

    /// Finish parsing, returning the root node, any diagnostics, and any
    /// include statements that were encountered.
    pub fn finish(self) -> (Node, Vec<Diagnostic>, Vec<IncludeStatement>) {
        let mut node = self.builder.finish();
        node.update_positions_from_root();
        let mut includes = Vec::new();
        if self.include_statement_count > 0 {
            node.find_include_nodes(&mut includes, self.include_statement_count);
        }
        (node, self.errors, includes)
    }

    #[cfg(test)]
    pub fn errors(&self) -> &[Diagnostic] {
        &self.errors
    }

    /// called before adding a token.
    ///
    /// We can perform additional validation here. Currently it is mostly for
    /// disambiguating glyph names that might be ranges.
    fn validate_token(&mut self, kind: Kind, text: &str) -> NodeOrToken {
        if kind == Kind::GlyphNameOrRange {
            if let Some(map) = self.glyph_map {
                if map.contains(text) {
                    return Token::new(Kind::GlyphName, text.into()).into();
                }
                // not a known glyph: try interpreting it as a range, and
                // report a diagnostic if that fails too
                match try_split_range(text, map) {
                    Ok(node) => return node.into(),
                    Err(message) => {
                        let range = self.text_pos..self.text_pos + text.len();
                        self.error(Diagnostic::error(FileId::CURRENT_FILE, range, message));
                    }
                }
            }
        }
        Token::new(kind, text.into()).into()
    }

    /// Called before finishing a node.
    ///
    /// This is an opportunity for us to rewrite this node's tree, which is
    /// something that we do for rules with contextual glyphs.
    ///
    /// This lets us provide much better information about the rule to the
    /// rest of the compilation pipeline.
    ///
    /// The reason that we have to do this after the first pass is because
    /// determining whether or not a rule is contextual requires arbitrary
    /// lookahead at the parser level. Instead of writing an arbitrary lookahead
    /// parser, we instead rescan the children after parsing, grouping them
    /// into things like backtrack/context/lookahead sequences.
    fn maybe_rewrite_current_node(&mut self, cur_kind: Kind) -> Option<Kind> {
        match cur_kind {
            // never rewrite a node that contains a hard error
            _ if self.cur_node_contains_error => None,
            Kind::GsubNodeNeedsRewrite => {
                Some(self.rewrite_current_node(rewrite::reparse_contextual_sub_rule))
            }
            Kind::GposNodeNeedsRewrite => {
                Some(self.rewrite_current_node(rewrite::reparse_contextual_pos_rule))
            }
            _ => None,
        }
    }

    /// Reparse the current node's children with `rewrite_fn`, which consumes
    /// them and returns the node's new kind.
    fn rewrite_current_node(&mut self, rewrite_fn: impl FnOnce(&mut ReparseCtx) -> Kind) -> Kind {
        assert!(self.reparse_buf.is_empty());
        self.builder.move_current_children(&mut self.reparse_buf);
        // temporarily take the buffer to satisfy borrowck
        let mut buf = std::mem::take(&mut self.reparse_buf);
        // `text_pos` is already past these items, so subtract their combined
        // length to find the offset at which they start.
        let items_start_offset: usize = buf.iter().map(NodeOrToken::text_len).sum();

        let mut reparse_ctx = ReparseCtx {
            in_buf: &mut buf,
            text_pos: self.text_pos - items_start_offset,
            sink: self,
        };
        let new_kind = rewrite_fn(&mut reparse_ctx);
        // put back the buffer so we can reuse the storage next time
        assert!(
            reparse_ctx.in_buf.is_empty(),
            "rewrite finished with unhandled items"
        );
        buf.clear();
        std::mem::swap(&mut self.reparse_buf, &mut buf);
        new_kind
    }

    /// Append a prebuilt node or token (used during reparsing).
    fn push_raw(&mut self, n: NodeOrToken) {
        self.builder.push_raw(n);
    }
}
240
241impl Node {
242    fn new(kind: Kind, children: Vec<NodeOrToken>, error: bool) -> Self {
243        let text_len = children.iter().map(|x| x.text_len() as u32).sum();
244        Node {
245            kind,
246            text_len,
247            abs_pos: 0,
248            children: children.into(),
249            error,
250        }
251    }
252
253    /// recursively compute and update the positions of each child.
254    ///
255    /// This should only be called on a root node; it assumes the position
256    /// of the callee is `0`.
257    ///
258    /// This is required in order for us to correctly associate diagnostics
259    /// with their locations in the source.
260    pub(crate) fn update_positions_from_root(&mut self) {
261        self.update_positions_recurse(0)
262    }
263
264    fn update_positions_recurse(&mut self, mut pos: usize) {
265        self.abs_pos = pos as _;
266        let children = Arc::make_mut(&mut self.children);
267
268        for child in children {
269            child.update_positions(pos);
270            pos += child.text_len();
271        }
272    }
273
274    /// Construct a new cursor for navigating the node's children
275    pub(crate) fn cursor(&self) -> Cursor {
276        Cursor::new(self)
277    }
278
279    /// Iterate over tokens, descending into child nodes.
280    pub fn iter_tokens(&self) -> impl Iterator<Item = &Token> {
281        let mut cursor = self.cursor();
282        std::iter::from_fn(move || cursor.next_token())
283    }
284
285    /// Iterate over this node's direct children, without descending.
286    pub fn iter_children(&self) -> ChildIter {
287        ChildIter(Some(self.cursor()))
288    }
289
290    /// The `Kind` of the node
291    pub fn kind(&self) -> Kind {
292        self.kind
293    }
294
295    /// The total length of all tokens that are descendents of this node.
296    pub fn text_len(&self) -> usize {
297        self.text_len as usize
298    }
299
300    /// The range in the original source of this node.
301    ///
302    /// Only correct if this node is accessed via a cursor.
303    pub fn range(&self) -> Range<usize> {
304        let start = self.abs_pos as usize;
305        start..start + (self.text_len as usize)
306    }
307
308    /// Create a new tree, replacing the provided ranges with the provided
309    /// nodes.
310    ///
311    /// if skip_parent is true, children of inserted nodes are added directly,
312    /// without their parent nodes. (we use this when resolving includes
313    ///
314    /// range start/end just fall on token boundaries.
315    pub(crate) fn edit(&self, edits: Vec<(Range<usize>, Node)>, skip_parent: bool) -> Node {
316        edit::apply_edits(self, edits, skip_parent)
317    }
318
319    fn find_include_nodes(&self, collect: &mut Vec<IncludeStatement>, num: usize) {
320        for item in self.iter_children() {
321            if let Some(node) = item.as_node() {
322                if let Some(include) = typed::Include::cast(item) {
323                    collect.push(IncludeStatement(include));
324                    if collect.len() == num {
325                        return;
326                    }
327                } else {
328                    node.find_include_nodes(collect, num);
329                }
330            }
331        }
332    }
333
334    #[doc(hidden)]
335    // used in some tests for debugging
336    pub fn debug_print_structure(&self, include_tokens: bool) {
337        let mut cursor = self.cursor();
338        while let Some(thing) = cursor.current() {
339            match thing {
340                NodeOrToken::Node(node) => {
341                    let depth = cursor.depth();
342                    let _ = writeln!(
343                        std::io::stderr(),
344                        "{}{} ({}..{})",
345                        &crate::util::SPACES[..depth * 2],
346                        node.kind,
347                        cursor.pos(),
348                        cursor.pos() + node.text_len()
349                    );
350                }
351                NodeOrToken::Token(t) if include_tokens => eprint!("{}", t.as_str()),
352                _ => (),
353            }
354            cursor.advance();
355        }
356    }
357
358    #[doc(hidden)]
359    pub fn simple_parse_tree(&self) -> String {
360        let mut result = String::new();
361        self.parse_tree_impl(0, &mut result).unwrap();
362        result
363    }
364
365    fn parse_tree_impl(&self, depth: usize, buf: &mut String) -> std::fmt::Result {
366        use crate::util::SPACES;
367        let mut pos = self.abs_pos;
368        writeln!(
369            buf,
370            "{}{}@[{}; {})",
371            &SPACES[..depth * 2],
372            self.kind,
373            pos,
374            pos + self.text_len
375        )?;
376        let depth = depth + 1;
377        for child in self.iter_children() {
378            match child {
379                NodeOrToken::Token(Token { kind, text, .. }) => {
380                    let spaces = &SPACES[..depth * 2];
381                    write!(buf, "{}{}@{}", spaces, kind, pos)?;
382                    if kind.is_trivia() {
383                        writeln!(buf, " \"{}\"", text.escape_debug())?;
384                    } else {
385                        writeln!(buf, " \"{}\"", text)?;
386                    }
387                    pos += text.len() as u32;
388                }
389                NodeOrToken::Node(node) => {
390                    node.parse_tree_impl(depth + 1, buf)?;
391                    pos += node.text_len;
392                }
393            }
394        }
395        Ok(())
396    }
397}
398
399impl<'a> Iterator for ChildIter<'a> {
400    type Item = &'a NodeOrToken;
401
402    fn next(&mut self) -> Option<Self::Item> {
403        let current = self.0.as_ref()?.current();
404        self.0.as_mut()?.step_over();
405        current
406    }
407}
408
impl TreeBuilder {
    /// Open a new node; items pushed until `finish_node` become its children.
    pub(crate) fn start_node(&mut self, kind: Kind) {
        let len = self.children.len();
        self.parents.push((kind, len));
    }

    /// Push a new token onto the currently open node.
    pub(crate) fn token(&mut self, kind: Kind, text: impl Into<SmolStr>) {
        let token = Token::new(kind, text.into());
        self.push_raw(token.into());
    }

    /// Append a prebuilt node or token to the pending child list.
    fn push_raw(&mut self, item: NodeOrToken) {
        self.children.push(item)
    }

    /// move the children of the currently open node into a buffer.
    ///
    /// This is only used as part of reparsing.
    fn move_current_children(&mut self, to_buf: &mut Vec<NodeOrToken>) {
        if let Some(idx) = self.parents.last().map(|(_, idx)| idx).copied() {
            to_buf.extend(self.children.drain(idx..));
        }
    }

    /// Close the currently open node.
    ///
    /// `error` marks the node as containing an error; `new_kind`, if set,
    /// overrides the kind the node was opened with.
    pub(crate) fn finish_node(&mut self, error: bool, new_kind: Option<Kind>) {
        let (kind, first_child) = self.parents.pop().unwrap();
        let kind = new_kind.unwrap_or(kind);
        let node = Node::new(kind, self.children.split_off(first_child), error);
        self.push_raw(node.into());
    }

    /// Consume the builder, returning the root node.
    ///
    /// Panics unless exactly one (root) node remains.
    pub(crate) fn finish(mut self) -> Node {
        assert_eq!(self.children.len(), 1);
        self.children.pop().unwrap().into_node().unwrap()
    }
}
445
446impl NodeOrToken {
447    fn update_positions(&mut self, pos: usize) {
448        match self {
449            NodeOrToken::Token(t) => t.abs_pos = pos as _,
450            NodeOrToken::Node(n) => n.update_positions_recurse(pos),
451        }
452    }
453
454    /// `true` if this is a single token.
455    pub fn is_token(&self) -> bool {
456        matches!(self, NodeOrToken::Token(_))
457    }
458
459    /// If this is a single token, return that token's text
460    pub fn token_text(&self) -> Option<&str> {
461        self.as_token().map(Token::as_str)
462    }
463
464    /// The `Kind` of this node or token
465    pub fn kind(&self) -> Kind {
466        match self {
467            NodeOrToken::Node(n) => n.kind,
468            NodeOrToken::Token(t) => t.kind,
469        }
470    }
471
472    /// `true` If this is a glyph name, a CID, or a glyph class (either inline or named)
473    pub fn is_glyph_or_glyph_class(&self) -> bool {
474        matches!(
475            self.kind(),
476            Kind::GlyphName | Kind::Cid | Kind::GlyphClass | Kind::NamedGlyphClass
477        )
478    }
479
480    /// The range in the source text of this node or token.
481    ///
482    /// Note: this is only accurate if the token was accessed via a cursor.
483    pub fn range(&self) -> Range<usize> {
484        match self {
485            NodeOrToken::Token(t) => t.range(),
486            NodeOrToken::Node(n) => n.range(),
487        }
488    }
489
490    /// The length of this token or node's text
491    pub fn text_len(&self) -> usize {
492        match self {
493            NodeOrToken::Node(n) => n.text_len as usize,
494            NodeOrToken::Token(t) => t.text.len(),
495        }
496    }
497
498    /// If this is a `Node`, return it
499    pub fn into_node(self) -> Option<Node> {
500        match self {
501            NodeOrToken::Node(node) => Some(node),
502            NodeOrToken::Token(_) => None,
503        }
504    }
505
506    /// If this is a `Node`, return a reference to it
507    pub fn as_node(&self) -> Option<&Node> {
508        match self {
509            NodeOrToken::Node(node) => Some(node),
510            NodeOrToken::Token(_) => None,
511        }
512    }
513
514    /// IF this is a token, return a reference to it.
515    pub fn as_token(&self) -> Option<&Token> {
516        match self {
517            NodeOrToken::Node(_) => None,
518            NodeOrToken::Token(token) => Some(token),
519        }
520    }
521}
522
523impl From<Node> for NodeOrToken {
524    fn from(src: Node) -> NodeOrToken {
525        NodeOrToken::Node(src)
526    }
527}
528
529impl From<Token> for NodeOrToken {
530    fn from(src: Token) -> NodeOrToken {
531        NodeOrToken::Token(src)
532    }
533}
534
535impl Token {
536    fn new(kind: Kind, text: SmolStr) -> Self {
537        Token {
538            kind,
539            text,
540            abs_pos: 0,
541        }
542    }
543
544    /// The raw text for this token
545    pub fn as_str(&self) -> &str {
546        &self.text
547    }
548
549    /// The position of this token in its source.
550    pub fn range(&self) -> Range<usize> {
551        self.abs_pos as usize..self.abs_pos as usize + self.text.len()
552    }
553}
554
555/// try to split a glyph containing hyphens into a glyph range.
556fn try_split_range(text: &str, glyph_map: &GlyphMap) -> Result<Node, String> {
557    let mut solution = None;
558
559    // we try all possible split points
560    for idx in text
561        .bytes()
562        .enumerate()
563        .filter_map(|(idx, b)| (b == b'-').then_some(idx))
564    {
565        let (head, tail) = text.split_at(idx);
566        if glyph_map.contains(head) && glyph_map.contains(tail.trim_start_matches('-')) {
567            if let Some(prev_idx) = solution.replace(idx) {
568                let (head1, tail1) = text.split_at(prev_idx);
569                let (head2, tail2) = text.split_at(idx);
570                let message = format!("the name '{}' contains multiple possible glyph ranges ({} to {} and {} to {}). Please insert spaces around the '-' to clarify your intent.", text, head1, tail1.trim_end_matches('-'), head2, tail2.trim_end_matches('-'));
571                return Err(message);
572            }
573        }
574    }
575
576    // if we have a solution, generate a new node
577    solution
578        .map(|idx| {
579            let mut builder = TreeBuilder::default();
580            builder.start_node(Kind::GlyphRange);
581            let (head, tail) = text.split_at(idx);
582            builder.token(Kind::GlyphName, head);
583            builder.token(Kind::Hyphen, "-");
584            builder.token(Kind::GlyphName, tail.trim_start_matches('-'));
585            builder.finish_node(false, None);
586            builder.finish()
587        })
588        .ok_or_else(|| {
589            format!(
590                "'{}' is neither a known glyph or a range of known glyphs",
591                text
592            )
593        })
594}
595
impl Node {
    /// Recursive worker for the `Debug` impl: writes this node's header and
    /// then its children, indented two spaces per level of depth.
    fn debug_impl(&self, f: &mut std::fmt::Formatter, depth: usize) -> std::fmt::Result {
        use crate::util::SPACES;

        let ws = &SPACES[..depth * 2];
        write!(
            f,
            "\n{ws}{}:  abs {} len {} children {}",
            self.kind,
            self.abs_pos,
            self.text_len,
            self.children.len()
        )?;
        // children print one level deeper than this node
        let ws = &SPACES[..(depth + 1) * 2];
        for child in self.iter_children() {
            match child {
                NodeOrToken::Token(t) => write!(f, "\n{}'{}' {}", ws, t.text, t.kind)?,
                NodeOrToken::Node(n) => n.debug_impl(f, depth + 1)?,
            }
        }
        Ok(())
    }
}
619
impl std::fmt::Debug for Node {
    // delegate to the recursive helper, starting at depth 0
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        self.debug_impl(f, 0)
    }
}
625
#[cfg(test)]
mod tests {

    use super::*;
    // a small FEA file used to sanity-check round-tripping
    static SAMPLE_FEA: &str = include_str!("../test-data/fonttools-tests/mini.fea");

    /// Concatenating every token in the tree should exactly reproduce the
    /// original source text (the tree is lossless).
    #[test]
    fn token_iter() {
        let (ast, _errs) = crate::parse::parse_string(SAMPLE_FEA);
        let reconstruct = ast
            .root()
            .iter_tokens()
            .map(Token::as_str)
            .collect::<String>();
        crate::assert_eq_str!(SAMPLE_FEA, reconstruct);
    }
}