// fea-rs — token_tree.rs: the parse tree (nodes and tokens) built by the parser.

1use std::{fmt::Write, io::Write as _};
2
3use std::{ops::Range, sync::Arc};
4
5use smol_str::SmolStr;
6
7use crate::parse::{FileId, IncludeStatement};
8use crate::{GlyphMap, Level, diagnostic::Diagnostic};
9
10use self::cursor::Cursor;
11use typed::AstNode as _;
12
13mod cursor;
14mod edit;
15mod rewrite;
16mod stack;
17mod token;
18pub mod typed;
19
20use rewrite::ReparseCtx;
21pub use token::Kind;
22
/// A node in the token tree.
///
/// A node is tagged with a `Kind`, and includes any number of child nodes or tokens.
#[derive(PartialEq, Eq, Clone, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Node {
    /// The [`Kind`] of this node.
    kind: Kind,

    // NOTE: the absolute position within the tree is not known when the node
    // is created; this is updated (and correct) only when the node has been
    // accessed via a `Cursor`.
    abs_pos: u32,
    /// The length of the text spanned by this node
    text_len: u32,
    /// true if an error was encountered in this node.
    ///
    /// This is not recursive; it is only true for the direct parent of an error span.
    pub error: bool,
    //NOTE: children should not be accessed directly, but only via a cursor.
    // this ensures that their positions are updated correctly.
    children: Arc<Vec<NodeOrToken>>,
}
46
/// A token is a chunk of text, tagged with a `Kind`.
#[derive(Debug, PartialEq, Eq, Clone, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct Token {
    /// The [`Kind`] of this token
    pub kind: Kind,
    /// The absolute position in the source where this token starts
    ///
    /// As with `Node::abs_pos`, this is only correct after positions have
    /// been updated (e.g. via `update_positions_from_root` or a cursor).
    abs_pos: u32,
    /// The token text
    pub text: SmolStr,
}
58
/// Either a [`Node`] or a [`Token`].
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum NodeOrToken {
    /// A node, which may contain further children
    Node(Node),
    /// A single token
    Token(Token),
}
68
/// Builds up a tree of nodes from a stream of start/token/finish events.
#[derive(Clone, Debug, Default)]
pub(crate) struct TreeBuilder {
    //TODO: reuse tokens
    //token_cache: HashMap<Arc<Token>>,
    // the kind of the parent, and the index in children of the first child.
    parents: Vec<(Kind, usize)>,
    // flat buffer of finished children; a parent's children are split off
    // from the tail of this buffer when the parent is finished.
    children: Vec<NodeOrToken>,
}
77
/// Consumes tokens during parsing, building up an AST.
pub(crate) struct AstSink<'a> {
    /// The full source text being parsed.
    text: &'a str,
    /// Id of the source file, attached to diagnostics.
    file_id: FileId,
    /// Current byte offset into `text`.
    text_pos: usize,
    builder: TreeBuilder,
    // reusable buffer for reparsing
    reparse_buf: Vec<NodeOrToken>,
    /// Optional glyph map, used to disambiguate glyph names from glyph ranges.
    glyph_map: Option<&'a GlyphMap>,
    /// Diagnostics collected during parsing.
    errors: Vec<Diagnostic>,
    /// Running count of `include` statements, so `finish` can bound its search.
    include_statement_count: usize,
    /// True if a hard error was reported in the currently open node.
    cur_node_contains_error: bool,
}
91
//NOTE: the inner type is option because we reuse this in the `typed` module,
//and there we want to be able to easily return an empty iterator when iter() is
//called on a token.
//
/// An iterator over the children of a node.
///
/// Yields each direct child in order, without descending into child nodes.
#[derive(Default)]
pub struct ChildIter<'a>(Option<Cursor<'a>>);
99
impl<'a> AstSink<'a> {
    /// Create a new sink for the provided source text.
    ///
    /// If `glyph_map` is provided, it is used to validate glyph names as
    /// tokens are added (see `validate_token`).
    pub fn new(text: &'a str, file_id: FileId, glyph_map: Option<&'a GlyphMap>) -> Self {
        AstSink {
            file_id,
            text,
            text_pos: 0,
            builder: TreeBuilder::default(),
            glyph_map,
            errors: Vec::new(),
            cur_node_contains_error: false,
            include_statement_count: 0,
            reparse_buf: Default::default(),
        }
    }

    /// Add a token of `kind` spanning the next `len` bytes of the source.
    pub(crate) fn token(&mut self, kind: Kind, len: usize) {
        let token_text = &self.text[self.text_pos..self.text_pos + len];
        let to_add = self.validate_token(kind, token_text);
        self.builder.push_raw(to_add);
        self.text_pos += len;
    }

    /// Open a new node of the provided `kind`.
    pub(crate) fn start_node(&mut self, kind: Kind) {
        self.builder.start_node(kind);
    }

    /// Close the current node, optionally overriding its kind.
    ///
    /// Panics if there is no open node (a parser invariant).
    pub(crate) fn finish_node(&mut self, kind: Option<Kind>) {
        // the effective kind: the override if provided, otherwise the kind
        // the node was opened with.
        let cur_kind = kind
            .or_else(|| self.builder.parents.last().map(|x| x.0))
            .unwrap();
        let kind = self.maybe_rewrite_current_node(cur_kind).or(kind);
        self.builder.finish_node(self.cur_node_contains_error, kind);
        self.cur_node_contains_error = false;
        // count include statements, so `finish` knows whether (and how far)
        // to search for them.
        if self.builder.children.last().map(|n| n.kind()) == Some(Kind::IncludeNode) {
            self.include_statement_count += 1;
        }
    }

    /// `true` if a hard error has been reported in the currently open node.
    pub(crate) fn current_node_has_error(&self) -> bool {
        self.cur_node_contains_error
    }

    /// Record a diagnostic, tagging it with this sink's file id.
    ///
    /// Only `Level::Error` diagnostics mark the current node as containing
    /// an error; warnings and notes are collected but do not.
    pub(crate) fn error(&mut self, mut error: Diagnostic) {
        let is_hard_error = error.level == Level::Error;
        error.message.file = self.file_id;
        self.errors.push(error);
        self.cur_node_contains_error = is_hard_error;
    }

    /// Consume the sink, returning the root node, collected diagnostics,
    /// and any include statements encountered.
    pub fn finish(self) -> (Node, Vec<Diagnostic>, Vec<IncludeStatement>) {
        let mut node = self.builder.finish();
        node.update_positions_from_root();
        let mut includes = Vec::new();
        if self.include_statement_count > 0 {
            node.find_include_nodes(&mut includes, self.include_statement_count);
        }
        (node, self.errors, includes)
    }

    #[cfg(test)]
    pub fn errors(&self) -> &[Diagnostic] {
        &self.errors
    }

    /// called before adding a token.
    ///
    /// We can perform additional validation here. Currently it is mostly for
    /// disambiguating glyph names that might be ranges.
    fn validate_token(&mut self, kind: Kind, text: &str) -> NodeOrToken {
        if kind == Kind::GlyphNameOrRange
            && let Some(map) = self.glyph_map
        {
            if map.contains(text) {
                return Token::new(Kind::GlyphName, text.into()).into();
            }
            // not a known glyph: see if the hyphenated name is really a range
            match try_split_range(text, map) {
                Ok(node) => return node.into(),
                Err(message) => {
                    let range = self.text_pos..self.text_pos + text.len();
                    self.error(Diagnostic::error(FileId::CURRENT_FILE, range, message));
                }
            }
        }
        Token::new(kind, text.into()).into()
    }

    /// Called before finishing a node.
    ///
    /// This is an opportunity for us to rewrite this node's tree, which is
    /// something that we do for rules with contextual glyphs.
    ///
    /// This lets us provide much better information about the rule to the
    /// rest of the compilation pipeline.
    ///
    /// The reason that we have to do this after the first pass is because
    /// determining whether or not a rule is contextual requires arbitrary
    /// lookahead at the parser level. Instead of writing an arbitrary lookahead
    /// parser, we instead rescan the children after parsing, grouping them
    /// into things like backtrack/context/lookahead sequences.
    fn maybe_rewrite_current_node(&mut self, cur_kind: Kind) -> Option<Kind> {
        match cur_kind {
            // don't attempt a rewrite of a node that failed to parse
            _ if self.cur_node_contains_error => None,
            Kind::GsubNodeNeedsRewrite => {
                Some(self.rewrite_current_node(rewrite::reparse_contextual_sub_rule))
            }
            Kind::GposNodeNeedsRewrite => {
                Some(self.rewrite_current_node(rewrite::reparse_contextual_pos_rule))
            }
            _ => None,
        }
    }

    /// Reparse the current node's children via `rewrite_fn`, returning the
    /// node's new kind.
    fn rewrite_current_node(&mut self, rewrite_fn: impl FnOnce(&mut ReparseCtx) -> Kind) -> Kind {
        assert!(self.reparse_buf.is_empty());
        self.builder.move_current_children(&mut self.reparse_buf);
        // temporarily take the buffer to satisfy borrowck
        let mut buf = std::mem::take(&mut self.reparse_buf);
        // total text length of the moved children; subtracting it from
        // text_pos gives the offset where those children begin.
        let items_start_offset: usize = buf.iter().map(NodeOrToken::text_len).sum();

        let mut reparse_ctx = ReparseCtx {
            in_buf: &mut buf,
            text_pos: self.text_pos - items_start_offset,
            sink: self,
        };
        let new_kind = rewrite_fn(&mut reparse_ctx);
        // the rewrite must consume (re-push) every child it was handed
        assert!(
            reparse_ctx.in_buf.is_empty(),
            "rewrite finished with unhandled items"
        );
        // put back the buffer so we can reuse the storage next time
        buf.clear();
        std::mem::swap(&mut self.reparse_buf, &mut buf);
        new_kind
    }

    // forwarded to the builder; used by the rewrite machinery
    fn push_raw(&mut self, n: NodeOrToken) {
        self.builder.push_raw(n);
    }
}
240
impl Node {
    /// Create a new node; its text length is the sum of its children's lengths.
    fn new(kind: Kind, children: Vec<NodeOrToken>, error: bool) -> Self {
        let text_len = children.iter().map(|x| x.text_len() as u32).sum();
        Node {
            kind,
            text_len,
            abs_pos: 0,
            children: children.into(),
            error,
        }
    }

    /// recursively compute and update the positions of each child.
    ///
    /// This should only be called on a root node; it assumes the position
    /// of the callee is `0`.
    ///
    /// This is required in order for us to correctly associate diagnostics
    /// with their locations in the source.
    pub(crate) fn update_positions_from_root(&mut self) {
        self.update_positions_recurse(0)
    }

    fn update_positions_recurse(&mut self, mut pos: usize) {
        self.abs_pos = pos as _;
        // copy-on-write: the children may be shared with another tree
        let children = Arc::make_mut(&mut self.children);

        for child in children {
            child.update_positions(pos);
            pos += child.text_len();
        }
    }

    /// Construct a new cursor for navigating the node's children
    pub(crate) fn cursor(&self) -> Cursor<'_> {
        Cursor::new(self)
    }

    /// Iterate over tokens, descending into child nodes.
    pub fn iter_tokens(&self) -> impl Iterator<Item = &Token> {
        let mut cursor = self.cursor();
        std::iter::from_fn(move || cursor.next_token())
    }

    /// Iterate over this node's direct children, without descending.
    pub fn iter_children(&self) -> ChildIter<'_> {
        ChildIter(Some(self.cursor()))
    }

    /// The `Kind` of the node
    pub fn kind(&self) -> Kind {
        self.kind
    }

    /// The total length of all tokens that are descendants of this node.
    pub fn text_len(&self) -> usize {
        self.text_len as usize
    }

    /// The range in the original source of this node.
    ///
    /// Only correct if this node is accessed via a cursor.
    pub fn range(&self) -> Range<usize> {
        let start = self.abs_pos as usize;
        start..start + (self.text_len as usize)
    }

    /// Create a new tree, replacing the provided ranges with the provided
    /// nodes.
    ///
    /// If `skip_parent` is true, children of inserted nodes are added directly,
    /// without their parent nodes. (We use this when resolving includes.)
    ///
    /// Range start/end must fall on token boundaries.
    pub(crate) fn edit(&self, edits: Vec<(Range<usize>, Node)>, skip_parent: bool) -> Node {
        edit::apply_edits(self, edits, skip_parent)
    }

    /// Recursively collect the include statements in this tree.
    ///
    /// `num` is the total number of includes (counted during parsing); it
    /// lets us return early once every statement has been found.
    fn find_include_nodes(&self, collect: &mut Vec<IncludeStatement>, num: usize) {
        for item in self.iter_children() {
            if let Some(node) = item.as_node() {
                if let Some(include) = typed::Include::cast(item) {
                    collect.push(IncludeStatement {
                        stmt: include,
                        // the kind of the node enclosing the statement
                        // records its scope
                        scope: self.kind,
                    });
                    if collect.len() == num {
                        return;
                    }
                } else {
                    node.find_include_nodes(collect, num);
                }
            }
        }
    }

    #[doc(hidden)]
    // used in some tests for debugging
    pub fn debug_print_structure(&self, include_tokens: bool) {
        let mut cursor = self.cursor();
        while let Some(thing) = cursor.current() {
            match thing {
                NodeOrToken::Node(node) => {
                    let depth = cursor.depth();
                    // ignore write failures: this is best-effort debug output
                    let _ = writeln!(
                        std::io::stderr(),
                        "{}{} ({}..{})",
                        &crate::util::SPACES[..depth * 2],
                        node.kind,
                        cursor.pos(),
                        cursor.pos() + node.text_len()
                    );
                }
                NodeOrToken::Token(t) if include_tokens => eprint!("{}", t.as_str()),
                _ => (),
            }
            cursor.advance();
        }
    }

    #[doc(hidden)]
    pub fn simple_parse_tree(&self) -> String {
        let mut result = String::new();
        // writing to a String cannot fail, so this unwrap never fires
        self.parse_tree_impl(0, &mut result).unwrap();
        result
    }

    // Recursively write an indented description of this subtree into `buf`.
    fn parse_tree_impl(&self, depth: usize, buf: &mut String) -> std::fmt::Result {
        use crate::util::SPACES;
        let mut pos = self.abs_pos;
        writeln!(
            buf,
            "{}{}@[{}; {})",
            &SPACES[..depth * 2],
            self.kind,
            pos,
            pos + self.text_len
        )?;
        let depth = depth + 1;
        for child in self.iter_children() {
            match child {
                NodeOrToken::Token(Token { kind, text, .. }) => {
                    let spaces = &SPACES[..depth * 2];
                    write!(buf, "{spaces}{kind}@{pos}")?;
                    if kind.is_trivia() {
                        writeln!(buf, " \"{}\"", text.escape_debug())?;
                    } else {
                        writeln!(buf, " \"{text}\"")?;
                    }
                    pos += text.len() as u32;
                }
                NodeOrToken::Node(node) => {
                    // NOTE(review): `depth` was already incremented above, so
                    // child nodes print one level deeper than sibling tokens —
                    // confirm whether this extra `+ 1` is intentional.
                    node.parse_tree_impl(depth + 1, buf)?;
                    pos += node.text_len;
                }
            }
        }
        Ok(())
    }
}
401
402impl<'a> Iterator for ChildIter<'a> {
403    type Item = &'a NodeOrToken;
404
405    fn next(&mut self) -> Option<Self::Item> {
406        let current = self.0.as_ref()?.current();
407        self.0.as_mut()?.step_over();
408        current
409    }
410}
411
impl TreeBuilder {
    /// Open a new node of `kind`; children pushed after this call belong to it.
    pub(crate) fn start_node(&mut self, kind: Kind) {
        let len = self.children.len();
        self.parents.push((kind, len));
    }

    /// Push a token with the given kind and text onto the current node.
    pub(crate) fn token(&mut self, kind: Kind, text: impl Into<SmolStr>) {
        let token = Token::new(kind, text.into());
        self.push_raw(token.into());
    }

    fn push_raw(&mut self, item: NodeOrToken) {
        self.children.push(item)
    }

    /// move the children of the currently open node into a buffer.
    ///
    /// This is only used as part of reparsing.
    fn move_current_children(&mut self, to_buf: &mut Vec<NodeOrToken>) {
        if let Some(idx) = self.parents.last().map(|(_, idx)| idx).copied() {
            to_buf.extend(self.children.drain(idx..));
        }
    }

    /// Close the currently open node, optionally overriding its kind.
    ///
    /// Panics if no node is open.
    pub(crate) fn finish_node(&mut self, error: bool, new_kind: Option<Kind>) {
        let (kind, first_child) = self.parents.pop().unwrap();
        let kind = new_kind.unwrap_or(kind);
        let node = Node::new(kind, self.children.split_off(first_child), error);
        self.push_raw(node.into());
    }

    /// Consume the builder, returning the root node.
    ///
    /// Panics unless exactly one root node has been built.
    pub(crate) fn finish(mut self) -> Node {
        assert_eq!(self.children.len(), 1);
        self.children.pop().unwrap().into_node().unwrap()
    }
}
448
449impl NodeOrToken {
450    fn update_positions(&mut self, pos: usize) {
451        match self {
452            NodeOrToken::Token(t) => t.abs_pos = pos as _,
453            NodeOrToken::Node(n) => n.update_positions_recurse(pos),
454        }
455    }
456
457    /// `true` if this is a single token.
458    pub fn is_token(&self) -> bool {
459        matches!(self, NodeOrToken::Token(_))
460    }
461
462    /// If this is a single token, return that token's text
463    pub fn token_text(&self) -> Option<&str> {
464        self.as_token().map(Token::as_str)
465    }
466
467    /// The `Kind` of this node or token
468    pub fn kind(&self) -> Kind {
469        match self {
470            NodeOrToken::Node(n) => n.kind,
471            NodeOrToken::Token(t) => t.kind,
472        }
473    }
474
475    /// `true` If this is a glyph name, a CID, or a glyph class (either inline or named)
476    pub fn is_glyph_or_glyph_class(&self) -> bool {
477        matches!(
478            self.kind(),
479            Kind::GlyphName | Kind::Cid | Kind::GlyphClass | Kind::NamedGlyphClass
480        )
481    }
482
483    /// The range in the source text of this node or token.
484    ///
485    /// Note: this is only accurate if the token was accessed via a cursor.
486    pub fn range(&self) -> Range<usize> {
487        match self {
488            NodeOrToken::Token(t) => t.range(),
489            NodeOrToken::Node(n) => n.range(),
490        }
491    }
492
493    /// The length of this token or node's text
494    pub fn text_len(&self) -> usize {
495        match self {
496            NodeOrToken::Node(n) => n.text_len as usize,
497            NodeOrToken::Token(t) => t.text.len(),
498        }
499    }
500
501    /// If this is a `Node`, return it
502    pub fn into_node(self) -> Option<Node> {
503        match self {
504            NodeOrToken::Node(node) => Some(node),
505            NodeOrToken::Token(_) => None,
506        }
507    }
508
509    /// If this is a `Node`, return a reference to it
510    pub fn as_node(&self) -> Option<&Node> {
511        match self {
512            NodeOrToken::Node(node) => Some(node),
513            NodeOrToken::Token(_) => None,
514        }
515    }
516
517    /// IF this is a token, return a reference to it.
518    pub fn as_token(&self) -> Option<&Token> {
519        match self {
520            NodeOrToken::Node(_) => None,
521            NodeOrToken::Token(token) => Some(token),
522        }
523    }
524}
525
// allows passing a bare `Node` anywhere a `NodeOrToken` is expected
impl From<Node> for NodeOrToken {
    fn from(src: Node) -> NodeOrToken {
        NodeOrToken::Node(src)
    }
}
531
// allows passing a bare `Token` anywhere a `NodeOrToken` is expected
impl From<Token> for NodeOrToken {
    fn from(src: Token) -> NodeOrToken {
        NodeOrToken::Token(src)
    }
}
537
538impl Token {
539    fn new(kind: Kind, text: SmolStr) -> Self {
540        Token {
541            kind,
542            text,
543            abs_pos: 0,
544        }
545    }
546
547    /// The raw text for this token
548    pub fn as_str(&self) -> &str {
549        &self.text
550    }
551
552    /// The position of this token in its source.
553    pub fn range(&self) -> Range<usize> {
554        self.abs_pos as usize..self.abs_pos as usize + self.text.len()
555    }
556}
557
558/// try to split a glyph containing hyphens into a glyph range.
559fn try_split_range(text: &str, glyph_map: &GlyphMap) -> Result<Node, String> {
560    let mut solution = None;
561
562    // we try all possible split points
563    for idx in text
564        .bytes()
565        .enumerate()
566        .filter_map(|(idx, b)| (b == b'-').then_some(idx))
567    {
568        let (head, tail) = text.split_at(idx);
569        if glyph_map.contains(head)
570            && glyph_map.contains(tail.trim_start_matches('-'))
571            && let Some(prev_idx) = solution.replace(idx)
572        {
573            let (head1, tail1) = text.split_at(prev_idx);
574            let (head2, tail2) = text.split_at(idx);
575            let message = format!(
576                "the name '{}' contains multiple possible glyph ranges ({} to {} and {} to {}). Please insert spaces around the '-' to clarify your intent.",
577                text,
578                head1,
579                tail1.trim_end_matches('-'),
580                head2,
581                tail2.trim_end_matches('-')
582            );
583            return Err(message);
584        }
585    }
586
587    // if we have a solution, generate a new node
588    solution
589        .map(|idx| {
590            let mut builder = TreeBuilder::default();
591            builder.start_node(Kind::GlyphRange);
592            let (head, tail) = text.split_at(idx);
593            builder.token(Kind::GlyphName, head);
594            builder.token(Kind::Hyphen, "-");
595            builder.token(Kind::GlyphName, tail.trim_start_matches('-'));
596            builder.finish_node(false, None);
597            builder.finish()
598        })
599        .ok_or_else(|| format!("'{text}' is neither a known glyph or a range of known glyphs",))
600}
601
impl Node {
    /// Recursively write a debug representation of this node and its
    /// children to `f`, indented two spaces per level of `depth`.
    fn debug_impl(&self, f: &mut std::fmt::Formatter, depth: usize) -> std::fmt::Result {
        use crate::util::SPACES;

        let ws = &SPACES[..depth * 2];
        write!(
            f,
            "\n{ws}{}:  abs {} len {} children {}",
            self.kind,
            self.abs_pos,
            self.text_len,
            self.children.len()
        )?;
        // children are indented one level deeper
        let ws = &SPACES[..(depth + 1) * 2];
        for child in self.iter_children() {
            match child {
                NodeOrToken::Token(t) => write!(f, "\n{}'{}' {}", ws, t.text, t.kind)?,
                NodeOrToken::Node(n) => n.debug_impl(f, depth + 1)?,
            }
        }
        Ok(())
    }
}
625
// Debug output is the recursive tree dump produced by `debug_impl`.
impl std::fmt::Debug for Node {
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        self.debug_impl(f, 0)
    }
}
631
#[cfg(test)]
mod tests {

    use super::*;
    static SAMPLE_FEA: &str = include_str!("../test-data/fonttools-tests/mini.fea");

    /// Concatenating every token of the parsed tree must reproduce the
    /// input source exactly (the tree is lossless).
    #[test]
    fn token_iter() {
        let (ast, _errs) = crate::parse::parse_string(SAMPLE_FEA);
        let reconstruct = ast
            .root()
            .iter_tokens()
            .map(Token::as_str)
            .collect::<String>();
        crate::assert_eq_str!(SAMPLE_FEA, reconstruct);
    }
}