tree_house/
lib.rs

1use locals::Locals;
2use ropey::RopeSlice;
3
4use slab::Slab;
5
6use std::fmt;
7use std::hash::{Hash, Hasher};
8use std::time::Duration;
9use tree_sitter::{IncompatibleGrammarError, Node, Tree};
10
11pub use crate::config::{read_query, LanguageConfig, LanguageLoader};
12pub use crate::injections_query::{InjectionLanguageMarker, InjectionsQuery};
13use crate::parse::LayerUpdateFlags;
14pub use crate::tree_cursor::TreeCursor;
15pub use tree_sitter;
16// pub use pretty_print::pretty_print_tree;
17// pub use tree_cursor::TreeCursor;
18
19mod config;
20pub mod highlighter;
21mod injections_query;
22mod parse;
23#[cfg(all(test, feature = "fixtures"))]
24mod tests;
25// mod pretty_print;
26#[cfg(feature = "fixtures")]
27pub mod fixtures;
28pub mod locals;
29pub mod query_iter;
30pub mod text_object;
31mod tree_cursor;
32
33/// A layer represents a single a single syntax tree that represents (part of)
34/// a file parsed with a tree-sitter grammar. See [`Syntax`].
35#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
36pub struct Layer(u32);
37
38impl Layer {
39    fn idx(self) -> usize {
40        self.0 as usize
41    }
42}
43
44#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
45pub struct Language(pub u32);
46
47impl Language {
48    pub fn new(idx: u32) -> Language {
49        Language(idx)
50    }
51
52    pub fn idx(self) -> usize {
53        self.0 as usize
54    }
55}
56
57/// The Tree sitter syntax tree for a single language.
58///
59/// This is really multiple (nested) different syntax trees due to tree sitter
60/// injections. A single syntax tree/parser is called layer. Each layer
61/// is parsed as a single "file" by tree sitter. There can be multiple layers
62/// for the same language. A layer corresponds to one of three things:
63/// * the root layer
64/// * a singular injection limited to a single node in its parent layer
65/// * Multiple injections (multiple disjoint nodes in parent layer) that are
66///   parsed as though they are a single uninterrupted file.
67///
68/// An injection always refer to a single node into which another layer is
69/// injected. As injections only correspond to syntax tree nodes injections in
70/// the same layer do not intersect. However, the syntax tree in a an injected
71/// layer can have nodes that intersect with nodes from the parent layer. For
72/// example:
73///
74/// ``` no-compile
75/// layer2: | Sibling A |      Sibling B (layer3)     | Sibling C |
76/// layer1: | Sibling A (layer2) | Sibling B | Sibling C (layer2) |
77/// ````
78///
79/// In this case Sibling B really spans across a "GAP" in layer2. While the syntax
80/// node can not be split up by tree sitter directly, we can treat Sibling B as two
81/// separate injections. That is done while parsing/running the query capture. As
82/// a result the injections form a tree. Note that such other queries must account for
83/// such multi injection nodes.
84#[derive(Debug)]
85pub struct Syntax {
86    layers: Slab<LayerData>,
87    root: Layer,
88}
89
90impl Syntax {
91    pub fn new(
92        source: RopeSlice,
93        language: Language,
94        timeout: Duration,
95        loader: &impl LanguageLoader,
96    ) -> Result<Self, Error> {
97        let root_layer = LayerData {
98            parse_tree: None,
99            language,
100            flags: LayerUpdateFlags::default(),
101            ranges: vec![tree_sitter::Range {
102                start_byte: 0,
103                end_byte: u32::MAX,
104                start_point: tree_sitter::Point::ZERO,
105                end_point: tree_sitter::Point::MAX,
106            }],
107            injections: Vec::new(),
108            parent: None,
109            locals: Locals::default(),
110        };
111        let mut layers = Slab::with_capacity(32);
112        let root = layers.insert(root_layer);
113        let mut syntax = Self {
114            root: Layer(root as u32),
115            layers,
116        };
117
118        syntax.update(source, timeout, &[], loader).map(|_| syntax)
119    }
120
121    pub fn layer(&self, layer: Layer) -> &LayerData {
122        &self.layers[layer.idx()]
123    }
124
125    fn layer_mut(&mut self, layer: Layer) -> &mut LayerData {
126        &mut self.layers[layer.idx()]
127    }
128
129    pub fn root(&self) -> Layer {
130        self.root
131    }
132
133    pub fn tree(&self) -> &Tree {
134        self.layer(self.root)
135            .tree()
136            .expect("`Syntax::new` would err if the root layer's tree could not be parsed")
137    }
138
139    #[inline]
140    pub fn tree_for_byte_range(&self, start: u32, end: u32) -> &Tree {
141        self.layer_and_tree_for_byte_range(start, end).1
142    }
143
144    /// Finds the smallest layer which has a parse tree and covers the given range.
145    pub(crate) fn layer_and_tree_for_byte_range(&self, start: u32, end: u32) -> (Layer, &Tree) {
146        let mut layer = self.layer_for_byte_range(start, end);
147        loop {
148            // NOTE: this loop is guaranteed to terminate because the root layer always has a
149            // tree.
150            if let Some(tree) = self.layer(layer).tree() {
151                return (layer, tree);
152            }
153            if let Some(parent) = self.layer(layer).parent {
154                layer = parent;
155            }
156        }
157    }
158
159    #[inline]
160    pub fn named_descendant_for_byte_range(&self, start: u32, end: u32) -> Option<Node<'_>> {
161        self.tree_for_byte_range(start, end)
162            .root_node()
163            .named_descendant_for_byte_range(start, end)
164    }
165
166    #[inline]
167    pub fn descendant_for_byte_range(&self, start: u32, end: u32) -> Option<Node<'_>> {
168        self.tree_for_byte_range(start, end)
169            .root_node()
170            .descendant_for_byte_range(start, end)
171    }
172
173    /// Finds the smallest injection layer that fully includes the range `start..=end`.
174    pub fn layer_for_byte_range(&self, start: u32, end: u32) -> Layer {
175        self.layers_for_byte_range(start, end)
176            .last()
177            .expect("always includes the root layer")
178    }
179
180    /// Returns an iterator of layers which **fully include** the byte range `start..=end`,
181    /// in decreasing order based on the size of each layer.
182    ///
183    /// The first layer is always the `root` layer.
184    pub fn layers_for_byte_range(&self, start: u32, end: u32) -> impl Iterator<Item = Layer> + '_ {
185        let mut parent_injection_layer = self.root;
186
187        std::iter::once(self.root).chain(std::iter::from_fn(move || {
188            let layer = &self.layers[parent_injection_layer.idx()];
189
190            let injection_at_start = layer.injection_at_byte_idx(start)?;
191
192            // +1 because the end is exclusive.
193            let injection_at_end = layer.injection_at_byte_idx(end + 1)?;
194
195            (injection_at_start.layer == injection_at_end.layer).then(|| {
196                parent_injection_layer = injection_at_start.layer;
197
198                injection_at_start.layer
199            })
200        }))
201    }
202
203    pub fn walk(&self) -> TreeCursor {
204        TreeCursor::new(self)
205    }
206}
207
208#[derive(Debug, Clone)]
209pub struct Injection {
210    pub range: Range,
211    pub layer: Layer,
212    matched_node_range: Range,
213}
214
215#[derive(Debug)]
216pub struct LayerData {
217    pub language: Language,
218    parse_tree: Option<Tree>,
219    ranges: Vec<tree_sitter::Range>,
220    /// a list of **sorted** non-overlapping injection ranges. Note that
221    /// injection ranges are not relative to the start of this layer but the
222    /// start of the root layer
223    injections: Vec<Injection>,
224    /// internal flags used during parsing to track incremental invalidation
225    flags: LayerUpdateFlags,
226    parent: Option<Layer>,
227    locals: Locals,
228}
229
230/// This PartialEq implementation only checks if that
231/// two layers are theoretically identical (meaning they highlight the same text range with the same language).
232/// It does not check whether the layers have the same internal tree-sitter
233/// state.
234impl PartialEq for LayerData {
235    fn eq(&self, other: &Self) -> bool {
236        self.parent == other.parent
237            && self.language == other.language
238            && self.ranges == other.ranges
239    }
240}
241
242/// Hash implementation belongs to PartialEq implementation above.
243/// See its documentation for details.
244impl Hash for LayerData {
245    fn hash<H: Hasher>(&self, state: &mut H) {
246        self.parent.hash(state);
247        self.language.hash(state);
248        self.ranges.hash(state);
249    }
250}
251
252impl LayerData {
253    /// Returns the parsed `Tree` for this layer.
254    ///
255    /// This `Option` will always be `Some` when the `LanguageLoader` passed to `Syntax::new`
256    /// returns `Some` when passed the layer's language in `LanguageLoader::get_config`.
257    pub fn tree(&self) -> Option<&Tree> {
258        self.parse_tree.as_ref()
259    }
260
261    /// Returns the injection range **within this layers** that contains `idx`.
262    /// This function will not descend into nested injections
263    pub fn injection_at_byte_idx(&self, idx: u32) -> Option<&Injection> {
264        self.injections_at_byte_idx(idx)
265            .next()
266            .filter(|injection| injection.range.start <= idx)
267    }
268
269    /// Returns the injection ranges **within this layers** that contain
270    /// `idx` or start after idx. This function will not descend into nested
271    /// injections.
272    pub fn injections_at_byte_idx(&self, idx: u32) -> impl Iterator<Item = &Injection> {
273        let i = self
274            .injections
275            .partition_point(|range| range.range.end < idx);
276        self.injections[i..].iter()
277    }
278}
279
280/// Represents the reason why syntax highlighting failed.
281#[derive(Debug, PartialEq, Eq)]
282pub enum Error {
283    Timeout,
284    ExceededMaximumSize,
285    InvalidRanges,
286    Unknown,
287    NoRootConfig,
288    IncompatibleGrammar(Language, IncompatibleGrammarError),
289}
290
291impl fmt::Display for Error {
292    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
293        match self {
294            Self::Timeout => f.write_str("configured timeout was exceeded"),
295            Self::ExceededMaximumSize => f.write_str("input text exceeds the maximum allowed size"),
296            Self::InvalidRanges => f.write_str("invalid ranges"),
297            Self::Unknown => f.write_str("an unknown error occurred"),
298            Self::NoRootConfig => f.write_str(
299                "`LanguageLoader::get_config` for the root layer language returned `None`",
300            ),
301            Self::IncompatibleGrammar(language, IncompatibleGrammarError { abi_version }) => {
302                write!(
303                    f,
304                    "failed to load grammar for language {language:?} with ABI version {abi_version}"
305                )
306            }
307        }
308    }
309}
310
311/// The maximum number of in-progress matches a TS cursor can consider at once.
312/// This is set to a constant in order to avoid performance problems for medium to large files. Set with `set_match_limit`.
313/// Using such a limit means that we lose valid captures, so there is fundamentally a tradeoff here.
314///
315///
316/// Old tree sitter versions used a limit of 32 by default until this limit was removed in version `0.19.5` (must now be set manually).
317/// However, this causes performance issues for medium to large files.
318/// In Helix, this problem caused tree-sitter motions to take multiple seconds to complete in medium-sized rust files (3k loc).
319///
320///
321/// Neovim also encountered this problem and reintroduced this limit after it was removed upstream
322/// (see <https://github.com/neovim/neovim/issues/14897> and <https://github.com/neovim/neovim/pull/14915>).
323/// The number used here is fundamentally a tradeoff between breaking some obscure edge cases and performance.
324///
325///
326/// Neovim chose 64 for this value somewhat arbitrarily (<https://github.com/neovim/neovim/pull/18397>).
327/// 64 is too low for some languages though. In particular, it breaks some highlighting for record fields in Erlang record definitions.
328/// This number can be increased if new syntax highlight breakages are found, as long as the performance penalty is not too high.
329pub const TREE_SITTER_MATCH_LIMIT: u32 = 256;
330
331// use 32 bit ranges since TS doesn't support files larger than 2GiB anyway
332// and it allows us to save a lot memory/improve cache efficiency
333type Range = std::ops::Range<u32>;