tree_house/
lib.rs

1use locals::Locals;
2use ropey::RopeSlice;
3
4use slab::Slab;
5
6use std::fmt;
7use std::hash::{Hash, Hasher};
8use std::time::Duration;
9use tree_sitter::{IncompatibleGrammarError, Node, Tree};
10
11pub use crate::config::{read_query, LanguageConfig, LanguageLoader};
12pub use crate::injections_query::{InjectionLanguageMarker, InjectionsQuery};
13use crate::parse::LayerUpdateFlags;
14pub use crate::tree_cursor::TreeCursor;
15pub use tree_sitter;
16// pub use pretty_print::pretty_print_tree;
17// pub use tree_cursor::TreeCursor;
18
19mod config;
20pub mod highlighter;
21mod injections_query;
22mod parse;
23#[cfg(all(test, feature = "fixtures"))]
24mod tests;
25// mod pretty_print;
26#[cfg(feature = "fixtures")]
27pub mod fixtures;
28pub mod locals;
29pub mod query_iter;
30pub mod text_object;
31mod tree_cursor;
32
33/// A layer represents a single a single syntax tree that represents (part of)
34/// a file parsed with a tree-sitter grammar. See [`Syntax`].
35#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
36pub struct Layer(u32);
37
38impl Layer {
39    fn idx(self) -> usize {
40        self.0 as usize
41    }
42}
43
44#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
45pub struct Language(pub u32);
46
47impl Language {
48    pub fn new(idx: u32) -> Language {
49        Language(idx)
50    }
51
52    pub fn idx(self) -> usize {
53        self.0 as usize
54    }
55}
56
57/// The Tree sitter syntax tree for a single language.
58///
59/// This is really multiple (nested) different syntax trees due to tree sitter
60/// injections. A single syntax tree/parser is called layer. Each layer
61/// is parsed as a single "file" by tree sitter. There can be multiple layers
62/// for the same language. A layer corresponds to one of three things:
63/// * the root layer
64/// * a singular injection limited to a single node in its parent layer
65/// * Multiple injections (multiple disjoint nodes in parent layer) that are
66///   parsed as though they are a single uninterrupted file.
67///
68/// An injection always refer to a single node into which another layer is
69/// injected. As injections only correspond to syntax tree nodes injections in
70/// the same layer do not intersect. However, the syntax tree in a an injected
71/// layer can have nodes that intersect with nodes from the parent layer. For
72/// example:
73///
74/// ``` no-compile
75/// layer2: | Sibling A |      Sibling B (layer3)     | Sibling C |
76/// layer1: | Sibling A (layer2) | Sibling B | Sibling C (layer2) |
77/// ````
78///
79/// In this case Sibling B really spans across a "GAP" in layer2. While the syntax
80/// node can not be split up by tree sitter directly, we can treat Sibling B as two
81/// separate injections. That is done while parsing/running the query capture. As
82/// a result the injections form a tree. Note that such other queries must account for
83/// such multi injection nodes.
84#[derive(Debug)]
85pub struct Syntax {
86    layers: Slab<LayerData>,
87    root: Layer,
88}
89
90impl Syntax {
91    pub fn new(
92        source: RopeSlice,
93        language: Language,
94        timeout: Duration,
95        loader: &impl LanguageLoader,
96    ) -> Result<Self, Error> {
97        let root_layer = LayerData {
98            parse_tree: None,
99            language,
100            flags: LayerUpdateFlags::default(),
101            ranges: vec![tree_sitter::Range {
102                start_byte: 0,
103                end_byte: u32::MAX,
104                start_point: tree_sitter::Point::ZERO,
105                end_point: tree_sitter::Point::MAX,
106            }],
107            injections: Vec::new(),
108            parent: None,
109            locals: Locals::default(),
110        };
111        let mut layers = Slab::with_capacity(32);
112        let root = layers.insert(root_layer);
113        let mut syntax = Self {
114            root: Layer(root as u32),
115            layers,
116        };
117
118        syntax.update(source, timeout, &[], loader).map(|_| syntax)
119    }
120
121    pub fn layer(&self, layer: Layer) -> &LayerData {
122        &self.layers[layer.idx()]
123    }
124
125    fn layer_mut(&mut self, layer: Layer) -> &mut LayerData {
126        &mut self.layers[layer.idx()]
127    }
128
129    pub fn root(&self) -> Layer {
130        self.root
131    }
132
133    pub fn tree(&self) -> &Tree {
134        self.layer(self.root)
135            .tree()
136            .expect("`Syntax::new` would err if the root layer's tree could not be parsed")
137    }
138
139    #[inline]
140    pub fn tree_for_byte_range(&self, start: u32, end: u32) -> &Tree {
141        self.layer_and_tree_for_byte_range(start, end).1
142    }
143
144    /// Finds the smallest layer which has a parse tree and covers the given range.
145    pub(crate) fn layer_and_tree_for_byte_range(&self, start: u32, end: u32) -> (Layer, &Tree) {
146        let mut layer = self.layer_for_byte_range(start, end);
147        loop {
148            // NOTE: this loop is guaranteed to terminate because the root layer always has a
149            // tree.
150            if let Some(tree) = self.layer(layer).tree() {
151                return (layer, tree);
152            }
153            if let Some(parent) = self.layer(layer).parent {
154                layer = parent;
155            }
156        }
157    }
158
159    #[inline]
160    pub fn named_descendant_for_byte_range(&self, start: u32, end: u32) -> Option<Node<'_>> {
161        self.tree_for_byte_range(start, end)
162            .root_node()
163            .named_descendant_for_byte_range(start, end)
164    }
165
166    #[inline]
167    pub fn descendant_for_byte_range(&self, start: u32, end: u32) -> Option<Node<'_>> {
168        self.tree_for_byte_range(start, end)
169            .root_node()
170            .descendant_for_byte_range(start, end)
171    }
172
173    pub fn layer_for_byte_range(&self, start: u32, end: u32) -> Layer {
174        let mut cursor = self.root;
175        loop {
176            let layer = &self.layers[cursor.idx()];
177            let Some(start_injection) = layer.injection_at_byte_idx(start) else {
178                break;
179            };
180            // +1 because the end is exclusive.
181            let Some(end_injection) = layer.injection_at_byte_idx(end + 1) else {
182                break;
183            };
184            if start_injection.layer == end_injection.layer {
185                cursor = start_injection.layer;
186            } else {
187                break;
188            }
189        }
190        cursor
191    }
192
193    pub fn walk(&self) -> TreeCursor {
194        TreeCursor::new(self)
195    }
196}
197
198#[derive(Debug, Clone)]
199pub struct Injection {
200    pub range: Range,
201    pub layer: Layer,
202    matched_node_range: Range,
203}
204
205#[derive(Debug)]
206pub struct LayerData {
207    pub language: Language,
208    parse_tree: Option<Tree>,
209    ranges: Vec<tree_sitter::Range>,
210    /// a list of **sorted** non-overlapping injection ranges. Note that
211    /// injection ranges are not relative to the start of this layer but the
212    /// start of the root layer
213    injections: Vec<Injection>,
214    /// internal flags used during parsing to track incremental invalidation
215    flags: LayerUpdateFlags,
216    parent: Option<Layer>,
217    locals: Locals,
218}
219
220/// This PartialEq implementation only checks if that
221/// two layers are theoretically identical (meaning they highlight the same text range with the same language).
222/// It does not check whether the layers have the same internal tree-sitter
223/// state.
224impl PartialEq for LayerData {
225    fn eq(&self, other: &Self) -> bool {
226        self.parent == other.parent
227            && self.language == other.language
228            && self.ranges == other.ranges
229    }
230}
231
232/// Hash implementation belongs to PartialEq implementation above.
233/// See its documentation for details.
234impl Hash for LayerData {
235    fn hash<H: Hasher>(&self, state: &mut H) {
236        self.parent.hash(state);
237        self.language.hash(state);
238        self.ranges.hash(state);
239    }
240}
241
242impl LayerData {
243    /// Returns the parsed `Tree` for this layer.
244    ///
245    /// This `Option` will always be `Some` when the `LanguageLoader` passed to `Syntax::new`
246    /// returns `Some` when passed the layer's language in `LanguageLoader::get_config`.
247    pub fn tree(&self) -> Option<&Tree> {
248        self.parse_tree.as_ref()
249    }
250
251    /// Returns the injection range **within this layers** that contains `idx`.
252    /// This function will not descend into nested injections
253    pub fn injection_at_byte_idx(&self, idx: u32) -> Option<&Injection> {
254        self.injections_at_byte_idx(idx)
255            .next()
256            .filter(|injection| injection.range.start <= idx)
257    }
258
259    /// Returns the injection ranges **within this layers** that contain
260    /// `idx` or start after idx. This function will not descend into nested
261    /// injections.
262    pub fn injections_at_byte_idx(&self, idx: u32) -> impl Iterator<Item = &Injection> {
263        let i = self
264            .injections
265            .partition_point(|range| range.range.end < idx);
266        self.injections[i..].iter()
267    }
268}
269
270/// Represents the reason why syntax highlighting failed.
271#[derive(Debug, PartialEq, Eq)]
272pub enum Error {
273    Timeout,
274    ExceededMaximumSize,
275    InvalidRanges,
276    Unknown,
277    NoRootConfig,
278    IncompatibleGrammar(Language, IncompatibleGrammarError),
279}
280
281impl fmt::Display for Error {
282    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
283        match self {
284            Self::Timeout => f.write_str("configured timeout was exceeded"),
285            Self::ExceededMaximumSize => f.write_str("input text exceeds the maximum allowed size"),
286            Self::InvalidRanges => f.write_str("invalid ranges"),
287            Self::Unknown => f.write_str("an unknown error occurred"),
288            Self::NoRootConfig => f.write_str(
289                "`LanguageLoader::get_config` for the root layer language returned `None`",
290            ),
291            Self::IncompatibleGrammar(language, IncompatibleGrammarError { abi_version }) => {
292                write!(
293                    f,
294                    "failed to load grammar for language {language:?} with ABI version {abi_version}"
295                )
296            }
297        }
298    }
299}
300
301/// The maximum number of in-progress matches a TS cursor can consider at once.
302/// This is set to a constant in order to avoid performance problems for medium to large files. Set with `set_match_limit`.
303/// Using such a limit means that we lose valid captures, so there is fundamentally a tradeoff here.
304///
305///
306/// Old tree sitter versions used a limit of 32 by default until this limit was removed in version `0.19.5` (must now be set manually).
307/// However, this causes performance issues for medium to large files.
308/// In Helix, this problem caused tree-sitter motions to take multiple seconds to complete in medium-sized rust files (3k loc).
309///
310///
311/// Neovim also encountered this problem and reintroduced this limit after it was removed upstream
312/// (see <https://github.com/neovim/neovim/issues/14897> and <https://github.com/neovim/neovim/pull/14915>).
313/// The number used here is fundamentally a tradeoff between breaking some obscure edge cases and performance.
314///
315///
316/// Neovim chose 64 for this value somewhat arbitrarily (<https://github.com/neovim/neovim/pull/18397>).
317/// 64 is too low for some languages though. In particular, it breaks some highlighting for record fields in Erlang record definitions.
318/// This number can be increased if new syntax highlight breakages are found, as long as the performance penalty is not too high.
319pub const TREE_SITTER_MATCH_LIMIT: u32 = 256;
320
321// use 32 bit ranges since TS doesn't support files larger than 2GiB anyway
322// and it allows us to save a lot memory/improve cache efficiency
323type Range = std::ops::Range<u32>;