Skip to main content

tree_house_bindings/
parser.rs

1use std::cell::Cell;
2use std::os::raw::c_void;
3use std::panic::{catch_unwind, AssertUnwindSafe};
4use std::ptr::NonNull;
5use std::time::Duration;
6use std::{fmt, mem, ptr};
7
8use regex_cursor::Cursor;
9
10use crate::grammar::IncompatibleGrammarError;
11use crate::tree::{SyntaxTreeData, Tree};
12use crate::{Grammar, Input, IntoInput, Point, Range};
13
/// Opaque stand-in for the parser object that lives behind the FFI boundary.
///
/// The enum has no variants, so no value of it can be constructed in Rust; it only
/// appears behind `NonNull<ParserData>` pointers passed to the `ts_parser_*` functions.
enum ParserData {}
16
// Per-thread slot holding at most one idle parser. `Parser`'s `Drop` impl stores its
// pointer here and `Parser::new` takes it back out before falling back to
// `ts_parser_new`.
#[clippy::msrv = "1.76.0"]
thread_local! {
    static PARSER_CACHE: Cell<Option<RawParser>> = const { Cell::new(None) };
}
21
/// Owning handle for a parser stored in `PARSER_CACHE`.
///
/// Dropping a `RawParser` deletes the underlying parser through `ts_parser_delete`.
struct RawParser {
    /// Pointer to the parser object that is deleted on drop.
    ptr: NonNull<ParserData>,
}
25
26impl Drop for RawParser {
27    fn drop(&mut self) {
28        unsafe { ts_parser_delete(self.ptr) }
29    }
30}
31
/// A stateful object that is used to produce a [`Tree`] based on some
/// source code.
pub struct Parser {
    /// Pointer to the underlying parser object handed to the `ts_parser_*` functions.
    ptr: NonNull<ParserData>,
}
37
38impl Parser {
39    /// Create a new parser.
40    #[must_use]
41    pub fn new() -> Parser {
42        let ptr = match PARSER_CACHE.take() {
43            Some(cached) => {
44                let ptr = cached.ptr;
45                mem::forget(cached);
46                ptr
47            }
48            None => unsafe { ts_parser_new() },
49        };
50        Parser { ptr }
51    }
52
53    /// Set the language that the parser should use for parsing.
54    pub fn set_grammar(&mut self, grammar: Grammar) -> Result<(), IncompatibleGrammarError> {
55        if unsafe { ts_parser_set_language(self.ptr, grammar) } {
56            Ok(())
57        } else {
58            Err(IncompatibleGrammarError {
59                abi_version: grammar.abi_version(),
60            })
61        }
62    }
63
64    pub fn set_timeout(&mut self, duration: Duration) {
65        #[allow(deprecated)]
66        unsafe {
67            ts_parser_set_timeout_micros(self.ptr, duration.as_micros().try_into().unwrap());
68        }
69    }
70
71    /// Set the ranges of text that the parser should include when parsing. By default, the parser
72    /// will always include entire documents. This function allows you to parse only a *portion*
73    /// of a document but still return a syntax tree whose ranges match up with the document as a
74    /// whole. You can also pass multiple disjoint ranges.
75    ///
76    /// `ranges` must be non-overlapping and sorted.
77    pub fn set_included_ranges(&mut self, ranges: &[Range]) -> Result<(), InvalidRangesError> {
78        // TODO: save some memory by only storing byte ranges and converting them to TS ranges in an
79        // internal buffer here. Points are not used by TS. Alternatively we can patch the TS C code
80        // to accept a simple pair (struct with two fields) of byte positions here instead of a full
81        // tree sitter range
82        let success = unsafe {
83            ts_parser_set_included_ranges(self.ptr, ranges.as_ptr(), ranges.len() as u32)
84        };
85        if success {
86            Ok(())
87        } else {
88            Err(InvalidRangesError)
89        }
90    }
91
92    #[must_use]
93    pub fn parse<I: Input>(
94        &mut self,
95        input: impl IntoInput<Input = I>,
96        old_tree: Option<&Tree>,
97    ) -> Option<Tree> {
98        let mut input = input.into_input();
99        unsafe extern "C" fn read<C: Input>(
100            payload: NonNull<c_void>,
101            byte_index: u32,
102            _position: Point,
103            bytes_read: *mut u32,
104        ) -> *const u8 {
105            let cursor = catch_unwind(AssertUnwindSafe(move || {
106                let input: &mut C = payload.cast().as_mut();
107                let cursor = input.cursor_at(byte_index);
108                let slice = cursor.chunk();
109                let offset: u32 = cursor.offset().try_into().unwrap();
110                let len: u32 = slice.len().try_into().unwrap();
111                (byte_index - offset, slice.as_ptr(), len)
112            }));
113            match cursor {
114                Ok((chunk_offset, ptr, len)) if chunk_offset < len => {
115                    *bytes_read = len - chunk_offset;
116                    ptr.add(chunk_offset as usize)
117                }
118                _ => {
119                    *bytes_read = 0;
120                    ptr::null()
121                }
122            }
123        }
124        let input = ParserInputRaw {
125            payload: NonNull::from(&mut input).cast(),
126            read: read::<I>,
127            encoding: InputEncoding::Utf8,
128            decode: None,
129        };
130
131        unsafe {
132            let old_tree = old_tree.map(|tree| tree.as_raw());
133            let new_tree = ts_parser_parse(self.ptr, old_tree, input);
134            new_tree.map(|raw| Tree::from_raw(raw))
135        }
136    }
137}
138
139impl Default for Parser {
140    fn default() -> Self {
141        Self::new()
142    }
143}
144
// SAFETY(review): `Parser` only holds a raw pointer, so these impls are what allow it to
// be moved to and shared between threads. The visible methods that use the pointer all
// take `&mut self`; soundness additionally assumes the underlying parser has no thread
// affinity — TODO confirm against the tree-sitter documentation.
unsafe impl Sync for Parser {}
unsafe impl Send for Parser {}
147
148impl Drop for Parser {
149    fn drop(&mut self) {
150        PARSER_CACHE.set(Some(RawParser { ptr: self.ptr }));
151    }
152}
153
/// An error returned when the ranges passed to [`Parser::set_included_ranges`] are
/// rejected, i.e. they overlap or are not sorted.
#[derive(Debug, PartialEq, Eq)]
pub struct InvalidRangesError;

impl fmt::Display for InvalidRangesError {
    /// Writes the fixed, human-readable description of the error.
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("include ranges overlap or are not sorted")
    }
}

impl std::error::Error for InvalidRangesError {}
165
/// Signature of the read callback stored in [`ParserInputRaw::read`].
///
/// The callback receives the `payload` pointer from the same `ParserInputRaw`, the byte
/// index (and point) to read from, and an out-pointer `bytes_read`. It returns a pointer
/// to the available bytes and stores their count in `bytes_read`; the implementation in
/// `Parser::parse` reports "no data" as a null pointer with a count of 0.
type TreeSitterReadFn = unsafe extern "C" fn(
    payload: NonNull<c_void>,
    byte_index: u32,
    position: Point,
    bytes_read: *mut u32,
) -> *const u8;
172
/// A function that reads one code point from the given string, returning the number of bytes
/// consumed.
///
/// NOTE(review): `code_point` is declared `*const i32`, yet the description suggests the
/// callee reports the decoded code point through it — confirm whether `*mut i32` is the
/// intended type.
type DecodeInputFn =
    unsafe extern "C" fn(string: *const u8, length: u32, code_point: *const i32) -> u32;
177
/// Description of the input handed by value to `ts_parser_parse`.
///
/// `#[repr(C)]` fixes the field order and layout for the FFI boundary.
#[repr(C)]
#[derive(Debug)]
pub struct ParserInputRaw {
    /// Opaque pointer that is passed back to `read` as its first argument.
    pub payload: NonNull<c_void>,
    /// Callback used to fetch the bytes at a given position.
    pub read: TreeSitterReadFn,
    /// Encoding of the bytes produced by `read`.
    pub encoding: InputEncoding,
    /// A function to decode the input.
    ///
    /// This function is only used if the encoding is `InputEncoding::Custom`.
    pub decode: Option<DecodeInputFn>,
}
189
/// Text encoding of parser input; the Rust-side counterpart of `TSInputEncoding`.
///
/// `#[repr(u32)]` with implicit discriminants gives the variants the values 0 through 3
/// in declaration order.
#[repr(u32)]
#[derive(Debug, Clone, Copy)]
pub enum InputEncoding {
    /// UTF-8 input; the encoding `Parser::parse` always selects.
    Utf8,
    /// UTF-16, little-endian.
    Utf16LE,
    /// UTF-16, big-endian.
    Utf16BE,
    /// Custom encoding, decoded through `ParserInputRaw::decode`.
    Custom,
}
199
/// State passed to a [`ProgressCallback`].
///
/// `#[repr(C)]` fixes the layout for the FFI boundary. Nothing in the visible code
/// constructs or reads this type yet, hence `#[allow(unused)]`.
#[allow(unused)]
#[repr(C)]
#[derive(Debug)]
struct ParseState {
    /// The payload passed via `ParseOptions`' `payload` field.
    payload: NonNull<c_void>,
    /// NOTE(review): presumably the byte offset the parser has reached so far — confirm.
    current_byte_offset: u32,
    /// NOTE(review): presumably whether an error was encountered so far — confirm.
    has_error: bool,
}
209
/// A function that accepts the current parser state and returns `true` when the parse should be
/// cancelled.
///
/// Only referenced by `ParseOptions::progress_callback` in the visible code.
#[allow(unused)]
type ProgressCallback = unsafe extern "C" fn(state: NonNull<ParseState>) -> bool;
214
/// Options accepted by `ts_parser_parse_with_options`.
///
/// `#[repr(C)]` fixes the layout for the FFI boundary; the derived `Default` leaves both
/// fields as `None`. Not used by any call in the visible code, hence `#[allow(unused)]`.
#[allow(unused)]
#[repr(C)]
#[derive(Debug, Default)]
struct ParseOptions {
    /// Optional opaque pointer; per `ParseState`'s docs it is exposed to the callback
    /// through `ParseState::payload`.
    payload: Option<NonNull<c_void>>,
    /// Optional callback that can cancel the parse by returning `true`.
    progress_callback: Option<ProgressCallback>,
}
222
// Declarations of the C parser functions these bindings call.
extern "C" {
    /// Create a new parser
    fn ts_parser_new() -> NonNull<ParserData>;
    /// Delete the parser, freeing all of the memory that it used.
    fn ts_parser_delete(parser: NonNull<ParserData>);
    /// Set the language that the parser should use for parsing. Returns a boolean indicating
    /// whether or not the language was successfully assigned. True means assignment
    /// succeeded. False means there was a version mismatch: the language was generated with
    /// an incompatible version of the Tree-sitter CLI. Check the language's version using
    /// `ts_language_version` and compare it to this library's `TREE_SITTER_LANGUAGE_VERSION`
    /// and `TREE_SITTER_MIN_COMPATIBLE_LANGUAGE_VERSION` constants.
    fn ts_parser_set_language(parser: NonNull<ParserData>, language: Grammar) -> bool;
    /// Set the ranges of text that the parser should include when parsing. By default, the parser
    /// will always include entire documents. This function allows you to parse only a *portion*
    /// of a document but still return a syntax tree whose ranges match up with the document as a
    /// whole. You can also pass multiple disjoint ranges. The second and third parameters specify
    /// the location and length of an array of ranges. The parser does *not* take ownership of
    /// these ranges; it copies the data, so it doesn't matter how these ranges are allocated.
    /// If `count` is zero, then the entire document will be parsed. Otherwise, the given ranges
    /// must be ordered from earliest to latest in the document, and they must not overlap. That
    /// is, the following must hold for all: `i < count - 1`: `ranges[i].end_byte <= ranges[i +
    /// 1].start_byte` If this requirement is not satisfied, the operation will fail, the ranges
    /// will not be assigned, and this function will return `false`. On success, this function
    /// returns `true`
    fn ts_parser_set_included_ranges(
        parser: NonNull<ParserData>,
        ranges: *const Range,
        count: u32,
    ) -> bool;

    /// Use the parser to parse the source code provided through `input`, optionally
    /// given a previously produced tree, and create a syntax tree.
    ///
    /// Returns `None` (a null tree) when no tree is produced, e.g. when the timeout set
    /// via `ts_parser_set_timeout_micros` is exceeded.
    fn ts_parser_parse(
        parser: NonNull<ParserData>,
        old_tree: Option<NonNull<SyntaxTreeData>>,
        input: ParserInputRaw,
    ) -> Option<NonNull<SyntaxTreeData>>;

    /// Set the maximum duration in microseconds that parsing should be allowed to
    /// take before halting.
    ///
    /// If parsing takes longer than this, it will halt early, returning NULL.
    /// See [`ts_parser_parse`] for more information.
    #[deprecated = "use ts_parser_parse_with_options and pass in a calback instead, this will be removed in 0.26"]
    fn ts_parser_set_timeout_micros(self_: NonNull<ParserData>, timeout_micros: u64);

    /// Use the parser to parse some source code and create a syntax tree, with some options.
    ///
    /// See `ts_parser_parse` for more details.
    ///
    /// See `TSParseOptions` for more details on the options.
    #[allow(unused)]
    fn ts_parser_parse_with_options(
        parser: NonNull<ParserData>,
        old_tree: Option<NonNull<SyntaxTreeData>>,
        input: ParserInputRaw,
        parse_options: ParseOptions,
    ) -> Option<NonNull<SyntaxTreeData>>;
}