Skip to main content

thread_ast_engine/
source.rs

1// SPDX-FileCopyrightText: 2022 Herrington Darkholme <2883231+HerringtonDarkholme@users.noreply.github.com>
2// SPDX-FileCopyrightText: 2025 Knitli Inc. <knitli@knit.li>
3// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
4//
5// SPDX-License-Identifier: AGPL-3.0-or-later AND MIT
6
7//! # Document and Content Abstraction
8//!
9//! Core traits for abstracting source code documents and their encoding across different platforms.
10//!
11//! ## Multi-Platform Support
12//!
13//! thread-ast-engine supports multiple text encodings to work across different environments:
14//! - **UTF-8** - Standard for CLI applications and most Rust code
15//! - **UTF-16** - Required for Node.js NAPI bindings
16//! - **`Vec<char>`** - Used in WASM environments for JavaScript interop
17//!
18//! Different encodings affect how byte positions and ranges are calculated in tree-sitter nodes,
19//! so this abstraction ensures consistent behavior across platforms.
20//!
21//! ## Key Concepts
22//!
23//! ### Documents ([`Doc`])
24//! Represents a complete source code document with its language and parsing information.
25//! Provides methods to access the source text, perform edits, and get AST nodes.
26//!
27//! ### Content ([`Content`])
28//! Abstracts the underlying text representation (bytes, UTF-16 code units, etc.).
29//! Handles encoding/decoding operations needed for text manipulation and replacement.
30//!
31//! ### Node Interface ([`SgNode`])
32//! Generic interface for AST nodes that works across different parser backends.
33//! Provides navigation, introspection, and traversal methods.
34//!
35//! ## Example Usage
36//!
37//! ```rust,ignore
38//! // Documents abstract over different source encodings
39//! let doc = StrDoc::new("const x = 42;", Language::JavaScript);
40//! let root = doc.root_node();
41//!
42//! // Content trait handles encoding differences transparently
43//! let source_bytes = doc.get_source().get_range(0..5); // "const"
44//! ```
45
46use crate::{Position, language::Language, node::KindId};
47use std::borrow::Cow;
48use std::ops::Range;
49
50/// Represents an edit operation on source code.
51///
52/// Edits specify where in the source to make changes and what new content
53/// to insert. Used for incremental parsing and code transformation.
54///
55/// # Type Parameters
56///
57/// - `S: Content` - The content type (determines encoding)
58///
59/// # Example
60///
61/// ```rust,ignore
62/// let edit = Edit {
63///     position: 5,           // Start at byte position 5
64///     deleted_length: 3,     // Delete 3 bytes
65///     inserted_text: "new".as_bytes().to_vec(), // Insert "new"
66/// };
67/// ```
68// https://github.com/tree-sitter/tree-sitter/blob/e4e5ffe517ca2c668689b24cb17c51b8c6db0790/cli/src/parse.rs
69#[derive(Debug, Clone)]
70pub struct Edit<S: Content> {
71    /// Byte position where the edit starts
72    pub position: usize,
73    /// Number of bytes to delete from the original content
74    pub deleted_length: usize,
75    /// New content to insert (in the content's underlying representation)
76    pub inserted_text: Vec<S::Underlying>,
77}
78
79/// Generic interface for AST nodes across different parser backends.
80///
81/// `SgNode` (`SourceGraph` Node) provides a consistent API for working with
82/// AST nodes regardless of the underlying parser implementation. Supports
83/// navigation, introspection, and traversal operations.
84///
85/// # Lifetime
86///
87/// The lifetime `'r` ties the node to its root document, ensuring memory safety.
88///
89/// # Note
90///
91/// Some method names match tree-sitter's API. Use fully qualified syntax
92/// if there are naming conflicts with tree-sitter imports.
93///
94/// See: <https://stackoverflow.com/a/44445976/2198656>
95pub trait SgNode<'r>: Clone + std::fmt::Debug + Send + Sync {
96    fn parent(&self) -> Option<Self>;
97    fn children(&self) -> impl ExactSizeIterator<Item = Self>;
98    fn kind(&self) -> Cow<'_, str>;
99    fn kind_id(&self) -> KindId;
100    fn node_id(&self) -> usize;
101    fn range(&self) -> std::ops::Range<usize>;
102    fn start_pos(&self) -> Position;
103    fn end_pos(&self) -> Position;
104
105    // default implementation
106    #[allow(clippy::needless_collect)]
107    fn ancestors(&self, _root: Self) -> impl Iterator<Item = Self> {
108        let mut ancestors = vec![];
109        let mut current = self.clone();
110        while let Some(parent) = current.parent() {
111            ancestors.push(parent.clone());
112            current = parent;
113        }
114        ancestors.reverse();
115        ancestors.into_iter()
116    }
117    fn dfs(&self) -> impl Iterator<Item = Self> {
118        let mut stack = vec![self.clone()];
119        std::iter::from_fn(move || {
120            if let Some(node) = stack.pop() {
121                stack.extend(node.children().collect::<Vec<_>>().into_iter().rev());
122                Some(node)
123            } else {
124                None
125            }
126        })
127    }
128    fn child(&self, nth: usize) -> Option<Self> {
129        self.children().nth(nth)
130    }
131    fn next(&self) -> Option<Self> {
132        let parent = self.parent()?;
133        let mut children = parent.children();
134        while let Some(child) = children.next() {
135            if child.node_id() == self.node_id() {
136                return children.next();
137            }
138        }
139        None
140    }
141    fn prev(&self) -> Option<Self> {
142        let parent = self.parent()?;
143        let children = parent.children();
144        let mut prev = None;
145        for child in children {
146            if child.node_id() == self.node_id() {
147                return prev;
148            }
149            prev = Some(child);
150        }
151        None
152    }
153    fn next_all(&self) -> impl Iterator<Item = Self> {
154        let mut next = self.next();
155        std::iter::from_fn(move || {
156            let n = next.clone()?;
157            next = n.next();
158            Some(n)
159        })
160    }
161    fn prev_all(&self) -> impl Iterator<Item = Self> {
162        let mut prev = self.prev();
163        std::iter::from_fn(move || {
164            let n = prev.clone()?;
165            prev = n.prev();
166            Some(n)
167        })
168    }
169    fn is_named(&self) -> bool {
170        true
171    }
172    /// N.B. it is different from `is_named` && `is_leaf`
173    /// if a node has no named children.
174    fn is_named_leaf(&self) -> bool {
175        self.is_leaf()
176    }
177    fn is_leaf(&self) -> bool {
178        self.children().count() == 0
179    }
180
181    // missing node is a tree-sitter specific concept
182    fn is_missing(&self) -> bool {
183        false
184    }
185    fn is_error(&self) -> bool {
186        false
187    }
188
189    fn field(&self, name: &str) -> Option<Self>;
190    fn field_children(&self, field_id: Option<u16>) -> impl Iterator<Item = Self>;
191    fn child_by_field_id(&self, field_id: u16) -> Option<Self>;
192}
193
194/// Represents a source code document with its language and parsed AST.
195///
196/// `Doc` provides the core interface for working with parsed source code documents.
197/// It combines the source text, language information, and AST representation in
198/// a single abstraction that supports editing and node operations.
199///
200/// # Type Parameters
201///
202/// - `Source: Content` - The text representation (String, UTF-16, etc.)
203/// - `Lang: Language` - The programming language implementation
204/// - `Node: SgNode` - The AST node implementation
205///
206/// # Example
207///
208/// ```rust,ignore
209/// // Documents provide access to source, language, and AST
210/// let doc = StrDoc::new("const x = 42;", JavaScript);
211///
212/// // Access different aspects of the document
213/// let source = doc.get_source();  // Get source text
214/// let lang = doc.get_lang();      // Get language info
215/// let root = doc.root_node();     // Get AST root
216///
217/// // Extract text from specific nodes
218/// let node_text = doc.get_node_text(&some_node);
219/// ```
220pub trait Doc: Clone + std::fmt::Debug + Send + Sync + 'static {
221    /// The source code representation (String, UTF-16, etc.)
222    type Source: Content;
223    /// The programming language implementation
224    type Lang: Language;
225    /// The AST node type for this document
226    type Node<'r>: SgNode<'r>;
227
228    /// Get the language implementation for this document
229    fn get_lang(&self) -> &Self::Lang;
230
231    /// Get the source code content
232    fn get_source(&self) -> &Self::Source;
233
234    /// Apply an edit to the document, updating both source and AST
235    fn do_edit(&mut self, edit: &Edit<Self::Source>) -> Result<(), String>;
236
237    /// Get the root AST node
238    fn root_node(&self) -> Self::Node<'_>;
239
240    /// Extract the text content of a specific AST node
241    fn get_node_text<'a>(&'a self, node: &Self::Node<'a>) -> Cow<'a, str>;
242}
243
244/// Abstracts source code text representation across different encodings.
245///
246/// `Content` allows the same AST operations to work with different text encodings
247/// (UTF-8, UTF-16, etc.) by providing encoding/decoding operations and position
248/// calculations. Essential for cross-platform support.
249///
250/// # Type Parameters
251///
252/// - `Underlying` - The basic unit type (u8 for UTF-8, u16 for UTF-16, etc.)
253///
254/// # Example
255///
256/// ```rust,ignore
257/// // Content trait abstracts encoding differences
258/// let content = "Hello, world!";
259/// let bytes = content.get_range(0..5);  // [72, 101, 108, 108, 111] for UTF-8
260/// let column = content.get_char_column(0, 7); // Character position
261/// ```
262pub trait Content: Sized + Send + Sync {
263    /// The underlying data type (u8, u16, char, etc.)
264    type Underlying: Clone + PartialEq + std::fmt::Debug + Send + Sync;
265
266    /// Get a slice of the underlying data for the given byte range
267    fn get_range(&self, range: Range<usize>) -> &[Self::Underlying];
268
269    /// Convert a string to this content's underlying representation.
270    ///
271    /// Used during text replacement to ensure proper encoding.
272    fn decode_str(src: &str) -> Cow<'_, [Self::Underlying]>;
273
274    /// Convert underlying data back to a string.
275    ///
276    /// Used to extract text content after transformations.
277    fn encode_bytes(bytes: &[Self::Underlying]) -> Cow<'_, str>;
278
279    /// Calculate the character column position at a given byte offset.
280    ///
281    /// Handles Unicode properly by computing actual character positions
282    /// rather than byte positions.
283    fn get_char_column(&self, column: usize, offset: usize) -> usize;
284}
285
286impl Content for String {
287    type Underlying = u8;
288    fn get_range(&self, range: Range<usize>) -> &[Self::Underlying] {
289        &self.as_bytes()[range]
290    }
291    fn decode_str(src: &str) -> Cow<'_, [Self::Underlying]> {
292        Cow::Borrowed(src.as_bytes())
293    }
294    fn encode_bytes(bytes: &[Self::Underlying]) -> Cow<'_, str> {
295        Self::from_utf8_lossy(bytes)
296    }
297
298    /// This is an O(n) operation optimized with SIMD. SIMD allows efficient processing
299    /// of unusually long lines. Modest improvements for standard code lines (~100 chars)
300    fn get_char_column(&self, _col: usize, offset: usize) -> usize {
301        // Use SIMD-optimized version from utils crate
302        thread_utilities::get_char_column_simd(self, offset)
303    }
304}