ast_grep_core/
source.rs

1//! This module defines the `Doc` and `Content` traits to abstract away source code encoding issues.
2//!
3//! ast-grep supports three kinds of encoding: utf-8 for CLI, utf-16 for nodeJS napi and `Vec<char>` for wasm.
4//! Different encoding will produce different tree-sitter Node's range and position.
5//!
6//! The `Content` trait is defined to abstract different encoding.
7//! It is used as associated type bound `Source` in the `Doc` trait.
8//! Its associated type `Underlying`  represents the underlying type of the content, e.g. `Vec<u8>`, `Vec<u16>`.
9//!
10//! `Doc` is a trait that defines a document that can be parsed by Tree-sitter.
11//! It has a `Source` associated type bounded by `Content` that represents the source code of the document,
12//! and a `Lang` associated type that represents the language of the document.
13
14use crate::language::Language;
15use std::borrow::Cow;
16use std::ops::Range;
17use thiserror::Error;
18use tree_sitter::{
19  InputEdit, Language as TsLang, LanguageError, Node, Parser, ParserError, Point, Tree,
20};
21
22#[inline]
23fn parse_lang(
24  parse_fn: impl Fn(&mut Parser) -> Result<Option<Tree>, ParserError>,
25  ts_lang: TsLang,
26) -> Result<Tree, TSParseError> {
27  let mut parser = Parser::new()?;
28  parser.set_language(&ts_lang)?;
29  if let Some(tree) = parse_fn(&mut parser)? {
30    Ok(tree)
31  } else {
32    Err(TSParseError::TreeUnavailable)
33  }
34}
35
36// https://github.com/tree-sitter/tree-sitter/blob/e4e5ffe517ca2c668689b24cb17c51b8c6db0790/cli/src/parse.rs
37#[derive(Debug)]
38pub struct Edit<S: Content> {
39  pub position: usize,
40  pub deleted_length: usize,
41  pub inserted_text: Vec<S::Underlying>,
42}
43
44fn position_for_offset(input: &[u8], offset: usize) -> Point {
45  debug_assert!(offset <= input.len());
46  let (mut row, mut col) = (0, 0);
47  for c in &input[0..offset] {
48    if *c as char == '\n' {
49      row += 1;
50      col = 0;
51    } else {
52      col += 1;
53    }
54  }
55  Point::new(row, col)
56}
57
58pub fn perform_edit<S: Content>(tree: &mut Tree, input: &mut S, edit: &Edit<S>) -> InputEdit {
59  let edit = input.accept_edit(edit);
60  tree.edit(&edit);
61  edit
62}
63
64/// Represents tree-sitter related error
65#[derive(Debug, Error)]
66pub enum TSParseError {
67  #[error("web-tree-sitter parser is not available")]
68  Parse(#[from] ParserError),
69  #[error("incompatible `Language` is assigned to a `Parser`.")]
70  Language(#[from] LanguageError),
71  /// A general error when tree sitter fails to parse in time. It can be caused by
72  /// the following reasons but tree-sitter does not provide error detail.
73  /// * The timeout set with [Parser::set_timeout_micros] expired
74  /// * The cancellation flag set with [Parser::set_cancellation_flag] was flipped
75  /// * The parser has not yet had a language assigned with [Parser::set_language]
76  #[error("general error when tree-sitter fails to parse.")]
77  TreeUnavailable,
78}
79
80pub trait Doc: Clone {
81  type Source: Content;
82  type Lang: Language;
83  fn get_lang(&self) -> &Self::Lang;
84  fn get_source(&self) -> &Self::Source;
85  fn get_source_mut(&mut self) -> &mut Self::Source;
86  fn parse(&self, old_tree: Option<&Tree>) -> Result<Tree, TSParseError> {
87    let source = self.get_source();
88    let lang = self.get_lang().get_ts_language();
89    parse_lang(|p| source.parse_tree_sitter(p, old_tree), lang)
90  }
91  fn clone_with_lang(&self, lang: Self::Lang) -> Self;
92  /// TODO: are we paying too much to support str as Pattern/Replacer??
93  /// this method converts string to Doc, so that we can support using
94  /// string as replacer/searcher. Natively.
95  fn from_str(src: &str, lang: Self::Lang) -> Self;
96}
97
98#[derive(Clone)]
99pub struct StrDoc<L: Language> {
100  pub src: String,
101  pub lang: L,
102}
103
104impl<L: Language> StrDoc<L> {
105  pub fn new(src: &str, lang: L) -> Self {
106    Self {
107      src: src.into(),
108      lang,
109    }
110  }
111}
112
113impl<L: Language> Doc for StrDoc<L> {
114  type Source = String;
115  type Lang = L;
116  fn get_lang(&self) -> &Self::Lang {
117    &self.lang
118  }
119  fn get_source(&self) -> &Self::Source {
120    &self.src
121  }
122  fn get_source_mut(&mut self) -> &mut Self::Source {
123    &mut self.src
124  }
125  fn from_str(src: &str, lang: L) -> Self {
126    Self::new(src, lang)
127  }
128  fn clone_with_lang(&self, lang: Self::Lang) -> Self {
129    Self::new(&self.src, lang)
130  }
131}
132
133pub trait Content: Sized {
134  type Underlying: Clone + PartialEq;
135  fn parse_tree_sitter(
136    &self,
137    parser: &mut Parser,
138    tree: Option<&Tree>,
139  ) -> Result<Option<Tree>, ParserError>;
140  fn get_range(&self, range: Range<usize>) -> &[Self::Underlying];
141  fn accept_edit(&mut self, edit: &Edit<Self>) -> InputEdit;
142  fn get_text<'a>(&'a self, node: &Node) -> Cow<'a, str>;
143  /// Used for string replacement. We need this for
144  /// indentation and deindentation.
145  fn decode_str(src: &str) -> Cow<[Self::Underlying]>;
146  /// Used for string replacement. We need this for
147  /// transformation.
148  fn encode_bytes(bytes: &[Self::Underlying]) -> Cow<str>;
149  /// Get the character column at the given position
150  fn get_char_column(&self, column: usize, offset: usize) -> usize;
151}
152
153impl Content for String {
154  type Underlying = u8;
155  fn parse_tree_sitter(
156    &self,
157    parser: &mut Parser,
158    tree: Option<&Tree>,
159  ) -> Result<Option<Tree>, ParserError> {
160    parser.parse(self.as_bytes(), tree)
161  }
162  fn get_range(&self, range: Range<usize>) -> &[Self::Underlying] {
163    &self.as_bytes()[range]
164  }
165  fn get_text<'a>(&'a self, node: &Node) -> Cow<'a, str> {
166    node
167      .utf8_text(self.as_bytes())
168      .expect("invalid source text encoding")
169  }
170  fn accept_edit(&mut self, edit: &Edit<Self>) -> InputEdit {
171    let start_byte = edit.position;
172    let old_end_byte = edit.position + edit.deleted_length;
173    let new_end_byte = edit.position + edit.inserted_text.len();
174    let input = unsafe { self.as_mut_vec() };
175    let start_position = position_for_offset(input, start_byte);
176    let old_end_position = position_for_offset(input, old_end_byte);
177    input.splice(start_byte..old_end_byte, edit.inserted_text.clone());
178    let new_end_position = position_for_offset(input, new_end_byte);
179    InputEdit::new(
180      start_byte as u32,
181      old_end_byte as u32,
182      new_end_byte as u32,
183      &start_position,
184      &old_end_position,
185      &new_end_position,
186    )
187  }
188  fn decode_str(src: &str) -> Cow<[Self::Underlying]> {
189    Cow::Borrowed(src.as_bytes())
190  }
191  fn encode_bytes(bytes: &[Self::Underlying]) -> Cow<str> {
192    String::from_utf8_lossy(bytes)
193  }
194
195  /// This is an O(n) operation. We assume the col will not be a
196  /// huge number in reality. This may be problematic for special
197  /// files like compressed js
198  fn get_char_column(&self, _col: usize, offset: usize) -> usize {
199    let src = self.as_bytes();
200    let mut col = 0;
201    // TODO: is it possible to use SIMD here???
202    for &b in src[..offset].iter().rev() {
203      if b == b'\n' {
204        break;
205      }
206      // https://en.wikipedia.org/wiki/UTF-8#Description
207      if b & 0b1100_0000 != 0b1000_0000 {
208        col += 1;
209      }
210    }
211    col
212  }
213}
214
215#[cfg(test)]
216mod test {
217  use super::*;
218  use crate::language::{Language, Tsx};
219
220  fn parse(src: &str) -> Result<Tree, TSParseError> {
221    parse_lang(|p| p.parse(src, None), Tsx.get_ts_language())
222  }
223
224  #[test]
225  fn test_tree_sitter() -> Result<(), TSParseError> {
226    let tree = parse("var a = 1234")?;
227    let root_node = tree.root_node();
228    assert_eq!(root_node.kind(), "program");
229    assert_eq!(root_node.start_position().column(), 0);
230    assert_eq!(root_node.end_position().column(), 12);
231    assert_eq!(
232      root_node.to_sexp(),
233      "(program (variable_declaration (variable_declarator name: (identifier) value: (number))))"
234    );
235    Ok(())
236  }
237
238  #[test]
239  fn test_object_literal() -> Result<(), TSParseError> {
240    let tree = parse("{a: $X}")?;
241    let root_node = tree.root_node();
242    // wow this is not label. technically it is wrong but practically it is better LOL
243    assert_eq!(root_node.to_sexp(), "(program (expression_statement (object (pair key: (property_identifier) value: (identifier)))))");
244    Ok(())
245  }
246
247  #[test]
248  fn test_string() -> Result<(), TSParseError> {
249    let tree = parse("'$A'")?;
250    let root_node = tree.root_node();
251    assert_eq!(
252      root_node.to_sexp(),
253      "(program (expression_statement (string (string_fragment))))"
254    );
255    Ok(())
256  }
257
258  #[test]
259  fn test_row_col() -> Result<(), TSParseError> {
260    let tree = parse("😄")?;
261    let root = tree.root_node();
262    assert_eq!(root.start_position(), Point::new(0, 0));
263    // NOTE: Point in tree-sitter is counted in bytes instead of char
264    assert_eq!(root.end_position(), Point::new(0, 4));
265    Ok(())
266  }
267
268  #[test]
269  fn test_edit() -> Result<(), TSParseError> {
270    let mut src = "a + b".to_string();
271    let mut tree = parse(&src)?;
272    let _ = perform_edit(
273      &mut tree,
274      &mut src,
275      &Edit {
276        position: 1,
277        deleted_length: 0,
278        inserted_text: " * b".into(),
279      },
280    );
281    let tree2 = parse_lang(|p| p.parse(&src, Some(&tree)), Tsx.get_ts_language())?;
282    assert_eq!(
283      tree.root_node().to_sexp(),
284      "(program (expression_statement (binary_expression left: (identifier) right: (identifier))))"
285    );
286    assert_eq!(tree2.root_node().to_sexp(), "(program (expression_statement (binary_expression left: (binary_expression left: (identifier) right: (identifier)) right: (identifier))))");
287    Ok(())
288  }
289}