// ast_grep_language/lib.rs

//! This module defines the supported programming languages for ast-grep.
//!
//! It provides a set of customized languages with expando_char / pre_process_pattern,
//! and a set of stub languages without preprocessing.
//! A rule of thumb: if your language does not accept identifiers like `$VAR`,
//! you need to use the `impl_lang_expando!` macro and a standalone file for testing.
//! Otherwise, you can define it as a stub language using `impl_lang!`.
//! To see the full list of languages, visit `<https://ast-grep.github.io/reference/languages.html>`
//!
//! ```
//! use ast_grep_language::{LanguageExt, SupportLang};
//!
//! let lang: SupportLang = "rs".parse().unwrap();
//! let src = "fn foo() {}";
//! let root = lang.ast_grep(src);
//! let found = root.root().find_all("fn $FNAME() {}").next().unwrap();
//! assert_eq!(found.start_pos().line(), 0);
//! assert_eq!(found.text(), "fn foo() {}");
//! ```

mod bash;
mod cpp;
mod csharp;
mod css;
mod elixir;
mod go;
mod haskell;
mod hcl;
mod html;
mod json;
mod kotlin;
mod lua;
mod nix;
mod parsers;
mod php;
mod python;
mod ruby;
mod rust;
mod scala;
mod solidity;
mod swift;
mod yaml;

use ast_grep_core::matcher::{Pattern, PatternBuilder, PatternError};
pub use html::Html;

use ast_grep_core::meta_var::MetaVariable;
use ast_grep_core::tree_sitter::{StrDoc, TSLanguage, TSRange};
use ast_grep_core::Node;
use ignore::types::{Types, TypesBuilder};
use serde::de::Visitor;
use serde::{de, Deserialize, Deserializer, Serialize};
use std::borrow::Cow;
use std::fmt;
use std::fmt::{Display, Formatter};
use std::iter::repeat;
use std::path::Path;
use std::str::FromStr;

pub use ast_grep_core::language::Language;
pub use ast_grep_core::tree_sitter::LanguageExt;

/// Implements the bare-bone `Language` and `LanguageExt` methods for a stub language.
///
/// `$lang` becomes a public unit struct and `$func` names the function in the
/// crate's `parsers` module that yields the tree-sitter grammar.
/// No `expando_char` / `pre_process_pattern` override is generated, so use this
/// only for languages that accept identifiers like `$VAR`.
macro_rules! impl_lang {
  ($lang: ident, $func: ident) => {
    #[derive(Clone, Copy, Debug)]
    pub struct $lang;
    impl Language for $lang {
      fn kind_to_id(&self, kind: &str) -> u16 {
        self
          .get_ts_language()
          .id_for_node_kind(kind, /*named*/ true)
      }
      fn field_to_id(&self, field: &str) -> Option<u16> {
        self
          .get_ts_language()
          .field_id_for_name(field)
          .map(|f| f.get())
      }
      fn build_pattern(&self, builder: &PatternBuilder) -> Result<Pattern, PatternError> {
        // `$lang` is `Copy`, so a plain dereference replaces the explicit `clone()`.
        builder.build(|src| StrDoc::try_new(src, *self))
      }
    }
    impl LanguageExt for $lang {
      fn get_ts_language(&self) -> TSLanguage {
        // `$crate::` keeps the path valid wherever the macro is invoked,
        // matching `impl_lang_expando!`.
        $crate::parsers::$func().into()
      }
    }
  };
}

/// Rewrites the `$` sigils of meta variables in `query` to `expando`, so that the
/// pattern can be parsed by grammars that do not accept `$` in identifiers.
///
/// A run of `$` is replaced by the same number of `expando` chars when
/// * it is directly followed by an ASCII uppercase letter or `_`
///   (`$A`, `$$A`, `$$$A`, `$_`), or
/// * it is exactly three `$` long (the anonymous multiple `$$$`), whatever follows.
///
/// Every other `$` is kept verbatim. A query without any `$` is returned
/// borrowed, without allocating.
fn pre_process_pattern(expando: char, query: &str) -> std::borrow::Cow<'_, str> {
  // Fast path: nothing to rewrite, so hand the input back untouched.
  if !query.contains('$') {
    return std::borrow::Cow::Borrowed(query);
  }
  let mut ret = String::with_capacity(query.len());
  // Number of consecutive `$` seen and not yet written to `ret`.
  let mut dollar_count = 0;
  for c in query.chars() {
    if c == '$' {
      dollar_count += 1;
      continue;
    }
    let need_replace = matches!(c, 'A'..='Z' | '_') // $A or $$A or $$$A
      || dollar_count == 3; // anonymous multiple
    let sigil = if need_replace { expando } else { '$' };
    ret.extend(repeat(sigil).take(dollar_count));
    dollar_count = 0;
    ret.push(c);
  }
  // trailing anonymous multiple
  let sigil = if dollar_count == 3 { expando } else { '$' };
  ret.extend(repeat(sigil).take(dollar_count));
  std::borrow::Cow::Owned(ret)
}

/// Implements `Language` and `LanguageExt` for a language that needs pattern
/// preprocessing, i.e. one that does not accept `$` as a valid identifier char.
///
/// `$lang` becomes a public unit struct, `$func` names the function in the crate's
/// `parsers` module that yields the tree-sitter grammar, and `$char` is the
/// expando char that `pre_process_pattern` substitutes for `$`.
macro_rules! impl_lang_expando {
  ($lang: ident, $func: ident, $char: expr) => {
    #[derive(Clone, Copy, Debug)]
    pub struct $lang;
    impl Language for $lang {
      fn kind_to_id(&self, kind: &str) -> u16 {
        self
          .get_ts_language()
          .id_for_node_kind(kind, /*named*/ true)
      }
      fn field_to_id(&self, field: &str) -> Option<u16> {
        self
          .get_ts_language()
          .field_id_for_name(field)
          .map(|f| f.get())
      }
      fn expando_char(&self) -> char {
        $char
      }
      fn pre_process_pattern<'q>(&self, query: &'q str) -> std::borrow::Cow<'q, str> {
        pre_process_pattern(self.expando_char(), query)
      }
      fn build_pattern(&self, builder: &PatternBuilder) -> Result<Pattern, PatternError> {
        // `$lang` is `Copy`, so a plain dereference replaces the explicit `clone()`.
        builder.build(|src| StrDoc::try_new(src, *self))
      }
    }
    impl LanguageExt for $lang {
      fn get_ts_language(&self) -> TSLanguage {
        $crate::parsers::$func().into()
      }
    }
  };
}

/// Associates a language type with the names it can be referred to by.
pub trait Alias: Display {
  /// Every accepted name of the language. Lookups in this file compare
  /// against these entries ASCII case-insensitively.
  const ALIAS: &'static [&'static str];
}

/// Implements the `ALIAS` associated constant for the given lang, which is
/// then used to define the `alias` const fn and a `Deserialize` impl.
///
/// Besides `Alias`, the expansion provides for `$lang`:
/// * `Display`, which prints the type's `Debug` representation,
/// * `Deserialize`, which accepts any string listed in `$as` (ASCII case-insensitive),
/// * `From<$lang> for SupportLang`, mapping to the variant of the same name.
macro_rules! impl_alias {
  ($lang:ident => $as:expr) => {
    impl Alias for $lang {
      const ALIAS: &'static [&'static str] = $as;
    }

    impl fmt::Display for $lang {
      fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{:?}", self)
      }
    }

    impl<'de> Deserialize<'de> for $lang {
      fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
      where
        D: Deserializer<'de>,
      {
        let vis = AliasVisitor {
          aliases: Self::ALIAS,
        };
        // The matched alias itself is irrelevant: any hit means "this language".
        deserializer.deserialize_str(vis)?;
        Ok($lang)
      }
    }

    impl From<$lang> for SupportLang {
      fn from(_: $lang) -> Self {
        Self::$lang
      }
    }
  };
}
/// Applies `impl_alias!` to every `lang => aliases` pair and generates the
/// `alias` const fn, which maps a `SupportLang` variant to the alias list of
/// the language type with the same name.
macro_rules! impl_aliases {
  ($($lang:ident => $as:expr),* $(,)?) => {
    $(impl_alias!($lang => $as);)*
    const fn alias(lang: SupportLang) -> &'static [&'static str] {
      match lang {
        $(SupportLang::$lang => $lang::ALIAS),*
      }
    }
  };
}

200/* Customized Language with expando_char / pre_process_pattern */
201// https://en.cppreference.com/w/cpp/language/identifiers
202impl_lang_expando!(C, language_c, '๐€€');
203impl_lang_expando!(Cpp, language_cpp, '๐€€');
204// https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/language-specification/lexical-structure#643-identifiers
205// all letter number is accepted
206// https://www.compart.com/en/unicode/category/Nl
207impl_lang_expando!(CSharp, language_c_sharp, 'ยต');
208// https://www.w3.org/TR/CSS21/grammar.html#scanner
209impl_lang_expando!(Css, language_css, '_');
210// https://github.com/elixir-lang/tree-sitter-elixir/blob/a2861e88a730287a60c11ea9299c033c7d076e30/grammar.js#L245
211impl_lang_expando!(Elixir, language_elixir, 'ยต');
212// we can use any Unicode code point categorized as "Letter"
213// https://go.dev/ref/spec#letter
214impl_lang_expando!(Go, language_go, 'ยต');
215// GHC supports Unicode syntax per
216// https://ghc.gitlab.haskell.org/ghc/doc/users_guide/exts/unicode_syntax.html
217// and the tree-sitter-haskell grammar parses it too.
218impl_lang_expando!(Haskell, language_haskell, 'ยต');
219// https://developer.hashicorp.com/terraform/language/syntax/configuration#identifiers
220impl_lang_expando!(Hcl, language_hcl, 'ยต');
221// https://github.com/fwcd/tree-sitter-kotlin/pull/93
222impl_lang_expando!(Kotlin, language_kotlin, 'ยต');
223// Nix uses $ for string interpolation (e.g., "${pkgs.hello}")
224impl_lang_expando!(Nix, language_nix, '_');
225// PHP accepts unicode to be used as some name not var name though
226impl_lang_expando!(Php, language_php, 'ยต');
227// we can use any char in unicode range [:XID_Start:]
228// https://docs.python.org/3/reference/lexical_analysis.html#identifiers
229// see also [PEP 3131](https://peps.python.org/pep-3131/) for further details.
230impl_lang_expando!(Python, language_python, 'ยต');
231// https://github.com/tree-sitter/tree-sitter-ruby/blob/f257f3f57833d584050336921773738a3fd8ca22/grammar.js#L30C26-L30C78
232impl_lang_expando!(Ruby, language_ruby, 'ยต');
233// we can use any char in unicode range [:XID_Start:]
234// https://doc.rust-lang.org/reference/identifiers.html
235impl_lang_expando!(Rust, language_rust, 'ยต');
236//https://docs.swift.org/swift-book/documentation/the-swift-programming-language/lexicalstructure/#Identifiers
237impl_lang_expando!(Swift, language_swift, 'ยต');
238
239// Stub Language without preprocessing
240// Language Name, tree-sitter-name, alias, extension
241impl_lang!(Bash, language_bash);
242impl_lang!(Java, language_java);
243impl_lang!(JavaScript, language_javascript);
244impl_lang!(Json, language_json);
245impl_lang!(Lua, language_lua);
246impl_lang!(Scala, language_scala);
247impl_lang!(Solidity, language_solidity);
248impl_lang!(Tsx, language_tsx);
249impl_lang!(TypeScript, language_typescript);
250impl_lang!(Yaml, language_yaml);
251// See ripgrep for extensions
252// https://github.com/BurntSushi/ripgrep/blob/master/crates/ignore/src/default_types.rs
253
254/// Represents all built-in languages.
255#[derive(Clone, Copy, Debug, PartialEq, Eq, Serialize, Hash)]
256pub enum SupportLang {
257  Bash,
258  C,
259  Cpp,
260  CSharp,
261  Css,
262  Go,
263  Elixir,
264  Haskell,
265  Hcl,
266  Html,
267  Java,
268  JavaScript,
269  Json,
270  Kotlin,
271  Lua,
272  Nix,
273  Php,
274  Python,
275  Ruby,
276  Rust,
277  Scala,
278  Solidity,
279  Swift,
280  Tsx,
281  TypeScript,
282  Yaml,
283}
284
285impl SupportLang {
286  pub const fn all_langs() -> &'static [SupportLang] {
287    use SupportLang::*;
288    &[
289      Bash, C, Cpp, CSharp, Css, Elixir, Go, Haskell, Hcl, Html, Java, JavaScript, Json, Kotlin,
290      Lua, Nix, Php, Python, Ruby, Rust, Scala, Solidity, Swift, Tsx, TypeScript, Yaml,
291    ]
292  }
293
294  pub fn file_types(&self) -> Types {
295    file_types(*self)
296  }
297}
298
299impl fmt::Display for SupportLang {
300  fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
301    write!(f, "{self:?}")
302  }
303}
304
/// Error returned when a string cannot be resolved to a [`SupportLang`].
#[derive(Debug)]
pub enum SupportLangErr {
  /// The given name matches no alias of any built-in language.
  /// Carries the offending name.
  LanguageNotSupported(String),
}

impl Display for SupportLangErr {
  fn fmt(&self, f: &mut Formatter<'_>) -> Result<(), std::fmt::Error> {
    use SupportLangErr::*;
    match self {
      LanguageNotSupported(lang) => write!(f, "{lang} is not supported!"),
    }
  }
}

impl std::error::Error for SupportLangErr {}

321impl<'de> Deserialize<'de> for SupportLang {
322  fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
323  where
324    D: Deserializer<'de>,
325  {
326    deserializer.deserialize_str(SupportLangVisitor)
327  }
328}
329
330struct SupportLangVisitor;
331
332impl Visitor<'_> for SupportLangVisitor {
333  type Value = SupportLang;
334
335  fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
336    f.write_str("SupportLang")
337  }
338
339  fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
340  where
341    E: de::Error,
342  {
343    v.parse().map_err(de::Error::custom)
344  }
345}
346struct AliasVisitor {
347  aliases: &'static [&'static str],
348}
349
350impl Visitor<'_> for AliasVisitor {
351  type Value = &'static str;
352
353  fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
354    write!(f, "one of {:?}", self.aliases)
355  }
356
357  fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
358  where
359    E: de::Error,
360  {
361    self
362      .aliases
363      .iter()
364      .copied()
365      .find(|&a| v.eq_ignore_ascii_case(a))
366      .ok_or_else(|| de::Error::invalid_value(de::Unexpected::Str(v), &self))
367  }
368}
369
370impl_aliases! {
371  Bash => &["bash"],
372  C => &["c"],
373  Cpp => &["cc", "c++", "cpp", "cxx"],
374  CSharp => &["cs", "csharp"],
375  Css => &["css"],
376  Elixir => &["ex", "elixir"],
377  Go => &["go", "golang"],
378  Haskell => &["hs", "haskell"],
379  Hcl => &["hcl"],
380  Html => &["html"],
381  Java => &["java"],
382  JavaScript => &["javascript", "js", "jsx"],
383  Json => &["json"],
384  Kotlin => &["kotlin", "kt"],
385  Lua => &["lua"],
386  Nix => &["nix"],
387  Php => &["php"],
388  Python => &["py", "python"],
389  Ruby => &["rb", "ruby"],
390  Rust => &["rs", "rust"],
391  Scala => &["scala"],
392  Solidity => &["sol", "solidity"],
393  Swift => &["swift"],
394  TypeScript => &["ts", "typescript"],
395  Tsx => &["tsx"],
396  Yaml => &["yaml", "yml"],
397}
398
399/// Implements the language names and aliases.
400impl FromStr for SupportLang {
401  type Err = SupportLangErr;
402  fn from_str(s: &str) -> Result<Self, Self::Err> {
403    for &lang in Self::all_langs() {
404      for moniker in alias(lang) {
405        if s.eq_ignore_ascii_case(moniker) {
406          return Ok(lang);
407        }
408      }
409    }
410    Err(SupportLangErr::LanguageNotSupported(s.to_string()))
411  }
412}
413
/// Dispatches `$method` with the given arguments to the unit struct that
/// corresponds to the `SupportLang` variant held by `$me`.
///
/// The match is exhaustive on purpose: adding a variant without an arm here
/// is a compile error.
macro_rules! execute_lang_method {
  ($me: path, $method: ident, $($pname:tt),*) => {
    use SupportLang as S;
    match $me {
      S::Bash => Bash.$method($($pname,)*),
      S::C => C.$method($($pname,)*),
      S::Cpp => Cpp.$method($($pname,)*),
      S::CSharp => CSharp.$method($($pname,)*),
      S::Css => Css.$method($($pname,)*),
      S::Elixir => Elixir.$method($($pname,)*),
      S::Go => Go.$method($($pname,)*),
      S::Haskell => Haskell.$method($($pname,)*),
      S::Hcl => Hcl.$method($($pname,)*),
      S::Html => Html.$method($($pname,)*),
      S::Java => Java.$method($($pname,)*),
      S::JavaScript => JavaScript.$method($($pname,)*),
      S::Json => Json.$method($($pname,)*),
      S::Kotlin => Kotlin.$method($($pname,)*),
      S::Lua => Lua.$method($($pname,)*),
      S::Nix => Nix.$method($($pname,)*),
      S::Php => Php.$method($($pname,)*),
      S::Python => Python.$method($($pname,)*),
      S::Ruby => Ruby.$method($($pname,)*),
      S::Rust => Rust.$method($($pname,)*),
      S::Scala => Scala.$method($($pname,)*),
      S::Solidity => Solidity.$method($($pname,)*),
      S::Swift => Swift.$method($($pname,)*),
      S::Tsx => Tsx.$method($($pname,)*),
      S::TypeScript => TypeScript.$method($($pname,)*),
      S::Yaml => Yaml.$method($($pname,)*),
    }
  }
}

/// Generates a `&self` method named `$method` with the given parameters and
/// return type whose body forwards the call through `execute_lang_method!`.
macro_rules! impl_lang_method {
  ($method: ident, ($($pname:tt: $ptype:ty),*) => $return_type: ty) => {
    #[inline]
    fn $method(&self, $($pname: $ptype),*) -> $return_type {
      execute_lang_method!{ self, $method, $($pname),* }
    }
  };
}
456impl Language for SupportLang {
457  impl_lang_method!(kind_to_id, (kind: &str) => u16);
458  impl_lang_method!(field_to_id, (field: &str) => Option<u16>);
459  impl_lang_method!(meta_var_char, () => char);
460  impl_lang_method!(expando_char, () => char);
461  impl_lang_method!(extract_meta_var, (source: &str) => Option<MetaVariable>);
462  impl_lang_method!(build_pattern, (builder: &PatternBuilder) => Result<Pattern, PatternError>);
463  fn pre_process_pattern<'q>(&self, query: &'q str) -> Cow<'q, str> {
464    execute_lang_method! { self, pre_process_pattern, query }
465  }
466  fn from_path<P: AsRef<Path>>(path: P) -> Option<Self> {
467    from_extension(path.as_ref())
468  }
469}
470
471impl LanguageExt for SupportLang {
472  impl_lang_method!(get_ts_language, () => TSLanguage);
473  impl_lang_method!(injectable_languages, () => Option<&'static [&'static str]>);
474  fn extract_injections<L: LanguageExt>(
475    &self,
476    root: Node<StrDoc<L>>,
477  ) -> Vec<(String, Vec<TSRange>)> {
478    match self {
479      SupportLang::Html => Html.extract_injections(root),
480      _ => Vec::new(),
481    }
482  }
483}
484
485fn extensions(lang: SupportLang) -> &'static [&'static str] {
486  use SupportLang::*;
487  match lang {
488    Bash => &[
489      "bash", "bats", "cgi", "command", "env", "fcgi", "ksh", "sh", "tmux", "tool", "zsh",
490    ],
491    C => &["c", "h"],
492    Cpp => &["cc", "hpp", "cpp", "c++", "hh", "cxx", "cu", "ino"],
493    CSharp => &["cs"],
494    Css => &["css", "scss"],
495    Elixir => &["ex", "exs"],
496    Go => &["go"],
497    Haskell => &["hs"],
498    Hcl => &["hcl", "nomad", "tf", "tfvars", "workflow"],
499    Html => &["html", "htm", "xhtml"],
500    Java => &["java"],
501    JavaScript => &["cjs", "js", "mjs", "jsx"],
502    Json => &["json"],
503    Kotlin => &["kt", "ktm", "kts"],
504    Lua => &["lua"],
505    Nix => &["nix"],
506    Php => &["php"],
507    Python => &["py", "py3", "pyi", "bzl"],
508    Ruby => &["rb", "rbw", "gemspec"],
509    Rust => &["rs"],
510    Scala => &["scala", "sc", "sbt"],
511    Solidity => &["sol"],
512    Swift => &["swift"],
513    TypeScript => &["ts", "cts", "mts"],
514    Tsx => &["tsx"],
515    Yaml => &["yaml", "yml"],
516  }
517}
518
519/// Guess which programming language a file is written in
520/// Adapt from `<https://github.com/Wilfred/difftastic/blob/master/src/parse/guess_language.rs>`
521/// N.B do not confuse it with `FromStr` trait. This function is to guess language from file extension.
522fn from_extension(path: &Path) -> Option<SupportLang> {
523  let ext = path.extension()?.to_str()?;
524  SupportLang::all_langs()
525    .iter()
526    .copied()
527    .find(|&l| extensions(l).contains(&ext))
528}
529
530fn add_custom_file_type<'b>(
531  builder: &'b mut TypesBuilder,
532  file_type: &str,
533  suffix_list: &[&str],
534) -> &'b mut TypesBuilder {
535  for suffix in suffix_list {
536    let glob = format!("*.{suffix}");
537    builder
538      .add(file_type, &glob)
539      .expect("file pattern must compile");
540  }
541  builder.select(file_type)
542}
543
544fn file_types(lang: SupportLang) -> Types {
545  let mut builder = TypesBuilder::new();
546  let exts = extensions(lang);
547  let lang_name = lang.to_string();
548  add_custom_file_type(&mut builder, &lang_name, exts);
549  builder.build().expect("file type must be valid")
550}
551
552pub fn config_file_type() -> Types {
553  let mut builder = TypesBuilder::new();
554  let builder = add_custom_file_type(&mut builder, "yml", &["yml", "yaml"]);
555  builder.build().expect("yaml type must be valid")
556}
557
#[cfg(test)]
mod test {
  use super::*;
  use ast_grep_core::{matcher::MatcherExt, Pattern};

  /// Asserts that the pattern `query` matches somewhere in `source` parsed as `lang`.
  pub fn test_match_lang(query: &str, source: &str, lang: impl LanguageExt) {
    let cand = lang.ast_grep(source);
    let pattern = Pattern::new(query, lang);
    assert!(
      pattern.find_node(cand.root()).is_some(),
      "goal: {pattern:?}, candidate: {}",
      cand.root().get_inner_node().to_sexp(),
    );
  }

  /// Asserts that the pattern `query` matches nowhere in `source` parsed as `lang`.
  pub fn test_non_match_lang(query: &str, source: &str, lang: impl LanguageExt) {
    let cand = lang.ast_grep(source);
    let pattern = Pattern::new(query, lang);
    assert!(
      pattern.find_node(cand.root()).is_none(),
      "goal: {pattern:?}, candidate: {}",
      cand.root().get_inner_node().to_sexp(),
    );
  }

  /// Applies `replacer` to a match of `pattern` in `src` and returns the
  /// rewritten source. Panics if the replacement does not take place.
  pub fn test_replace_lang(
    src: &str,
    pattern: &str,
    replacer: &str,
    lang: impl LanguageExt,
  ) -> String {
    let mut source = lang.ast_grep(src);
    assert!(source
      .replace(pattern, replacer)
      .expect("should parse successfully"));
    source.generate()
  }

  #[test]
  fn test_js_string() {
    test_match_lang("'a'", "'a'", JavaScript);
    test_match_lang("\"\"", "\"\"", JavaScript);
    test_match_lang("''", "''", JavaScript);
  }

  #[test]
  fn test_guess_by_extension() {
    let path = Path::new("foo.rs");
    assert_eq!(from_extension(path), Some(SupportLang::Rust));
  }

  #[test]
  fn test_file_types() {
    let types = SupportLang::Rust.file_types();
    // A selected file type whitelists its own files and ignores everything else.
    assert!(types.matched("foo.rs", false).is_whitelist());
    assert!(types.matched("foo.py", false).is_ignore());
  }
}