Skip to main content

cartog_languages/
lib.rs

1//! Tree-sitter language extractors for the cartog code graph.
2//!
3//! Parses source code using tree-sitter grammars and extracts symbols and edges.
4//! Each language implements the [`Extractor`] trait with compiled S-expression
5//! queries for declarative AST pattern matching.
6//!
7//! Supported languages: Python, TypeScript, TSX, JavaScript, Rust, Go, Ruby, Java, PHP, Dart, Swift, Kotlin, Vue, Svelte, Astro, Markdown.
8#![doc = ""]
9#![doc = include_str!("../README.md")]
10
11pub mod dart;
12pub mod go;
13pub mod java;
14pub mod javascript;
15mod js_shared;
16pub mod kotlin;
17pub mod markdown;
18pub mod php;
19pub mod python;
20pub(crate) mod queries;
21pub mod ruby;
22pub mod rust_lang;
23pub mod sfc;
24pub mod swift;
25pub mod typescript;
26
27use anyhow::Result;
28use cartog_core::{Edge, Symbol};
29use tree_sitter::Node;
30
31/// Result of extracting symbols and edges from a source file.
32#[derive(Debug, Clone, Default)]
33pub struct ExtractionResult {
34    pub symbols: Vec<Symbol>,
35    pub edges: Vec<Edge>,
36}
37
38/// Trait implemented by each language extractor.
39///
40/// `extract` takes `&mut self` so implementations can reuse an internal
41/// `tree_sitter::Parser` across calls instead of allocating a new one per file.
42pub trait Extractor: Send {
43    fn extract(&mut self, source: &str, file_path: &str) -> Result<ExtractionResult>;
44}
45
46/// Extract the text of a tree-sitter node from the source.
47/// Returns an empty string if byte offsets fall outside the source or on a char boundary.
48pub(crate) fn node_text<'a>(node: Node, source: &'a str) -> &'a str {
49    source.get(node.start_byte()..node.end_byte()).unwrap_or("")
50}
51
/// Returns whatever follows the last occurrence of `sep` in `s`; when `sep`
/// does not occur, `s` is returned unchanged.
///
/// Import paths are reduced to a bare target name this way:
/// `a.b.C` → `C`, `pkg/mod` → `mod`, `crate::path::Item` → `Item`.
pub(crate) fn last_segment<'a>(s: &'a str, sep: &str) -> &'a str {
    match s.rsplit_once(sep) {
        Some((_, tail)) => tail,
        None => s,
    }
}
58
/// Builds a qualified name for `name`.
///
/// With a parent the result is `Parent.name` (joined by a single `.`); without
/// one it is just `name`.
pub(crate) fn qualified(parent_qname: Option<&str>, name: &str) -> String {
    if let Some(outer) = parent_qname {
        format!("{outer}.{name}")
    } else {
        name.to_owned()
    }
}
66
/// Nesting limit for recursive extractors: the deepest AST level they descend
/// to before bailing out.
///
/// A recursive walker spends one stack frame per level, so without a cap
/// pathological or generated source could overflow the worker stack and abort
/// the whole index run.
pub(crate) const MAX_TREE_DEPTH: usize = 600;
71
72/// Iterative (non-recursive) check that the tree's max depth stays within `limit`.
73/// Uses a `TreeCursor` so it can't itself overflow on the very input it guards.
74pub(crate) fn tree_depth_exceeds(root: Node, limit: usize) -> bool {
75    let mut cursor = root.walk();
76    let mut depth = 0usize;
77    loop {
78        if depth > limit {
79            return true;
80        }
81        if cursor.goto_first_child() {
82            depth += 1;
83            continue;
84        }
85        loop {
86            if cursor.goto_next_sibling() {
87                break;
88            }
89            if !cursor.goto_parent() {
90                return false;
91            }
92            depth -= 1;
93        }
94    }
95}
96
/// The scope that encloses whatever is currently being extracted.
///
/// A child symbol takes `id` as its `parent_id` and `qname` as its
/// `parent_name`. At the top level `id` is `None` and `qname` carries the
/// namespace, if there is one.
#[derive(Clone, Copy, Default)]
pub(crate) struct ParentScope<'a> {
    /// Id of the enclosing symbol; `None` at top level.
    pub id: Option<&'a str>,
    /// Qualified name of the enclosing symbol, or the namespace at top level.
    pub qname: Option<&'a str>,
}

impl<'a> ParentScope<'a> {
    /// Scope for top-level items, which may sit inside `namespace`.
    pub fn top_level(namespace: Option<&'a str>) -> Self {
        ParentScope {
            qname: namespace,
            ..Self::default()
        }
    }

    /// Scope for items nested in the symbol known by `id` and `qname`.
    pub fn nested(id: &'a str, qname: &'a str) -> Self {
        let (id, qname) = (Some(id), Some(qname));
        ParentScope { id, qname }
    }
}
122
123pub use cartog_core::detect_language;
124
125/// Get the extractor for a language name.
126pub fn get_extractor(language: &str) -> Option<Box<dyn Extractor>> {
127    match language {
128        "python" => Some(Box::new(python::PythonExtractor::new())),
129        "typescript" => Some(Box::new(typescript::TypeScriptExtractor::new())),
130        "tsx" => Some(Box::new(typescript::TsxExtractor::new())),
131        "javascript" => Some(Box::new(javascript::JavaScriptExtractor::new())),
132        "rust" => Some(Box::new(rust_lang::RustExtractor::new())),
133        "go" => Some(Box::new(go::GoExtractor::new())),
134        "ruby" => Some(Box::new(ruby::RubyExtractor::new())),
135        "java" => Some(Box::new(java::JavaExtractor::new())),
136        "php" => Some(Box::new(php::PhpExtractor::new())),
137        "dart" => Some(Box::new(dart::DartExtractor::new())),
138        "swift" => Some(Box::new(swift::SwiftExtractor::new())),
139        "kotlin" => Some(Box::new(kotlin::KotlinExtractor::new())),
140        "vue" => Some(Box::new(sfc::VueExtractor::new())),
141        "svelte" => Some(Box::new(sfc::SvelteExtractor::new())),
142        "astro" => Some(Box::new(sfc::AstroExtractor::new())),
143        "markdown" => Some(Box::new(markdown::MarkdownExtractor::new())),
144        _ => None,
145    }
146}
147
#[cfg(test)]
mod tests {
    use super::*;

    /// Every language name `get_extractor` is expected to recognise. Both
    /// tests below read this one list, so the lookup test and the fuzz test
    /// cannot drift apart when a language is added.
    const ALL_LANGS: [&str; 16] = [
        "python",
        "typescript",
        "tsx",
        "javascript",
        "rust",
        "go",
        "ruby",
        "java",
        "php",
        "dart",
        "swift",
        "kotlin",
        "vue",
        "svelte",
        "astro",
        "markdown",
    ];

    /// Each known language resolves to an extractor; an unknown name does not.
    #[test]
    fn test_get_extractor() {
        for lang in ALL_LANGS {
            assert!(
                get_extractor(lang).is_some(),
                "no extractor registered for {lang:?}"
            );
        }
        assert!(get_extractor("unknown").is_none());
    }

    proptest::proptest! {
        /// No extractor panics on arbitrary source — indexing whole repos must
        /// degrade (return Ok/Err), never abort the run. Covers unicode, control
        /// chars, and unbalanced delimiters via the regex generator.
        ///
        /// Extractors are built once per thread and reused across cases (the
        /// trait takes `&mut self` for exactly this); building every extractor
        /// in `ALL_LANGS` per case would recompile every tree-sitter query
        /// thousands of times.
        #[test]
        fn extractors_never_panic_on_arbitrary_source(src in ".{0,400}") {
            thread_local! {
                static EXTRACTORS: std::cell::RefCell<Vec<(&'static str, Box<dyn Extractor>)>> =
                    std::cell::RefCell::new(
                        ALL_LANGS.iter().map(|&l| (l, get_extractor(l).unwrap())).collect()
                    );
            }
            EXTRACTORS.with_borrow_mut(|exs| {
                for (lang, ex) in exs.iter_mut() {
                    // The result is deliberately ignored: only a panic fails the test.
                    let _ = ex.extract(&src, &format!("fuzz.{lang}"));
                }
            });
        }
    }
}