Skip to main content

parsentry_parser/
parser.rs

1//! Code parser using tree-sitter.
2
3use anyhow::{Result, anyhow};
4use std::collections::HashMap;
5use std::fs;
6use std::path::{Path, PathBuf};
7use streaming_iterator::StreamingIterator;
8use tree_sitter::{Language, Node, Parser, Query, QueryCursor};
9
/// A named span of source code: a definition (function, class, method, etc.)
/// or, where the parser reports references, a single reference site.
#[derive(Clone, Debug)]
pub struct Definition {
    /// Identifier the span is known by.
    pub name: String,
    /// Byte offset at which the syntax node starts within its file.
    pub start_byte: usize,
    /// Byte offset at which the syntax node ends within its file.
    pub end_byte: usize,
    /// Source text associated with the span.
    pub source: String,
    /// File the span was found in, when known.
    pub file_path: Option<PathBuf>,
    /// 1-based line on which the span starts, when known.
    pub line_number: Option<usize>,
}
20
21/// Context containing definitions and references from parsed code.
22#[derive(Debug, Clone)]
23pub struct Context {
24    pub definitions: Vec<Definition>,
25    pub references: Vec<Definition>,
26}
27
28/// Tree-sitter based code parser.
29pub struct CodeParser {
30    pub files: HashMap<PathBuf, String>,
31    pub parser: Parser,
32}
33
34impl CodeParser {
35    /// Create a new code parser.
36    pub fn new() -> Result<Self> {
37        Ok(Self {
38            files: HashMap::new(),
39            parser: Parser::new(),
40        })
41    }
42
43    /// Add a file to the parser.
44    pub fn add_file(&mut self, path: &Path) -> Result<()> {
45        let content = fs::read_to_string(path)
46            .map_err(|e| anyhow!("Failed to read file: {}: {}", path.display(), e))?;
47        self.files.insert(path.to_path_buf(), content);
48        Ok(())
49    }
50
51    /// Get the tree-sitter language for a file based on its extension.
52    #[must_use]
53    pub fn get_language(&self, path: &Path) -> Option<Language> {
54        let extension = path.extension().and_then(|ext| ext.to_str());
55        match extension {
56            Some("c") | Some("h") => Some(tree_sitter_c::LANGUAGE.into()),
57            Some("cpp") | Some("cxx") | Some("cc") | Some("hpp") | Some("hxx") => {
58                Some(tree_sitter_cpp::LANGUAGE.into())
59            }
60            Some("py") => Some(tree_sitter_python::LANGUAGE.into()),
61            Some("js") => Some(tree_sitter_javascript::LANGUAGE.into()),
62            Some("ts") => Some(tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()),
63            Some("tsx") => Some(tree_sitter_typescript::LANGUAGE_TSX.into()),
64            Some("java") => Some(tree_sitter_java::LANGUAGE.into()),
65            Some("rs") => Some(tree_sitter_rust::LANGUAGE.into()),
66            Some("go") => Some(tree_sitter_go::LANGUAGE.into()),
67            Some("rb") => Some(tree_sitter_ruby::LANGUAGE.into()),
68            Some("tf") | Some("hcl") => Some(tree_sitter_hcl::LANGUAGE.into()),
69            Some("php") | Some("php3") | Some("php4") | Some("php5") | Some("phtml") => {
70                Some(tree_sitter_php::LANGUAGE_PHP.into())
71            }
72            _ => None,
73        }
74    }
75
76    /// Convert a tree-sitter Language to its string name.
77    fn language_to_name(language: &Language) -> Option<&'static str> {
78        let ts_c: Language = tree_sitter_c::LANGUAGE.into();
79        let ts_cpp: Language = tree_sitter_cpp::LANGUAGE.into();
80        let ts_python: Language = tree_sitter_python::LANGUAGE.into();
81        let ts_javascript: Language = tree_sitter_javascript::LANGUAGE.into();
82        let ts_typescript: Language = tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into();
83        let ts_tsx: Language = tree_sitter_typescript::LANGUAGE_TSX.into();
84        let ts_java: Language = tree_sitter_java::LANGUAGE.into();
85        let ts_go: Language = tree_sitter_go::LANGUAGE.into();
86        let ts_rust: Language = tree_sitter_rust::LANGUAGE.into();
87        let ts_ruby: Language = tree_sitter_ruby::LANGUAGE.into();
88        let ts_hcl: Language = tree_sitter_hcl::LANGUAGE.into();
89        let ts_php: Language = tree_sitter_php::LANGUAGE_PHP.into();
90
91        if language == &ts_c {
92            Some("c")
93        } else if language == &ts_cpp {
94            Some("cpp")
95        } else if language == &ts_python {
96            Some("python")
97        } else if language == &ts_javascript {
98            Some("javascript")
99        } else if language == &ts_typescript || language == &ts_tsx {
100            Some("typescript")
101        } else if language == &ts_java {
102            Some("java")
103        } else if language == &ts_go {
104            Some("go")
105        } else if language == &ts_rust {
106            Some("rust")
107        } else if language == &ts_ruby {
108            Some("ruby")
109        } else if language == &ts_hcl {
110            Some("terraform")
111        } else if language == &ts_php {
112            Some("php")
113        } else {
114            None
115        }
116    }
117
118    /// Get query content for a specific language and query type.
119    pub fn get_query_content(&self, language: &Language, query_name: &str) -> Result<&'static str> {
120        let lang_name = Self::language_to_name(language)
121            .ok_or_else(|| anyhow!("Unsupported language for queries"))?;
122
123        if query_name.contains('/') || query_name.contains('\\') || query_name.contains("..") {
124            return Err(anyhow!("Invalid query name: {}", query_name));
125        }
126
127        let query_content = match (lang_name, query_name) {
128            ("c", "definitions") => include_str!("queries/c/definitions.scm"),
129            ("c", "calls") => include_str!("queries/c/calls.scm"),
130            ("cpp", "definitions") => include_str!("queries/cpp/definitions.scm"),
131            ("cpp", "calls") => include_str!("queries/cpp/calls.scm"),
132            ("python", "definitions") => include_str!("queries/python/definitions.scm"),
133            ("python", "calls") => include_str!("queries/python/calls.scm"),
134            ("javascript", "definitions") => include_str!("queries/javascript/definitions.scm"),
135            ("javascript", "calls") => include_str!("queries/javascript/calls.scm"),
136            ("typescript", "definitions") => include_str!("queries/typescript/definitions.scm"),
137            ("typescript", "calls") => include_str!("queries/typescript/calls.scm"),
138            ("java", "definitions") => include_str!("queries/java/definitions.scm"),
139            ("java", "calls") => include_str!("queries/java/calls.scm"),
140            ("go", "definitions") => include_str!("queries/go/definitions.scm"),
141            ("go", "calls") => include_str!("queries/go/calls.scm"),
142            ("rust", "definitions") => include_str!("queries/rust/definitions.scm"),
143            ("rust", "calls") => include_str!("queries/rust/calls.scm"),
144            ("ruby", "definitions") => include_str!("queries/ruby/definitions.scm"),
145            ("ruby", "calls") => include_str!("queries/ruby/calls.scm"),
146            ("terraform", "definitions") => include_str!("queries/terraform/definitions.scm"),
147            ("terraform", "calls") => include_str!("queries/terraform/calls.scm"),
148            ("php", "definitions") => include_str!("queries/php/definitions.scm"),
149            ("php", "calls") => include_str!("queries/php/calls.scm"),
150            (_, query) => return Err(anyhow!("Unsupported query: {} for {}", query, lang_name)),
151        };
152
153        Ok(query_content)
154    }
155
156    /// Find a definition by name in a specific file.
157    pub fn find_definition(
158        &mut self,
159        name: &str,
160        source_file: &Path,
161    ) -> Result<Option<(PathBuf, Definition)>> {
162        let content = self
163            .files
164            .get(source_file)
165            .ok_or_else(|| anyhow!("File not found in parser: {}", source_file.display()))?;
166
167        let language = match self.get_language(source_file) {
168            Some(lang) => lang,
169            None => return Ok(None),
170        };
171
172        self.parser
173            .set_language(&language)
174            .map_err(|e| anyhow!("Failed to set language: {}", e))?;
175
176        let tree = self
177            .parser
178            .parse(content, None)
179            .ok_or_else(|| anyhow!("Failed to parse file: {}", source_file.display()))?;
180
181        let query_str = self.get_query_content(&language, "definitions")?;
182
183        let query = Query::new(&language, query_str)
184            .map_err(|e| anyhow!("Failed to create query: {}", e))?;
185
186        let mut query_cursor = QueryCursor::new();
187        let mut matches = query_cursor.matches(&query, tree.root_node(), content.as_bytes());
188
189        while let Some(mat) = matches.next() {
190            let mut definition_node: Option<Node> = None;
191            let mut name_node: Option<Node> = None;
192
193            for cap in mat.captures {
194                let capture_name = &query.capture_names()[cap.index as usize];
195                match capture_name {
196                    s if *s == "definition" => definition_node = Some(cap.node),
197                    s if *s == "name" => name_node = Some(cap.node),
198                    _ => {}
199                }
200            }
201
202            if let (Some(def_node), Some(name_node_inner)) = (definition_node, name_node)
203                && name_node_inner.utf8_text(content.as_bytes())? == name
204            {
205                let start_byte = def_node.start_byte();
206                let end_byte = def_node.end_byte();
207                let source = def_node.utf8_text(content.as_bytes())?.to_string();
208
209                let line_number = content[..start_byte].matches('\n').count() + 1;
210                let definition = Definition {
211                    name: name.to_string(),
212                    start_byte,
213                    end_byte,
214                    source,
215                    file_path: Some(source_file.to_path_buf()),
216                    line_number: Some(line_number),
217                };
218                return Ok(Some((source_file.to_path_buf(), definition)));
219            }
220        }
221
222        Ok(None)
223    }
224
225    /// Find all calls to a function/method by name across all loaded files.
226    pub fn find_calls(&mut self, name: &str) -> Result<Vec<(PathBuf, Definition, String)>> {
227        let mut results = Vec::new();
228
229        for (file_path, content) in &self.files {
230            let language = match self.get_language(file_path) {
231                Some(lang) => lang,
232                None => continue,
233            };
234
235            self.parser.set_language(&language).map_err(|e| {
236                anyhow!("Failed to set language for {}: {}", file_path.display(), e)
237            })?;
238
239            let tree = match self.parser.parse(content, None) {
240                Some(t) => t,
241                None => {
242                    eprintln!("Warning: Failed to parse file: {}", file_path.display());
243                    continue;
244                }
245            };
246
247            let query_str = match self.get_query_content(&language, "calls") {
248                Ok(s) => s,
249                Err(e) => {
250                    eprintln!(
251                        "Warning: Failed to get calls query for {}: {}",
252                        file_path.display(),
253                        e
254                    );
255                    continue;
256                }
257            };
258
259            let query = match Query::new(&language, query_str) {
260                Ok(q) => q,
261                Err(e) => {
262                    eprintln!("Warning: Failed to create calls query: {}", e);
263                    continue;
264                }
265            };
266
267            let mut query_cursor = QueryCursor::new();
268            let mut matches = query_cursor.matches(&query, tree.root_node(), content.as_bytes());
269
270            while let Some(mat) = matches.next() {
271                for cap in mat.captures {
272                    let capture_name = query.capture_names()[cap.index as usize];
273                    let valid_captures = [
274                        "direct_call",
275                        "method_call",
276                        "macro_call",
277                        "reference",
278                        "callback",
279                        "import",
280                        "assignment",
281                    ];
282
283                    if valid_captures.contains(&capture_name) {
284                        let node = cap.node;
285                        if node.utf8_text(content.as_bytes())? == name {
286                            let start_byte = node.start_byte();
287                            let end_byte = node.end_byte();
288                            let source = name.to_string();
289                            let line_number = content[..start_byte].matches('\n').count() + 1;
290
291                            results.push((
292                                file_path.clone(),
293                                Definition {
294                                    name: name.to_string(),
295                                    start_byte,
296                                    end_byte,
297                                    source,
298                                    file_path: Some(file_path.clone()),
299                                    line_number: Some(line_number),
300                                },
301                                capture_name.to_string(),
302                            ));
303                        }
304                    }
305                }
306            }
307        }
308
309        Ok(results)
310    }
311
312    /// Find both definitions and references for bidirectional tracking.
313    pub fn find_bidirectional(
314        &mut self,
315        name: &str,
316        source_file: &Path,
317    ) -> Result<Vec<(PathBuf, Definition)>> {
318        let mut results = Vec::new();
319
320        if let Some(definition) = self.find_definition(name, source_file)? {
321            results.push(definition);
322        }
323
324        let calls = self.find_calls(name)?;
325        results.extend(calls.into_iter().map(|(path, def, _)| (path, def)));
326
327        results.sort_by_key(|(path, def)| (path.clone(), def.start_byte));
328        results.dedup_by_key(|(path, def)| (path.clone(), def.start_byte));
329
330        Ok(results)
331    }
332
333    /// Build context (definitions and references) from a file.
334    pub fn build_context_from_file(&mut self, start_path: &Path) -> Result<Context> {
335        use std::collections::HashSet;
336
337        let mut collected: HashSet<String> = HashSet::new();
338        let mut definitions: Vec<Definition> = Vec::new();
339        let mut references: Vec<Definition> = Vec::new();
340
341        let file_content = self
342            .files
343            .get(start_path)
344            .ok_or_else(|| anyhow!("File not found: {}", start_path.display()))?;
345
346        let language = match self.get_language(start_path) {
347            Some(lang) => lang,
348            None => {
349                return Ok(Context {
350                    definitions: Vec::new(),
351                    references: Vec::new(),
352                });
353            }
354        };
355
356        self.parser
357            .set_language(&language)
358            .map_err(|e| anyhow!("Failed to set language: {}", e))?;
359
360        let tree = self
361            .parser
362            .parse(file_content, None)
363            .ok_or_else(|| anyhow!("Failed to parse: {}", start_path.display()))?;
364
365        let definitions_query_str = self.get_query_content(&language, "definitions")?;
366        let definitions_query = Query::new(&language, definitions_query_str)?;
367
368        let mut query_cursor = QueryCursor::new();
369        let mut matches = query_cursor.matches(
370            &definitions_query,
371            tree.root_node(),
372            file_content.as_bytes(),
373        );
374
375        let mut to_visit: Vec<(PathBuf, String)> = Vec::new();
376
377        while let Some(mat) = matches.next() {
378            let mut def_node: Option<Node> = None;
379            let mut name_node: Option<Node> = None;
380            for cap in mat.captures {
381                let capture_name = &definitions_query.capture_names()[cap.index as usize];
382                match &capture_name[..] {
383                    "definition" => def_node = Some(cap.node),
384                    "name" => name_node = Some(cap.node),
385                    _ => {}
386                }
387            }
388            if let (Some(def_node), Some(name_node)) = (def_node, name_node) {
389                let name = name_node.utf8_text(file_content.as_bytes())?.to_string();
390                if !collected.contains(&name) {
391                    let start_byte = def_node.start_byte();
392                    let end_byte = def_node.end_byte();
393                    let source = def_node.utf8_text(file_content.as_bytes())?.to_string();
394                    let line_number = file_content[..start_byte].matches('\n').count() + 1;
395                    definitions.push(Definition {
396                        name: name.clone(),
397                        start_byte,
398                        end_byte,
399                        source,
400                        file_path: Some(start_path.to_path_buf()),
401                        line_number: Some(line_number),
402                    });
403                    collected.insert(name.clone());
404                    to_visit.push((start_path.to_path_buf(), name));
405                }
406            }
407        }
408
409        let references_query_str = match self.get_query_content(&language, "calls") {
410            Ok(s) => s,
411            Err(_) => {
412                return Ok(Context {
413                    definitions,
414                    references,
415                });
416            }
417        };
418
419        let references_query = match Query::new(&language, references_query_str) {
420            Ok(q) => q,
421            Err(_) => {
422                return Ok(Context {
423                    definitions,
424                    references,
425                });
426            }
427        };
428
429        let mut references_cursor = QueryCursor::new();
430        let mut ref_matches =
431            references_cursor.matches(&references_query, tree.root_node(), file_content.as_bytes());
432
433        while let Some(mat) = ref_matches.next() {
434            for cap in mat.captures {
435                let capture_name = &references_query.capture_names()[cap.index as usize];
436                if [
437                    "direct_call",
438                    "method_call",
439                    "macro_call",
440                    "reference",
441                    "callback",
442                    "import",
443                    "assignment",
444                ]
445                .contains(capture_name)
446                {
447                    let node = cap.node;
448                    let name = node.utf8_text(file_content.as_bytes())?.to_string();
449                    let start_byte = node.start_byte();
450                    let end_byte = node.end_byte();
451                    let source = node.utf8_text(file_content.as_bytes())?.to_string();
452                    let line_number = file_content[..start_byte].matches('\n').count() + 1;
453
454                    references.push(Definition {
455                        name,
456                        start_byte,
457                        end_byte,
458                        source,
459                        file_path: Some(start_path.to_path_buf()),
460                        line_number: Some(line_number),
461                    });
462                }
463            }
464        }
465
466        while let Some((file_path, func_name)) = to_visit.pop() {
467            if let Some((_, def)) = self.find_definition(&func_name, &file_path)? {
468                let refs = self.find_calls(&def.name)?;
469                for (ref_file, ref_def, _) in refs {
470                    if !collected.contains(&ref_def.name) {
471                        definitions.push(ref_def.clone());
472                        collected.insert(ref_def.name.clone());
473                        to_visit.push((ref_file, ref_def.name.clone()));
474                    }
475                }
476            }
477        }
478
479        Ok(Context {
480            definitions,
481            references,
482        })
483    }
484}
485
486impl Default for CodeParser {
487    fn default() -> Self {
488        Self {
489            files: HashMap::new(),
490            parser: Parser::new(),
491        }
492    }
493}