1use anyhow::{Context, Result};
2use ignore::WalkBuilder;
3use lazy_static::lazy_static;
4use regex::Regex;
5use serde::{Deserialize, Serialize};
6use std::collections::HashMap;
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::sync::{Arc, Mutex};
10use tree_sitter::{Language, Node, Parser, Query};
11
/// Everything needed to turn one regex hit on one source line into a leaf
/// `CodeAST` node.
///
/// The values are bundled so that `create_ast_node` takes a single argument.
struct AstNodeParams<'a> {
    /// Language label copied onto the resulting node.
    language: &'a str,
    /// Kind label for the node, e.g. `"struct"` or `"function"`.
    kind: &'a str,
    /// Zero-based index of the line the hit was found on.
    line_num: usize,
    /// Full text of that line.
    line: &'a str,
    /// The captured identifier; becomes the node's name.
    capture: &'a str,
    /// Byte offset within `line` where the captured identifier starts.
    match_start: usize,
    /// Byte offset within `line` just past the captured identifier.
    match_end: usize,
}
22
23#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct CodeAST {
26 pub path: String,
27 pub language: String,
28 pub kind: String,
29 pub name: Option<String>,
30 pub range: Range,
31 pub children: Vec<CodeAST>,
32 pub content: Option<String>,
33}
34
35#[derive(Debug, Clone, Serialize, Deserialize)]
36pub struct Range {
37 pub start_row: usize,
38 pub start_column: usize,
39 pub end_row: usize,
40 pub end_column: usize,
41}
42
43lazy_static! {
44 static ref RUST_QUERY: &'static str = r#"
46 (struct_item name: (identifier) @struct.name) @struct.def
47 (enum_item name: (identifier) @enum.name) @enum.def
48 (trait_item name: (identifier) @trait.name) @trait.def
49 (impl_item type: (type_identifier) @impl.type) @impl.def
50 (function_item name: (identifier) @function.name) @function.def
51 (mod_item name: (identifier) @module.name) @module.def
52 "#;
53
54 static ref JAVASCRIPT_QUERY: &'static str = r#"
55 (class_declaration name: (identifier) @class.name) @class.def
56 (function_declaration name: (identifier) @function.name) @function.def
57 (method_definition name: (property_identifier) @method.name) @method.def
58 (lexical_declaration
59 (variable_declarator
60 name: (identifier) @const.name
61 value: (arrow_function) @const.value)) @const.def
62 "#;
63
64 static ref PYTHON_QUERY: &'static str = r#"
65 (class_definition name: (identifier) @class.name) @class.def
66 (function_definition name: (identifier) @function.name) @function.def
67 "#;
68
69 static ref GO_QUERY: &'static str = r#"
70 (type_declaration (type_spec name: (type_identifier) @type.name)) @type.def
71 (function_declaration name: (identifier) @function.name) @function.def
72 (method_declaration name: (field_identifier) @method.name) @method.def
73 (struct_type) @struct.def
74 (interface_type) @interface.def
75 "#;
76
77 static ref RUST_STRUCT_RE: Regex = Regex::new(r"struct\s+([A-Za-z0-9_]+)").unwrap();
79 static ref RUST_ENUM_RE: Regex = Regex::new(r"enum\s+([A-Za-z0-9_]+)").unwrap();
80 static ref RUST_IMPL_RE: Regex = Regex::new(r"impl(?:\s+<[^>]+>)?\s+([A-Za-z0-9_:]+)").unwrap();
81 static ref RUST_FN_RE: Regex = Regex::new(r"fn\s+([A-Za-z0-9_]+)").unwrap();
82 static ref RUST_TRAIT_RE: Regex = Regex::new(r"trait\s+([A-Za-z0-9_]+)").unwrap();
83 static ref RUST_MOD_RE: Regex = Regex::new(r"mod\s+([A-Za-z0-9_]+)").unwrap();
84
85 static ref JS_CLASS_RE: Regex = Regex::new(r"class\s+([A-Za-z0-9_]+)").unwrap();
86 static ref JS_FUNCTION_RE: Regex = Regex::new(r"function\s+([A-Za-z0-9_]+)").unwrap();
87 static ref JS_ARROW_FN_RE: Regex = Regex::new(r"const\s+([A-Za-z0-9_]+)\s*=\s*\([^)]*\)\s*=>").unwrap();
88 static ref JS_INTERFACE_RE: Regex = Regex::new(r"interface\s+([A-Za-z0-9_]+)").unwrap();
89 static ref JS_TYPE_RE: Regex = Regex::new(r"type\s+([A-Za-z0-9_]+)").unwrap();
90
91 static ref PY_CLASS_RE: Regex = Regex::new(r"class\s+([A-Za-z0-9_]+)").unwrap();
92 static ref PY_FUNCTION_RE: Regex = Regex::new(r"def\s+([A-Za-z0-9_]+)").unwrap();
93 static ref PY_ASYNC_FN_RE: Regex = Regex::new(r"async\s+def\s+([A-Za-z0-9_]+)").unwrap();
94
95 static ref GENERIC_BLOCK_RE: Regex = Regex::new(r"^\s*[{}]").unwrap();
96
97 static ref LANGUAGE_CACHE: Arc<Mutex<HashMap<String, Language>>> = Arc::new(Mutex::new(HashMap::new()));
99 static ref QUERY_CACHE: Arc<Mutex<HashMap<String, String>>> = Arc::new(Mutex::new(HashMap::new()));
100}
101
102pub struct CodeParser {
103 languages: HashMap<String, Vec<String>>,
104 parser: Parser,
105}
106
107impl CodeParser {
108 pub fn new() -> Result<Self> {
109 let mut languages = HashMap::new();
110
111 languages.insert("rust".to_string(), vec!["rs".to_string()]);
113 languages.insert(
114 "javascript".to_string(),
115 vec!["js".to_string(), "jsx".to_string()],
116 );
117 languages.insert(
118 "typescript".to_string(),
119 vec!["ts".to_string(), "tsx".to_string()],
120 );
121 languages.insert("python".to_string(), vec!["py".to_string()]);
122 languages.insert("go".to_string(), vec!["go".to_string()]);
123 languages.insert("c".to_string(), vec!["c".to_string(), "h".to_string()]);
124 languages.insert(
125 "cpp".to_string(),
126 vec![
127 "cpp".to_string(),
128 "cc".to_string(),
129 "cxx".to_string(),
130 "hpp".to_string(),
131 "hxx".to_string(),
132 ],
133 );
134 languages.insert("java".to_string(), vec!["java".to_string()]);
135
136 let parser = Parser::new();
138
139 {
141 let mut cache = LANGUAGE_CACHE.lock().unwrap();
142 if cache.is_empty() {
143 let rust_lang: Language = tree_sitter_rust::LANGUAGE.into();
145 cache.insert("rust".to_string(), rust_lang);
146
147 let js_lang: Language = tree_sitter_javascript::LANGUAGE.into();
148 cache.insert("javascript".to_string(), js_lang.clone());
149 cache.insert("typescript".to_string(), js_lang); let py_lang: Language = tree_sitter_python::LANGUAGE.into();
152 cache.insert("python".to_string(), py_lang);
153
154 let c_lang: Language = tree_sitter_c::LANGUAGE.into();
155 cache.insert("c".to_string(), c_lang);
156
157 let cpp_lang: Language = tree_sitter_cpp::LANGUAGE.into();
158 cache.insert("cpp".to_string(), cpp_lang);
159
160 let go_lang: Language = tree_sitter_go::LANGUAGE.into();
161 cache.insert("go".to_string(), go_lang);
162
163 let java_lang: Language = tree_sitter_java::LANGUAGE.into();
164 cache.insert("java".to_string(), java_lang);
165 }
166 }
167
168 {
170 let mut cache = QUERY_CACHE.lock().unwrap();
171 if cache.is_empty() {
172 cache.insert("rust".to_string(), RUST_QUERY.to_string());
173 cache.insert("javascript".to_string(), JAVASCRIPT_QUERY.to_string());
174 cache.insert("typescript".to_string(), JAVASCRIPT_QUERY.to_string());
175 cache.insert("python".to_string(), PYTHON_QUERY.to_string());
176 cache.insert("go".to_string(), GO_QUERY.to_string());
177 }
178 }
179
180 Ok(Self { languages, parser })
181 }
182
183 fn get_language(&self, language_name: &str) -> Option<Language> {
185 let cache = LANGUAGE_CACHE.lock().unwrap();
186 cache.get(language_name).cloned()
187 }
188
189 fn get_query(&self, language_name: &str) -> Option<Query> {
191 let query_cache = QUERY_CACHE.lock().unwrap();
192 if let Some(query_string) = query_cache.get(language_name) {
193 if let Some(lang) = self.get_language(language_name) {
194 return Query::new(&lang, query_string).ok();
195 }
196 }
197 None
198 }
199
200 pub fn detect_language(&self, path: &Path) -> Option<String> {
202 let extension = path.extension()?.to_str()?.to_lowercase();
203
204 if extension == "ts" || extension == "tsx" {
206 return Some("typescript".to_string());
207 } else if extension == "js" || extension == "jsx" {
208 return Some("javascript".to_string());
209 }
210
211 for (lang, extensions) in &self.languages {
213 if extensions.iter().any(|ext| ext == &extension) {
214 return Some(lang.clone());
215 }
216 }
217
218 None
219 }
220
221 pub fn parse_file(&mut self, path: &Path) -> Result<CodeAST> {
223 let language_name = self
225 .detect_language(path)
226 .context(format!("Could not detect language for file: {:?}", path))?;
227
228 let metadata = fs::metadata(path)?;
230
231 if metadata.len() > 1_000_000 {
233 return Ok(CodeAST {
234 path: path.to_string_lossy().to_string(),
235 language: language_name.to_string(),
236 kind: "file".to_string(),
237 name: path
238 .file_name()
239 .and_then(|n| n.to_str())
240 .map(|s| s.to_string()),
241 range: Range {
242 start_row: 0,
243 start_column: 0,
244 end_row: 0,
245 end_column: 0,
246 },
247 children: vec![CodeAST {
248 path: String::new(),
249 language: language_name.to_string(),
250 kind: "large_file".to_string(),
251 name: Some("File too large for AST generation".to_string()),
252 range: Range {
253 start_row: 0,
254 start_column: 0,
255 end_row: 0,
256 end_column: 0,
257 },
258 children: Vec::new(),
259 content: Some(format!(
260 "File size: {} bytes - too large for detailed parsing",
261 metadata.len()
262 )),
263 }],
264 content: None,
265 });
266 }
267
268 let source_code = fs::read_to_string(path)?;
270
271 let mut ast = CodeAST {
273 path: path.to_string_lossy().to_string(),
274 language: language_name.to_string(),
275 kind: "file".to_string(),
276 name: path
277 .file_name()
278 .and_then(|n| n.to_str())
279 .map(|s| s.to_string()),
280 range: Range {
281 start_row: 0,
282 start_column: 0,
283 end_row: source_code.lines().count(),
284 end_column: 0,
285 },
286 children: Vec::new(),
287 content: None,
288 };
289
290 if let Some(language) = self.get_language(&language_name) {
292 self.parser.set_language(&language)?;
294
295 if let Some(tree) = self.parser.parse(&source_code, None) {
297 if let Some(_query) = self.get_query(&language_name) {
299 let root_node = tree.root_node();
302 let root_type = root_node.kind();
303
304 let child_ast = CodeAST {
306 path: String::new(),
307 language: language_name.to_string(),
308 kind: "file_root".to_string(),
309 name: Some(root_type.to_string()),
310 range: Range {
311 start_row: root_node.start_position().row,
312 start_column: root_node.start_position().column,
313 end_row: root_node.end_position().row,
314 end_column: root_node.end_position().column,
315 },
316 children: Vec::new(),
317 content: Some(format!("Root node type: {}", root_type)),
318 };
319
320 ast.children.push(child_ast);
321 }
322
323 if !ast.children.is_empty() {
325 return Ok(ast);
326 }
327
328 let mut node_children =
331 self.extract_important_nodes(tree.root_node(), &source_code, &language_name);
332
333 if node_children.len() > 30 {
335 node_children.truncate(30);
336 }
337
338 if !node_children.is_empty() {
339 ast.children = node_children;
340 return Ok(ast);
341 }
342 }
343 }
344
345 self.create_simplified_ast(path, &language_name, &source_code)
348 }
349
350 fn extract_important_nodes(
352 &self,
353 node: Node<'_>,
354 source: &str,
355 language: &str,
356 ) -> Vec<CodeAST> {
357 let mut result = Vec::new();
358 let important_node_types = match language {
359 "rust" => &[
360 "struct_item",
361 "enum_item",
362 "impl_item",
363 "function_item",
364 "trait_item",
365 "mod_item",
366 "macro_definition",
367 ],
368 "javascript" | "typescript" => &[
369 "class_declaration",
370 "function_declaration",
371 "method_definition",
372 "lexical_declaration",
373 "interface_declaration",
374 "export_statement",
375 "variable_declaration", ],
377 "python" => &[
378 "class_definition",
379 "function_definition",
380 "decorated_definition",
381 "import_statement",
382 "assignment",
383 "expression_statement",
384 "return_statement", ],
386 "go" => &[
387 "function_declaration",
388 "method_declaration",
389 "type_declaration",
390 "struct_type",
391 "interface_type",
392 "package_clause",
393 "import_declaration", ],
395 "c" | "cpp" => &[
396 "function_definition",
397 "class_specifier",
398 "struct_specifier",
399 "enum_specifier",
400 "namespace_definition",
401 "template_declaration",
402 "declaration", ],
404 "java" => &[
405 "class_declaration",
406 "method_declaration",
407 "interface_declaration",
408 "constructor_declaration",
409 "field_declaration",
410 "import_declaration",
411 "package_declaration", ],
413 _ => &[
414 "unknown", "unknown", "unknown", "unknown", "unknown", "unknown", "unknown",
415 ], };
417
418 if important_node_types.contains(&node.kind()) {
420 self.process_important_node(node, source, language, &mut result);
421 }
422
423 let mut cursor = node.walk();
425 for child in node.children(&mut cursor) {
426 if child.child_count() > 0 && !child.is_named() {
428 let child_results = self.extract_important_nodes(child, source, language);
429 result.extend(child_results);
430 }
431 }
432
433 result
434 }
435
436 fn process_important_node(
438 &self,
439 node: Node<'_>,
440 source: &str,
441 language: &str,
442 result: &mut Vec<CodeAST>,
443 ) {
444 let mut name = None;
446 let mut cursor = node.walk();
447
448 for child in node.children(&mut cursor) {
450 if child.kind() == "identifier"
451 || child.kind() == "type_identifier"
452 || child.kind() == "field_identifier"
453 || child.kind() == "property_identifier"
454 {
455 if let Ok(text) = child.utf8_text(source.as_bytes()) {
456 name = Some(text.to_string());
457 break;
458 }
459 }
460 }
461
462 let content = node
464 .utf8_text(source.as_bytes())
465 .ok()
466 .and_then(|s| s.lines().next())
467 .map(|first_line| {
468 if first_line.len() > 100 {
470 format!("{}...", &first_line[..100])
471 } else {
472 first_line.to_string()
473 }
474 });
475
476 let ast_node = CodeAST {
478 path: String::new(),
479 language: language.to_string(),
480 kind: node.kind().to_string(),
481 name,
482 range: Range {
483 start_row: node.start_position().row,
484 start_column: 0, end_row: node.end_position().row,
486 end_column: 0, },
488 children: Vec::new(),
489 content,
490 };
491
492 result.push(ast_node);
493 }
494
495 pub fn create_simplified_ast(
497 &self,
498 path: &Path,
499 language: &str,
500 source_code: &str,
501 ) -> Result<CodeAST> {
502 let limited_source = if source_code.len() > 50_000 {
504 let truncated: String = source_code.chars().take(50_000).collect();
506 truncated
507 } else {
508 source_code.to_string()
509 };
510
511 let lines: Vec<&str> = limited_source.lines().collect();
512
513 let mut ast = CodeAST {
515 path: path.to_string_lossy().to_string(),
516 language: language.to_string(),
517 kind: "file".to_string(),
518 name: path
519 .file_name()
520 .and_then(|n| n.to_str())
521 .map(|s| s.to_string()),
522 range: Range {
523 start_row: 0,
524 start_column: 0,
525 end_row: lines.len(),
526 end_column: 0, },
528 children: Vec::new(),
529 content: None,
530 };
531
532 let mut children = match language {
534 "rust" => self.extract_rust_constructs(&limited_source),
535 "javascript" | "typescript" => self.extract_js_constructs(&limited_source),
536 "python" => self.extract_python_constructs(&limited_source),
537 _ => self.extract_generic_constructs(&limited_source),
538 };
539
540 if children.len() > 30 {
542 children.truncate(30);
543 }
544
545 ast.children = children;
546
547 Ok(ast)
548 }
549
550 fn create_ast_node(&self, params: AstNodeParams) -> CodeAST {
552 CodeAST {
553 path: String::new(), language: params.language.to_string(),
555 kind: params.kind.to_string(),
556 name: Some(params.capture.to_string()),
557 range: Range {
558 start_row: params.line_num,
559 start_column: params.match_start, end_row: params.line_num,
561 end_column: params.match_end, },
563 children: Vec::new(),
564 content: if params.line.len() > 100 {
566 Some(format!("{}...", ¶ms.line[..100]))
567 } else {
568 Some(params.line.to_string())
569 },
570 }
571 }
572
573 fn extract_rust_constructs(&self, source: &str) -> Vec<CodeAST> {
575 let mut constructs = Vec::new();
576 let lines: Vec<&str> = source.lines().collect();
577
578 for (line_num, line) in lines.iter().enumerate() {
580 if let Some(captures) = RUST_STRUCT_RE.captures(line) {
582 if let Some(name_match) = captures.get(1) {
583 constructs.push(self.create_ast_node(AstNodeParams {
584 language: "rust",
585 kind: "struct",
586 line_num,
587 line,
588 capture: name_match.as_str(),
589 match_start: name_match.start(),
590 match_end: name_match.end(),
591 }));
592 }
593 }
594
595 if let Some(captures) = RUST_ENUM_RE.captures(line) {
597 if let Some(name_match) = captures.get(1) {
598 constructs.push(self.create_ast_node(AstNodeParams {
599 language: "rust",
600 kind: "enum",
601 line_num,
602 line,
603 capture: name_match.as_str(),
604 match_start: name_match.start(),
605 match_end: name_match.end(),
606 }));
607 }
608 }
609
610 if let Some(captures) = RUST_IMPL_RE.captures(line) {
612 if let Some(name_match) = captures.get(1) {
613 constructs.push(self.create_ast_node(AstNodeParams {
614 language: "rust",
615 kind: "impl",
616 line_num,
617 line,
618 capture: name_match.as_str(),
619 match_start: name_match.start(),
620 match_end: name_match.end(),
621 }));
622 }
623 }
624
625 if let Some(captures) = RUST_FN_RE.captures(line) {
627 if let Some(name_match) = captures.get(1) {
628 constructs.push(self.create_ast_node(AstNodeParams {
629 language: "rust",
630 kind: "function",
631 line_num,
632 line,
633 capture: name_match.as_str(),
634 match_start: name_match.start(),
635 match_end: name_match.end(),
636 }));
637 }
638 }
639
640 if let Some(captures) = RUST_TRAIT_RE.captures(line) {
642 if let Some(name_match) = captures.get(1) {
643 constructs.push(self.create_ast_node(AstNodeParams {
644 language: "rust",
645 kind: "trait",
646 line_num,
647 line,
648 capture: name_match.as_str(),
649 match_start: name_match.start(),
650 match_end: name_match.end(),
651 }));
652 }
653 }
654
655 if let Some(captures) = RUST_MOD_RE.captures(line) {
657 if let Some(name_match) = captures.get(1) {
658 constructs.push(self.create_ast_node(AstNodeParams {
659 language: "rust",
660 kind: "module",
661 line_num,
662 line,
663 capture: name_match.as_str(),
664 match_start: name_match.start(),
665 match_end: name_match.end(),
666 }));
667 }
668 }
669 }
670
671 constructs
672 }
673
674 fn extract_js_constructs(&self, source: &str) -> Vec<CodeAST> {
676 let mut constructs = Vec::new();
677 let lines: Vec<&str> = source.lines().collect();
678
679 for (line_num, line) in lines.iter().enumerate() {
681 if let Some(captures) = JS_CLASS_RE.captures(line) {
683 if let Some(name_match) = captures.get(1) {
684 constructs.push(self.create_ast_node(AstNodeParams {
685 language: "javascript",
686 kind: "class",
687 line_num,
688 line,
689 capture: name_match.as_str(),
690 match_start: name_match.start(),
691 match_end: name_match.end(),
692 }));
693 }
694 }
695
696 if let Some(captures) = JS_FUNCTION_RE.captures(line) {
698 if let Some(name_match) = captures.get(1) {
699 constructs.push(self.create_ast_node(AstNodeParams {
700 language: "javascript",
701 kind: "function",
702 line_num,
703 line,
704 capture: name_match.as_str(),
705 match_start: name_match.start(),
706 match_end: name_match.end(),
707 }));
708 }
709 }
710
711 if let Some(captures) = JS_ARROW_FN_RE.captures(line) {
713 if let Some(name_match) = captures.get(1) {
714 constructs.push(self.create_ast_node(AstNodeParams {
715 language: "javascript",
716 kind: "arrow_function",
717 line_num,
718 line,
719 capture: name_match.as_str(),
720 match_start: name_match.start(),
721 match_end: name_match.end(),
722 }));
723 }
724 }
725
726 if let Some(captures) = JS_INTERFACE_RE.captures(line) {
728 if let Some(name_match) = captures.get(1) {
729 constructs.push(self.create_ast_node(AstNodeParams {
730 language: "javascript",
731 kind: "interface",
732 line_num,
733 line,
734 capture: name_match.as_str(),
735 match_start: name_match.start(),
736 match_end: name_match.end(),
737 }));
738 }
739 }
740
741 if let Some(captures) = JS_TYPE_RE.captures(line) {
743 if let Some(name_match) = captures.get(1) {
744 constructs.push(self.create_ast_node(AstNodeParams {
745 language: "javascript",
746 kind: "type",
747 line_num,
748 line,
749 capture: name_match.as_str(),
750 match_start: name_match.start(),
751 match_end: name_match.end(),
752 }));
753 }
754 }
755 }
756
757 constructs
758 }
759
760 fn extract_python_constructs(&self, source: &str) -> Vec<CodeAST> {
762 let mut constructs = Vec::new();
763 let lines: Vec<&str> = source.lines().collect();
764
765 for (line_num, line) in lines.iter().enumerate() {
767 if let Some(captures) = PY_CLASS_RE.captures(line) {
769 if let Some(name_match) = captures.get(1) {
770 constructs.push(self.create_ast_node(AstNodeParams {
771 language: "python",
772 kind: "class",
773 line_num,
774 line,
775 capture: name_match.as_str(),
776 match_start: name_match.start(),
777 match_end: name_match.end(),
778 }));
779 }
780 }
781
782 if let Some(captures) = PY_FUNCTION_RE.captures(line) {
784 if let Some(name_match) = captures.get(1) {
785 constructs.push(self.create_ast_node(AstNodeParams {
786 language: "python",
787 kind: "function",
788 line_num,
789 line,
790 capture: name_match.as_str(),
791 match_start: name_match.start(),
792 match_end: name_match.end(),
793 }));
794 }
795 }
796
797 if let Some(captures) = PY_ASYNC_FN_RE.captures(line) {
799 if let Some(name_match) = captures.get(1) {
800 constructs.push(self.create_ast_node(AstNodeParams {
801 language: "python",
802 kind: "async_function",
803 line_num,
804 line,
805 capture: name_match.as_str(),
806 match_start: name_match.start(),
807 match_end: name_match.end(),
808 }));
809 }
810 }
811 }
812
813 constructs
814 }
815
816 fn extract_generic_constructs(&self, source: &str) -> Vec<CodeAST> {
818 let mut constructs = Vec::new();
819 let lines: Vec<&str> = source.lines().collect();
820
821 for (line_num, line) in lines.iter().enumerate() {
823 if GENERIC_BLOCK_RE.is_match(line) {
824 constructs.push(CodeAST {
825 path: String::new(),
826 language: "generic".to_string(),
827 kind: "block".to_string(),
828 name: None,
829 range: Range {
830 start_row: line_num,
831 start_column: 0,
832 end_row: line_num,
833 end_column: line.len(),
834 },
835 children: Vec::new(),
836 content: Some(line.to_string()),
837 });
838 }
839 }
840
841 constructs
842 }
843
844 fn find_relevant_files(&self, root_dir: &Path, query: &str) -> Result<Vec<PathBuf>> {
846 use crate::tools::fs::search::SearchTools;
847
848 let mut results = Vec::new();
849
850 let max_files = 25; let filter_gitignore = |path: &Path| -> bool {
855 let walker = WalkBuilder::new(path)
857 .hidden(false) .git_ignore(true) .build();
860
861 walker.flatten().any(|entry| entry.path() == path)
863 };
864
865 let file_regex =
868 Regex::new(r"(?:file|in|check|view|read)\s+([a-zA-Z0-9_\-\.]+\.[a-zA-Z0-9]+)").unwrap();
869 let mut specific_files = Vec::new();
870
871 for cap in file_regex.captures_iter(query) {
872 if let Some(file_name) = cap.get(1) {
873 specific_files.push(format!("**/{}", file_name.as_str()));
874 }
875 }
876
877 if !specific_files.is_empty() {
879 for pattern in &specific_files {
880 if let Ok(matches) = SearchTools::glob_search(pattern) {
881 for path in matches {
882 if !results.contains(&path) && filter_gitignore(&path) {
883 results.push(path);
884 if results.len() >= max_files {
885 return Ok(results);
886 }
887 }
888 }
889 }
890 }
891 }
892
893 let search_terms = self.extract_search_terms(query);
895 if !search_terms.is_empty() {
896 let top_terms: Vec<String> = search_terms.into_iter().take(3).collect();
898
899 for term in top_terms {
900 if let Ok(grep_matches) = SearchTools::grep_search(&term, None, Some(root_dir)) {
901 for (path, _, _) in grep_matches.into_iter().take(5) {
903 if !results.contains(&path) && filter_gitignore(&path) {
904 results.push(path);
905 if results.len() >= max_files {
906 return Ok(results);
907 }
908 }
909 }
910 }
911 }
912 }
913
914 if results.len() < max_files {
916 let patterns = self.determine_relevant_files(query);
918 let targeted_patterns: Vec<&String> = patterns.iter().take(5).collect();
919
920 for pattern in targeted_patterns {
921 if let Ok(matches) = SearchTools::glob_search(pattern) {
922 for path in matches.into_iter().take(5) {
923 if !results.contains(&path) && filter_gitignore(&path) {
924 results.push(path);
925 if results.len() >= max_files {
926 return Ok(results);
927 }
928 }
929 }
930 }
931 }
932 }
933
934 if results.len() < 5 {
936 let key_project_files = vec![
937 "**/lib.rs",
938 "**/main.rs",
939 "**/mod.rs",
940 "**/Cargo.toml",
941 "**/package.json",
942 "**/README.md",
943 ];
944
945 for pattern in key_project_files {
946 if let Ok(matches) = SearchTools::glob_search(pattern) {
947 for path in matches {
948 if !results.contains(&path) && filter_gitignore(&path) {
949 results.push(path);
950 if results.len() >= max_files {
951 return Ok(results);
952 }
953 }
954 }
955 }
956 }
957 }
958
959 results.sort_by(|a, b| {
961 let a_modified = std::fs::metadata(a).and_then(|m| m.modified()).ok();
962 let b_modified = std::fs::metadata(b).and_then(|m| m.modified()).ok();
963 b_modified.cmp(&a_modified)
964 });
965
966 Ok(results)
967 }
968
969 pub fn extract_search_terms(&self, query: &str) -> Vec<String> {
971 let mut terms = Vec::new();
972
973 let words: Vec<&str> = query
975 .split_whitespace()
976 .filter(|w| w.len() > 3) .collect();
978
979 for word in words {
980 let clean_word = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '_');
982
983 if clean_word.len() > 3
985 && clean_word.chars().all(|c| c.is_alphanumeric() || c == '_')
986 && !clean_word.chars().all(|c| c.is_numeric())
987 {
988 let common_words = [
990 "this",
991 "that",
992 "from",
993 "what",
994 "when",
995 "where",
996 "which",
997 "find",
998 "function",
999 "class",
1000 "struct",
1001 "impl",
1002 "type",
1003 "interface",
1004 "const",
1005 "static",
1006 "public",
1007 "private",
1008 "protected",
1009 "export",
1010 "import",
1011 ];
1012
1013 if !common_words.contains(&clean_word.to_lowercase().as_str()) {
1014 terms.push(clean_word.to_string());
1015 }
1016 }
1017 }
1018
1019 terms
1020 }
1021
1022 pub fn parse_codebase(&mut self, root_dir: &Path, query: &str) -> Result<Vec<CodeAST>> {
1024 let mut asts = Vec::new();
1025
1026 let relevant_files = self.find_relevant_files(root_dir, query)?;
1028
1029 for path in relevant_files {
1031 if let Ok(ast) = self.parse_file(&path) {
1032 asts.push(ast);
1033 }
1034 }
1035
1036 Ok(asts)
1037 }
1038
1039 pub fn generate_llm_friendly_ast(&mut self, root_dir: &Path, query: &str) -> Result<String> {
1041 let mut asts = self.parse_codebase(root_dir, query)?;
1043
1044 if asts.is_empty() {
1046 return Ok(String::from("No relevant code structures found for the query. Try to be more specific about what code you're looking for."));
1047 }
1048
1049 asts.sort_by(|a, b| {
1051 let a_path = Path::new(&a.path);
1052 let b_path = Path::new(&b.path);
1053
1054 let a_modified = std::fs::metadata(a_path).and_then(|m| m.modified()).ok();
1055 let b_modified = std::fs::metadata(b_path).and_then(|m| m.modified()).ok();
1056
1057 b_modified.cmp(&a_modified)
1058 });
1059
1060 if asts.len() > 10 {
1062 asts.truncate(10);
1063 }
1064
1065 for ast in &mut asts {
1067 if ast.children.len() > 20 {
1069 ast.children.truncate(20);
1070 }
1071
1072 for child in &mut ast.children {
1074 if let Some(content) = &child.content {
1075 if content.len() > 500 {
1076 let truncated: String = content.chars().take(500).collect();
1077 child.content = Some(format!("{}... [truncated]", truncated));
1078 }
1079 }
1080 }
1081 }
1082
1083 let mut summary = String::new();
1085 summary.push_str(&format!(
1086 "# Code Structure Analysis for Query: \"{}\"\n\n",
1087 query
1088 ));
1089 summary.push_str(&format!(
1090 "Found {} relevant files (showing {} most relevant). Key structures:\n\n",
1091 asts.len(),
1092 asts.len()
1093 ));
1094
1095 for ast in &asts {
1097 summary.push_str(&format!("## File: {}\n", ast.path));
1098 summary.push_str(&format!("Language: {}\n\n", ast.language));
1099
1100 for child in &ast.children {
1101 let name = child.name.as_deref().unwrap_or("anonymous");
1102 summary.push_str(&format!(
1103 "- {} `{}` at line {}\n",
1104 child.kind,
1105 name,
1106 child.range.start_row + 1
1107 ));
1108
1109 if let Some(content) = &child.content {
1111 let first_line = content.lines().next().unwrap_or("");
1113 if !first_line.is_empty() {
1114 summary.push_str(&format!(
1115 " ```{}\n {}\n ```\n",
1116 ast.language, first_line
1117 ));
1118 }
1119 }
1120 }
1121
1122 summary.push('\n');
1123 }
1124
1125 let simplified_asts: Vec<serde_json::Value> = asts
1127 .iter()
1128 .map(|ast| {
1129 let simplified_children: Vec<serde_json::Value> = ast
1130 .children
1131 .iter()
1132 .map(|child| {
1133 serde_json::json!({
1134 "kind": child.kind,
1135 "name": child.name,
1136 "line": child.range.start_row + 1
1137 })
1138 })
1139 .collect();
1140
1141 serde_json::json!({
1142 "path": ast.path,
1143 "language": ast.language,
1144 "entities": simplified_children
1145 })
1146 })
1147 .collect();
1148
1149 summary.push_str("\n## Simplified Code Structure:\n\n```json\n");
1151 let simplified_json = serde_json::to_string_pretty(&simplified_asts)
1152 .context("Failed to serialize simplified AST to JSON")?;
1153 summary.push_str(&simplified_json);
1154 summary.push_str("\n```\n");
1155
1156 summary.push_str("\n## Full AST Data (JSON):\n\n```json\n");
1158 let full_json =
1159 serde_json::to_string_pretty(&asts).context("Failed to serialize full AST to JSON")?;
1160 summary.push_str(&full_json);
1161 summary.push_str("\n```\n");
1162
1163 Ok(summary)
1164 }
1165
1166 pub fn determine_relevant_files(&self, query: &str) -> Vec<String> {
1168 let mut patterns = Vec::new();
1169
1170 let file_regex = Regex::new(r#"['"]([^'"]+\.\w+)['"]"#).unwrap();
1172 for cap in file_regex.captures_iter(query) {
1173 if let Some(file_match) = cap.get(1) {
1174 let file_pattern = format!("**/{}", file_match.as_str());
1175 patterns.push(file_pattern);
1176 }
1177 }
1178
1179 let query_lower = query.to_lowercase();
1181
1182 if query_lower.contains("rust") || query_lower.contains(".rs") {
1184 patterns.push("**/*.rs".to_string());
1185 patterns.push("**/src/**/*.rs".to_string());
1186 patterns.push("**/lib.rs".to_string());
1187 patterns.push("**/main.rs".to_string());
1188 }
1189
1190 if query_lower.contains("javascript")
1192 || query_lower.contains("js")
1193 || query_lower.contains("node")
1194 || query_lower.contains("react")
1195 {
1196 patterns.push("**/*.js".to_string());
1197 patterns.push("**/*.jsx".to_string());
1198 patterns.push("**/src/**/*.js".to_string());
1199 patterns.push("**/src/**/*.jsx".to_string());
1200 }
1201
1202 if query_lower.contains("typescript")
1204 || query_lower.contains("ts")
1205 || query_lower.contains("angular")
1206 || query_lower.contains("next")
1207 {
1208 patterns.push("**/*.ts".to_string());
1209 patterns.push("**/*.tsx".to_string());
1210 patterns.push("**/src/**/*.ts".to_string());
1211 patterns.push("**/src/**/*.tsx".to_string());
1212 }
1213
1214 if query_lower.contains("python")
1216 || query_lower.contains("py")
1217 || query_lower.contains("django")
1218 || query_lower.contains("flask")
1219 {
1220 patterns.push("**/*.py".to_string());
1221 patterns.push("**/src/**/*.py".to_string());
1222 }
1223
1224 if query_lower.contains("go") || query_lower.contains("golang") {
1226 patterns.push("**/*.go".to_string());
1227 patterns.push("**/src/**/*.go".to_string());
1228 }
1229
1230 if query_lower.contains("c++")
1232 || query_lower.contains("cpp")
1233 || query_lower.contains(" c ")
1234 || query_lower.contains(".c")
1235 {
1236 patterns.push("**/*.c".to_string());
1237 patterns.push("**/*.h".to_string());
1238 patterns.push("**/*.cpp".to_string());
1239 patterns.push("**/*.hpp".to_string());
1240 patterns.push("**/*.cc".to_string());
1241 }
1242
1243 if query_lower.contains("java") && !query_lower.contains("javascript") {
1245 patterns.push("**/*.java".to_string());
1246 patterns.push("**/src/**/*.java".to_string());
1247 }
1248
1249 if patterns.is_empty() || !patterns.iter().any(|p| p.starts_with("**/src/")) {
1251 patterns.push("**/src/**/*.rs".to_string());
1252 patterns.push("**/src/**/*.ts".to_string());
1253 patterns.push("**/src/**/*.js".to_string());
1254 patterns.push("**/src/**/*.py".to_string());
1255 }
1256
1257 if !patterns.iter().any(|p| p.ends_with(".rs")) {
1259 patterns.push("**/*.rs".to_string());
1260 }
1261
1262 patterns
1263 }
1264}