1use anyhow::{Context, Result};
2use ignore::WalkBuilder;
3use lazy_static::lazy_static;
4use rayon::prelude::*;
5use serde::{Deserialize, Serialize};
6use std::collections::{HashMap, HashSet};
7use std::fs;
8use std::path::{Path, PathBuf};
9use std::sync::{Arc, Mutex, RwLock};
10use tree_sitter::{Language, Node, Parser, Query, QueryCursor, StreamingIterator, Tree};
11
/// One node of the simplified, serializable code-structure tree built by `CodeParser`.
///
/// The root produced for a file has kind `"file"`; the definitions discovered in that file
/// are stored in `children`, which may themselves carry nested children.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeAST {
    /// Path of the source file for file-level nodes; nested nodes are created with an empty
    /// string.
    pub path: String,
    /// Name of the detected language (for example `"rust"` or `"python"`).
    pub language: String,
    /// Node category: `"file"` for the root, otherwise a query capture prefix, a tree-sitter
    /// node kind, or a keyword-derived kind from the line-based fallback.
    pub kind: String,
    /// Identifier associated with the node, when one could be determined.
    pub name: Option<String>,
    /// Source span covered by the node.
    pub range: Range,
    /// Nested structures found inside this node.
    pub children: Vec<CodeAST>,
    /// Source text for the node (possibly truncated by the producer); `None` for file roots.
    pub content: Option<String>,
}
23
/// Start and end position of a node in its source file.
///
/// For tree-sitter-derived nodes the values are copied from the node's start and end
/// positions; file-level nodes start at row 0, column 0.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Range {
    /// Row on which the node starts.
    pub start_row: usize,
    /// Column on which the node starts.
    pub start_column: usize,
    /// Row on which the node ends (for file roots: the number of lines in the file).
    pub end_row: usize,
    /// Column on which the node ends.
    pub end_column: usize,
}
32
// Global, lazily-initialised state shared by every `CodeParser`: the tree-sitter query
// sources for each supported language plus process-wide caches.
//
// Query capture naming convention (relied on by `parse_file`, which matches captures by
// suffix): `<kind>.def` marks the whole definition, `<kind>.name` its identifier and
// `<kind>.body` its body. The `<kind>` prefix of the `.def` capture becomes `CodeAST::kind`.
lazy_static! {
    // Query source for Rust files.
    // NOTE(review): the struct/enum/trait patterns require `name: (identifier)`; the
    // tree-sitter-rust grammar appears to use `type_identifier` for those names — confirm
    // this query actually compiles, otherwise `parse_file` silently falls back to its
    // node-walk path.
    static ref RUST_QUERY: &'static str = r#"
 ; Struct declarations
 (struct_item
 name: (identifier) @struct.name
 body: (field_declaration_list)? @struct.body) @struct.def

 ; Enum declarations
 (enum_item
 name: (identifier) @enum.name
 body: (enum_variant_list)? @enum.body) @enum.def

 ; Trait declarations
 (trait_item
 name: (identifier) @trait.name
 body: (declaration_list)? @trait.body) @trait.def

 ; Implementations
 (impl_item
 trait: (type_identifier)? @impl.trait
 type: (type_identifier) @impl.type
 body: (declaration_list)? @impl.body) @impl.def

 ; Functions
 (function_item
 name: (identifier) @function.name
 parameters: (parameters)? @function.params
 body: (block)? @function.body) @function.def

 ; Modules
 (mod_item
 name: (identifier) @module.name
 body: (declaration_list)? @module.body) @module.def

 ; Constants and statics
 (const_item
 name: (identifier) @const.name
 type: (_) @const.type
 value: (_) @const.value) @const.def

 (static_item
 name: (identifier) @static.name
 type: (_) @static.type
 value: (_) @static.value) @static.def
 "#;

    // Query source registered for both "javascript" and "typescript" (see `with_config`).
    // NOTE(review): the interface / type-alias patterns are TypeScript-only node types, yet
    // `with_config` registers the JavaScript grammar for both languages — verify that
    // `Query::new` accepts these patterns against that grammar.
    static ref JAVASCRIPT_QUERY: &'static str = r#"
 ; Classes
 (class_declaration
 name: (identifier) @class.name
 body: (class_body)? @class.body) @class.def

 ; Functions
 (function_declaration
 name: (identifier) @function.name
 parameters: (formal_parameters) @function.params
 body: (statement_block)? @function.body) @function.def

 ; Methods
 (method_definition
 name: (property_identifier) @method.name
 parameters: (formal_parameters) @method.params
 body: (statement_block)? @method.body) @method.def

 ; Arrow functions in variable declarations
 (lexical_declaration
 (variable_declarator
 name: (identifier) @const.name
 value: (arrow_function) @const.value)) @const.def

 ; Object pattern in variable declarations
 (variable_declaration
 (variable_declarator
 name: (identifier) @var.name)) @var.def

 ; Interface declarations (TypeScript)
 (interface_declaration
 name: (type_identifier) @interface.name
 body: (object_type)? @interface.body) @interface.def

 ; Type aliases (TypeScript)
 (type_alias_declaration
 name: (type_identifier) @type.name
 value: (_) @type.value) @type.def

 ; Export declarations
 (export_statement
 declaration: (_) @export.declaration) @export.def
 "#;

    // Query source for Python files.
    static ref PYTHON_QUERY: &'static str = r#"
 ; Classes
 (class_definition
 name: (identifier) @class.name
 body: (block)? @class.body) @class.def

 ; Functions
 (function_definition
 name: (identifier) @function.name
 parameters: (parameters) @function.params
 body: (block)? @function.body) @function.def

 ; Decorated definitions
 (decorated_definition
 definition: (_) @decorated.definition) @decorated.def

 ; Imports
 (import_statement
 name: (dotted_name) @import.name) @import.def

 (import_from_statement
 module_name: (dotted_name) @import_from.module) @import_from.def

 ; Global variables and constants
 (assignment
 left: (identifier) @assignment.name
 right: (_) @assignment.value) @assignment.def

 ; Class attributes
 (class_definition
 body: (block
 (expression_statement
 (assignment
 left: (identifier) @class_attr.name)))) @class_attr.def
 "#;

    // Query source for Go files. Struct and interface declarations are matched both by the
    // generic type pattern and by their dedicated patterns below.
    static ref GO_QUERY: &'static str = r#"
 ; Type declarations
 (type_declaration
 (type_spec
 name: (type_identifier) @type.name
 type: (_) @type.value)) @type.def

 ; Function declarations
 (function_declaration
 name: (identifier) @function.name
 parameters: (parameter_list) @function.params
 result: (_)? @function.result
 body: (block)? @function.body) @function.def

 ; Method declarations
 (method_declaration
 name: (field_identifier) @method.name
 parameters: (parameter_list) @method.params
 result: (_)? @method.result
 body: (block)? @method.body) @method.def

 ; Struct type definitions
 (type_declaration
 (type_spec
 name: (type_identifier) @struct.name
 type: (struct_type) @struct.body)) @struct.def

 ; Interface type definitions
 (type_declaration
 (type_spec
 name: (type_identifier) @interface.name
 type: (interface_type) @interface.body)) @interface.def

 ; Package clause
 (package_clause
 (package_identifier) @package.name) @package.def

 ; Import declarations
 (import_declaration
 (import_spec_list) @import.specs) @import.def
 "#;

    // Query source registered for both "c" and "cpp" (see `with_config`).
    // NOTE(review): class / namespace / template patterns are C++ constructs; confirm the C
    // grammar accepts this query, otherwise C files always take the fallback path.
    static ref CPP_QUERY: &'static str = r#"
 ; Function definitions
 (function_definition
 declarator: (function_declarator
 declarator: (identifier) @function.name
 parameters: (parameter_list) @function.params)
 body: (compound_statement) @function.body) @function.def

 ; Class specifiers
 (class_specifier
 name: (type_identifier) @class.name
 body: (field_declaration_list) @class.body) @class.def

 ; Struct specifiers
 (struct_specifier
 name: (type_identifier) @struct.name
 body: (field_declaration_list) @struct.body) @struct.def

 ; Enum specifiers
 (enum_specifier
 name: (type_identifier) @enum.name
 body: (enumerator_list) @enum.body) @enum.def

 ; Namespace definitions
 (namespace_definition
 name: (identifier) @namespace.name
 body: (declaration_list) @namespace.body) @namespace.def

 ; Template declarations
 (template_declaration
 parameters: (template_parameter_list) @template.params
 declaration: (_) @template.declaration) @template.def

 ; Variable declarations
 (declaration
 declarator: (init_declarator
 declarator: (identifier) @var.name)) @var.def

 ; Method definitions
 (function_definition
 declarator: (function_declarator
 declarator: (field_identifier) @method.name
 parameters: (parameter_list) @method.params)
 body: (compound_statement) @method.body) @method.def
 "#;

    // Query source for Java files.
    static ref JAVA_QUERY: &'static str = r#"
 ; Class declarations
 (class_declaration
 name: (identifier) @class.name
 body: (class_body) @class.body) @class.def

 ; Method declarations
 (method_declaration
 name: (identifier) @method.name
 parameters: (formal_parameters) @method.params
 body: (block)? @method.body) @method.def

 ; Interface declarations
 (interface_declaration
 name: (identifier) @interface.name
 body: (interface_body) @interface.body) @interface.def

 ; Constructor declarations
 (constructor_declaration
 name: (identifier) @constructor.name
 parameters: (formal_parameters) @constructor.params
 body: (constructor_body) @constructor.body) @constructor.def

 ; Field declarations
 (field_declaration
 declarator: (variable_declarator
 name: (identifier) @field.name)) @field.def

 ; Package declarations
 (package_declaration
 name: (scoped_identifier) @package.name) @package.def

 ; Import declarations
 (import_declaration
 name: (scoped_identifier) @import.name) @import.def

 ; Annotation declarations
 (annotation_type_declaration
 name: (identifier) @annotation.name
 body: (annotation_type_body) @annotation.body) @annotation.def
 "#;

    // Language name -> tree-sitter grammar. Filled by `with_config` the first time it finds
    // the map empty.
    static ref LANGUAGE_CACHE: Arc<RwLock<HashMap<String, Language>>> = Arc::new(RwLock::new(HashMap::new()));
    // Language name -> parser pre-configured with that grammar. Filled by `with_config`.
    // NOTE(review): no reader of this cache is visible in this part of the file — confirm
    // it is used elsewhere.
    static ref PARSER_CACHE: Arc<Mutex<HashMap<String, Parser>>> = Arc::new(Mutex::new(HashMap::new()));
    // Language name -> query source text (one of the `*_QUERY` strings above).
    static ref QUERY_CACHE: Arc<RwLock<HashMap<String, String>>> = Arc::new(RwLock::new(HashMap::new()));
    // File path -> (parsed tree, source text it was parsed from). `parse_file` reuses a tree
    // only when the stored text equals the file's current contents.
    static ref TREE_CACHE: Arc<RwLock<HashMap<PathBuf, (Tree, String)>>> = Arc::new(RwLock::new(HashMap::new()));
}
302
/// Extracts simplified code-structure trees (`CodeAST`) from source files using tree-sitter,
/// with a line-based heuristic as a last resort.
pub struct CodeParser {
    /// Supported language name -> file extensions (without the dot) mapped to it.
    languages: HashMap<String, Vec<String>>,
    /// Parser owned by this instance; `parse_file` sets its language before each parse.
    parser: Parser,
    /// Upper bound, in bytes of cached source text, for the shared tree cache
    /// (defaults to 50 * 1024 * 1024 in `with_config`).
    cache_size_limit: usize,
    /// Files larger than this many bytes are not parsed in detail (default 1_000_000).
    max_file_size: usize,
    /// Maximum number of files collected by `find_relevant_files` (default 25).
    max_files: usize,
    /// Recursion depth passed to `extract_nested_structures` (default 3).
    max_depth: usize,
}
328
329impl CodeParser {
330 pub fn new() -> Result<Self> {
336 Self::with_config(None, None, None, None)
337 }
338
339 pub fn with_config(
350 cache_size_limit: Option<usize>,
351 max_file_size: Option<usize>,
352 max_files: Option<usize>,
353 max_depth: Option<usize>,
354 ) -> Result<Self> {
355 let mut languages = HashMap::new();
356
357 languages.insert("rust".to_string(), vec!["rs".to_string()]);
359 languages.insert(
360 "javascript".to_string(),
361 vec!["js".to_string(), "jsx".to_string()],
362 );
363 languages.insert(
364 "typescript".to_string(),
365 vec!["ts".to_string(), "tsx".to_string()],
366 );
367 languages.insert("python".to_string(), vec!["py".to_string()]);
368 languages.insert("go".to_string(), vec!["go".to_string()]);
369 languages.insert("c".to_string(), vec!["c".to_string(), "h".to_string()]);
370 languages.insert(
371 "cpp".to_string(),
372 vec![
373 "cpp".to_string(),
374 "cc".to_string(),
375 "cxx".to_string(),
376 "hpp".to_string(),
377 "hxx".to_string(),
378 ],
379 );
380 languages.insert("java".to_string(), vec!["java".to_string()]);
381
382 let parser = Parser::new();
384
385 {
387 let mut cache = LANGUAGE_CACHE.write().unwrap();
388 if cache.is_empty() {
389 let rust_lang: Language = tree_sitter_rust::LANGUAGE.into();
391 cache.insert("rust".to_string(), rust_lang);
392
393 let js_lang: Language = tree_sitter_javascript::LANGUAGE.into();
394 cache.insert("javascript".to_string(), js_lang.clone());
395 cache.insert("typescript".to_string(), js_lang); let py_lang: Language = tree_sitter_python::LANGUAGE.into();
398 cache.insert("python".to_string(), py_lang);
399
400 let c_lang: Language = tree_sitter_c::LANGUAGE.into();
401 cache.insert("c".to_string(), c_lang);
402
403 let cpp_lang: Language = tree_sitter_cpp::LANGUAGE.into();
404 cache.insert("cpp".to_string(), cpp_lang);
405
406 let go_lang: Language = tree_sitter_go::LANGUAGE.into();
407 cache.insert("go".to_string(), go_lang);
408
409 let java_lang: Language = tree_sitter_java::LANGUAGE.into();
410 cache.insert("java".to_string(), java_lang);
411 }
412 }
413
414 {
416 let mut cache = PARSER_CACHE.lock().unwrap();
417 if cache.is_empty() {
418 for lang_name in languages.keys() {
419 let mut new_parser = Parser::new();
420 if let Some(lang) = LANGUAGE_CACHE.read().unwrap().get(lang_name) {
421 if new_parser.set_language(lang).is_ok() {
422 cache.insert(lang_name.clone(), new_parser);
423 }
424 }
425 }
426 }
427 }
428
429 {
431 let mut cache = QUERY_CACHE.write().unwrap();
432 if cache.is_empty() {
433 cache.insert("rust".to_string(), RUST_QUERY.to_string());
434 cache.insert("javascript".to_string(), JAVASCRIPT_QUERY.to_string());
435 cache.insert("typescript".to_string(), JAVASCRIPT_QUERY.to_string());
436 cache.insert("python".to_string(), PYTHON_QUERY.to_string());
437 cache.insert("go".to_string(), GO_QUERY.to_string());
438 cache.insert("c".to_string(), CPP_QUERY.to_string());
439 cache.insert("cpp".to_string(), CPP_QUERY.to_string());
440 cache.insert("java".to_string(), JAVA_QUERY.to_string());
441 }
442 }
443
444 let cache_size_limit = cache_size_limit.unwrap_or(50 * 1024 * 1024); let max_file_size = max_file_size.unwrap_or(1_000_000); let max_files = max_files.unwrap_or(25); let max_depth = max_depth.unwrap_or(3); Ok(Self {
451 languages,
452 parser,
453 cache_size_limit,
454 max_file_size,
455 max_files,
456 max_depth,
457 })
458 }
459
460 fn get_language(&self, language_name: &str) -> Option<Language> {
468 LANGUAGE_CACHE.read().unwrap().get(language_name).cloned()
469 }
470
471 fn get_query(&self, language_name: &str) -> Option<Result<Query>> {
481 let query_cache = QUERY_CACHE.read().unwrap();
482 let query_string = query_cache.get(language_name)?;
483
484 if let Some(lang) = self.get_language(language_name) {
485 match Query::new(&lang, query_string) {
486 Ok(query) => Some(Ok(query)),
487 Err(e) => Some(Err(anyhow::anyhow!("Failed to create query: {:?}", e))),
488 }
489 } else {
490 None
491 }
492 }
493
494 pub fn detect_language(&self, path: &Path) -> Option<String> {
502 let extension = path.extension()?.to_str()?.to_lowercase();
503
504 if extension == "ts" || extension == "tsx" {
506 return Some("typescript".to_string());
507 } else if extension == "js" || extension == "jsx" {
508 return Some("javascript".to_string());
509 }
510
511 for (lang, extensions) in &self.languages {
513 if extensions.iter().any(|ext| ext == &extension) {
514 return Some(lang.clone());
515 }
516 }
517
518 None
519 }
520
521 pub fn parse_file(&mut self, path: &Path) -> Result<CodeAST> {
529 let language_name = self
531 .detect_language(path)
532 .context(format!("Could not detect language for file: {:?}", path))?;
533
534 let metadata = fs::metadata(path)?;
536
537 if metadata.len() > self.max_file_size as u64 {
539 return Ok(CodeAST {
540 path: path.to_string_lossy().to_string(),
541 language: language_name.to_string(),
542 kind: "file".to_string(),
543 name: path
544 .file_name()
545 .and_then(|n| n.to_str())
546 .map(|s| s.to_string()),
547 range: Range {
548 start_row: 0,
549 start_column: 0,
550 end_row: 0,
551 end_column: 0,
552 },
553 children: vec![CodeAST {
554 path: String::new(),
555 language: language_name.to_string(),
556 kind: "large_file".to_string(),
557 name: Some("File too large for AST generation".to_string()),
558 range: Range {
559 start_row: 0,
560 start_column: 0,
561 end_row: 0,
562 end_column: 0,
563 },
564 children: Vec::new(),
565 content: Some(format!(
566 "File size: {} bytes - too large for detailed parsing",
567 metadata.len()
568 )),
569 }],
570 content: None,
571 });
572 }
573
574 let source_code = fs::read_to_string(path)?;
576
577 let mut ast = CodeAST {
579 path: path.to_string_lossy().to_string(),
580 language: language_name.to_string(),
581 kind: "file".to_string(),
582 name: path
583 .file_name()
584 .and_then(|n| n.to_str())
585 .map(|s| s.to_string()),
586 range: Range {
587 start_row: 0,
588 start_column: 0,
589 end_row: source_code.lines().count(),
590 end_column: 0,
591 },
592 children: Vec::new(),
593 content: None,
594 };
595
596 if let Some(language) = self.get_language(&language_name) {
598 let path_buf = path.to_path_buf();
600 let tree_option = {
601 let cache = TREE_CACHE.read().unwrap();
602 if let Some((tree, content)) = cache.get(&path_buf) {
603 if content == &source_code {
604 Some(tree.clone())
605 } else {
606 None
607 }
608 } else {
609 None
610 }
611 };
612
613 let tree = if let Some(cached_tree) = tree_option {
615 cached_tree
616 } else {
617 self.parser.set_language(&language)?;
619
620 let tree = self
622 .parser
623 .parse(&source_code, None)
624 .context("Failed to parse source code")?;
625
626 {
628 let mut cache = TREE_CACHE.write().unwrap();
629
630 let current_size: usize =
632 cache.iter().map(|(_, (_, content))| content.len()).sum();
633
634 if current_size + source_code.len() > self.cache_size_limit {
635 let mut keys_to_remove = Vec::new();
637 let mut entries: Vec<_> = cache.iter().collect();
638 entries.sort_by_key(|(_, (_, content))| content.len());
639
640 let mut freed_size = 0;
641 let needed_size = source_code.len();
642
643 for (path, (_, content)) in entries {
644 if current_size + needed_size - freed_size <= self.cache_size_limit {
645 break;
646 }
647
648 freed_size += content.len();
649 keys_to_remove.push(path.clone());
650 }
651
652 for path in keys_to_remove {
654 cache.remove(&path);
655 }
656 }
657
658 cache.insert(path_buf.clone(), (tree.clone(), source_code.clone()));
659 }
660
661 tree
662 };
663
664 if let Some(Ok(query)) = self.get_query(&language_name) {
666 let root_node = tree.root_node();
668 let mut query_cursor = QueryCursor::new();
669
670 let mut matches = query_cursor.matches(&query, root_node, source_code.as_bytes());
672
673 while let Some(match_item) = matches.next() {
675 let mut node_data: HashMap<String, (Node, String)> = HashMap::new();
676
677 for capture in match_item.captures {
679 let capture_name = &query.capture_names()[capture.index as usize];
681 let node_text = capture
682 .node
683 .utf8_text(source_code.as_bytes())
684 .unwrap_or("<unknown>");
685
686 node_data.insert(
687 capture_name.to_string(),
688 (capture.node, node_text.to_string()),
689 );
690 }
691
692 let def_entries: Vec<_> = node_data
694 .iter()
695 .filter(|(name, _)| name.ends_with(".def"))
696 .collect();
697
698 if !def_entries.is_empty() {
699 let (def_name, (def_node, _)) = def_entries[0];
700 let def_type = def_name.split('.').next().unwrap_or("unknown");
701 let start_pos = def_node.start_position();
702 let end_pos = def_node.end_position();
703
704 let name_entries: Vec<_> = node_data
706 .iter()
707 .filter(|(name, _)| name.ends_with(".name"))
708 .collect();
709
710 let name = if !name_entries.is_empty() {
711 let (_, (_, name_text)) = name_entries[0];
712 Some(name_text.clone())
713 } else {
714 None
715 };
716
717 let body_entries: Vec<_> = node_data
719 .iter()
720 .filter(|(name, _)| name.ends_with(".body"))
721 .collect();
722
723 let content = if !body_entries.is_empty() {
724 let (_, (body_node, _)) = body_entries[0];
725 body_node
726 .utf8_text(source_code.as_bytes())
727 .ok()
728 .map(|s| s.to_string())
729 } else {
730 def_node
731 .utf8_text(source_code.as_bytes())
732 .ok()
733 .map(|s| s.to_string())
734 };
735
736 let mut child_ast = CodeAST {
738 path: String::new(),
739 language: language_name.to_string(),
740 kind: def_type.to_string(),
741 name,
742 range: Range {
743 start_row: start_pos.row,
744 start_column: start_pos.column,
745 end_row: end_pos.row,
746 end_column: end_pos.column,
747 },
748 children: Vec::new(),
749 content: content.map(|s| {
750 if s.len() > 1000 {
752 format!("{}...", &s[..1000])
753 } else {
754 s
755 }
756 }),
757 };
758
759 self.extract_nested_structures(
761 &source_code,
762 *def_node,
763 &mut child_ast,
764 &language_name,
765 self.max_depth, );
767
768 ast.children.push(child_ast);
769 }
770 }
771
772 if !ast.children.is_empty() {
774 return Ok(ast);
775 }
776 }
777
778 let tree_node = tree.root_node();
780 let node_children =
781 self.extract_important_nodes(tree_node, &source_code, &language_name);
782
783 if !node_children.is_empty() {
784 ast.children = node_children;
785 return Ok(ast);
786 }
787 }
788
789 self.create_simplified_ast(path, &language_name, &source_code)
791 }
792
793 fn extract_nested_structures(
802 &self,
803 source_code: &str,
804 node: Node,
805 parent_ast: &mut CodeAST,
806 language: &str,
807 depth: usize,
808 ) {
809 if depth == 0 {
810 return;
811 }
812
813 if node.end_byte() - node.start_byte() < 10 {
815 return;
816 }
817
818 let mut cursor = node.walk();
819
820 let important_node_types = Self::get_important_node_types(language);
822
823 for child in node.children(&mut cursor) {
825 let kind = child.kind();
826
827 if kind == "(" || kind == ")" || kind == "{" || kind == "}" || kind == ";" {
829 continue;
830 }
831
832 if important_node_types.contains(&kind) {
834 let start_pos = child.start_position();
835 let end_pos = child.end_position();
836
837 let name = self.extract_node_name(&child, source_code);
839
840 let content = child.utf8_text(source_code.as_bytes()).ok().map(|s| {
842 if s.len() > 500 {
844 format!("{}...", &s[..500])
845 } else {
846 s.to_string()
847 }
848 });
849
850 let mut child_ast = CodeAST {
852 path: String::new(),
853 language: language.to_string(),
854 kind: kind.to_string(),
855 name,
856 range: Range {
857 start_row: start_pos.row,
858 start_column: start_pos.column,
859 end_row: end_pos.row,
860 end_column: end_pos.column,
861 },
862 children: Vec::new(),
863 content,
864 };
865
866 self.extract_nested_structures(
868 source_code,
869 child,
870 &mut child_ast,
871 language,
872 depth - 1,
873 );
874
875 parent_ast.children.push(child_ast);
876 }
877 }
878 }
879
880 fn extract_node_name(&self, node: &Node, source: &str) -> Option<String> {
889 let mut cursor = node.walk();
890
891 for child in node.children(&mut cursor) {
893 if child.kind() == "identifier"
894 || child.kind() == "type_identifier"
895 || child.kind() == "field_identifier"
896 || child.kind() == "property_identifier"
897 {
898 if let Ok(text) = child.utf8_text(source.as_bytes()) {
899 return Some(text.to_string());
900 }
901 }
902 }
903
904 None
905 }
906
907 fn get_important_node_types(language: &str) -> &'static [&'static str] {
915 match language {
916 "rust" => &[
917 "struct_item",
918 "enum_item",
919 "impl_item",
920 "function_item",
921 "trait_item",
922 "mod_item",
923 "macro_definition",
924 "const_item",
925 "static_item",
926 ],
927 "javascript" | "typescript" => &[
928 "class_declaration",
929 "function_declaration",
930 "method_definition",
931 "lexical_declaration",
932 "interface_declaration",
933 "type_alias_declaration",
934 "export_statement",
935 "variable_declaration",
936 ],
937 "python" => &[
938 "class_definition",
939 "function_definition",
940 "decorated_definition",
941 "import_statement",
942 "import_from_statement",
943 "assignment",
944 ],
945 "go" => &[
946 "function_declaration",
947 "method_declaration",
948 "type_declaration",
949 "struct_type",
950 "interface_type",
951 "package_clause",
952 "import_declaration",
953 ],
954 "c" | "cpp" => &[
955 "function_definition",
956 "class_specifier",
957 "struct_specifier",
958 "enum_specifier",
959 "namespace_definition",
960 "template_declaration",
961 "declaration",
962 ],
963 "java" => &[
964 "class_declaration",
965 "method_declaration",
966 "interface_declaration",
967 "constructor_declaration",
968 "field_declaration",
969 "package_declaration",
970 "import_declaration",
971 "annotation_type_declaration",
972 ],
973 _ => &[],
974 }
975 }
976
977 fn extract_important_nodes(
987 &self,
988 node: Node<'_>,
989 source: &str,
990 language: &str,
991 ) -> Vec<CodeAST> {
992 let mut result = Vec::new();
993 let important_node_types = Self::get_important_node_types(language);
994
995 if important_node_types.contains(&node.kind()) {
997 self.process_important_node(node, source, language, &mut result);
998 }
999
1000 let mut cursor = node.walk();
1002 for child in node.children(&mut cursor) {
1003 if child.child_count() > 0 && child.is_named() {
1005 let child_results = self.extract_important_nodes(child, source, language);
1006 result.extend(child_results);
1007 }
1008 }
1009
1010 result
1011 }
1012
1013 fn process_important_node(
1021 &self,
1022 node: Node<'_>,
1023 source: &str,
1024 language: &str,
1025 result: &mut Vec<CodeAST>,
1026 ) {
1027 let name = self.extract_node_name(&node, source);
1029
1030 let content = node.utf8_text(source.as_bytes()).ok().map(|s| {
1032 if s.len() > 500 {
1034 format!("{}...", &s[..500])
1035 } else {
1036 s.to_string()
1037 }
1038 });
1039
1040 let ast_node = CodeAST {
1042 path: String::new(),
1043 language: language.to_string(),
1044 kind: node.kind().to_string(),
1045 name,
1046 range: Range {
1047 start_row: node.start_position().row,
1048 start_column: node.start_position().column,
1049 end_row: node.end_position().row,
1050 end_column: node.end_position().column,
1051 },
1052 children: Vec::new(),
1053 content,
1054 };
1055
1056 result.push(ast_node);
1057 }
1058
1059 pub fn create_simplified_ast(
1070 &self,
1071 path: &Path,
1072 language: &str,
1073 source_code: &str,
1074 ) -> Result<CodeAST> {
1075 let limited_source = if source_code.len() > 50_000 {
1077 let truncated: String = source_code.chars().take(50_000).collect();
1079 truncated
1080 } else {
1081 source_code.to_string()
1082 };
1083
1084 let lines: Vec<&str> = limited_source.lines().collect();
1085
1086 let mut ast = CodeAST {
1088 path: path.to_string_lossy().to_string(),
1089 language: language.to_string(),
1090 kind: "file".to_string(),
1091 name: path
1092 .file_name()
1093 .and_then(|n| n.to_str())
1094 .map(|s| s.to_string()),
1095 range: Range {
1096 start_row: 0,
1097 start_column: 0,
1098 end_row: lines.len(),
1099 end_column: 0,
1100 },
1101 children: Vec::new(),
1102 content: None,
1103 };
1104
1105 for (line_num, line) in lines.iter().enumerate() {
1107 let trimmed = line.trim();
1108
1109 if trimmed.is_empty() || (trimmed.len() < 5 && !trimmed.contains('{')) {
1111 continue;
1112 }
1113
1114 if trimmed.contains(" fn ")
1116 || trimmed.contains("func ")
1117 || trimmed.contains(" class ")
1118 || trimmed.contains(" struct ")
1119 || trimmed.contains(" trait ")
1120 || trimmed.contains(" impl ")
1121 || trimmed.contains(" interface ")
1122 || trimmed.contains(" def ")
1123 || trimmed.contains(" type ")
1124 || trimmed.starts_with("fn ")
1125 || trimmed.starts_with("class ")
1126 || trimmed.starts_with("struct ")
1127 || trimmed.starts_with("trait ")
1128 || trimmed.starts_with("impl ")
1129 || trimmed.starts_with("interface ")
1130 || trimmed.starts_with("def ")
1131 || trimmed.starts_with("type ")
1132 || trimmed.starts_with("function ")
1133 || trimmed.starts_with("async ")
1134 {
1135 let kind = if trimmed.contains(" fn ")
1137 || trimmed.contains("func ")
1138 || trimmed.starts_with("fn ")
1139 || trimmed.contains(" def ")
1140 || trimmed.starts_with("def ")
1141 || trimmed.starts_with("function ")
1142 || trimmed.contains("async ")
1143 {
1144 "function"
1145 } else if trimmed.contains(" class ") || trimmed.starts_with("class ") {
1146 "class"
1147 } else if trimmed.contains(" struct ") || trimmed.starts_with("struct ") {
1148 "struct"
1149 } else if trimmed.contains(" trait ") || trimmed.starts_with("trait ") {
1150 "trait"
1151 } else if trimmed.contains(" impl ") || trimmed.starts_with("impl ") {
1152 "impl"
1153 } else if trimmed.contains(" interface ") || trimmed.starts_with("interface ") {
1154 "interface"
1155 } else if trimmed.contains(" type ") || trimmed.starts_with("type ") {
1156 "type"
1157 } else {
1158 "block"
1159 };
1160
1161 let words: Vec<&str> = trimmed.split_whitespace().collect();
1163 let mut name = None;
1164
1165 if words.len() > 1 {
1167 let name_word_idx = match kind {
1168 "function" => {
1169 if trimmed.contains(" fn ") {
1170 words.iter().position(|&w| w == "fn").map(|p| p + 1)
1171 } else if trimmed.contains(" def ") {
1172 words.iter().position(|&w| w == "def").map(|p| p + 1)
1173 } else if trimmed.contains("func ") {
1174 words.iter().position(|&w| w == "func").map(|p| p + 1)
1175 } else if trimmed.contains(" function ") {
1176 words.iter().position(|&w| w == "function").map(|p| p + 1)
1177 } else {
1178 Some(1) }
1180 }
1181 "class" => words.iter().position(|&w| w == "class").map(|p| p + 1),
1182 "struct" => words.iter().position(|&w| w == "struct").map(|p| p + 1),
1183 "trait" => words.iter().position(|&w| w == "trait").map(|p| p + 1),
1184 "impl" => words.iter().position(|&w| w == "impl").map(|p| p + 1),
1185 "interface" => words.iter().position(|&w| w == "interface").map(|p| p + 1),
1186 "type" => words.iter().position(|&w| w == "type").map(|p| p + 1),
1187 _ => Some(1),
1188 };
1189
1190 if let Some(idx) = name_word_idx {
1191 if idx < words.len() {
1192 name = Some(
1194 words[idx]
1195 .trim_end_matches(|c| ",:;<>(){}".contains(c))
1196 .to_string(),
1197 );
1198 }
1199 }
1200 }
1201
1202 let ast_node = CodeAST {
1204 path: String::new(),
1205 language: language.to_string(),
1206 kind: kind.to_string(),
1207 name,
1208 range: Range {
1209 start_row: line_num,
1210 start_column: 0,
1211 end_row: line_num,
1212 end_column: line.len(),
1213 },
1214 children: Vec::new(),
1215 content: Some(line.to_string()),
1216 };
1217
1218 ast.children.push(ast_node);
1219 }
1220 }
1221
1222 if ast.children.len() > 30 {
1224 ast.children.truncate(30);
1225 }
1226
1227 Ok(ast)
1228 }
1229
1230 fn find_relevant_files(&self, root_dir: &Path, query: &str) -> Result<Vec<PathBuf>> {
1239 use crate::tools::fs::search::SearchTools;
1240
1241 let mut results = Vec::new();
1242
1243 let max_files = self.max_files;
1245
1246 let filter_gitignore = |path: &Path| -> bool {
1248 let walker = WalkBuilder::new(path)
1250 .hidden(false) .git_ignore(true) .build();
1253
1254 walker.flatten().any(|entry| entry.path() == path)
1256 };
1257
1258 let file_regex =
1261 regex::Regex::new(r"(?:file|in|check|view|read)\s+([a-zA-Z0-9_\-\.]+\.[a-zA-Z0-9]+)")
1262 .unwrap();
1263 let mut specific_files = Vec::new();
1264
1265 for cap in file_regex.captures_iter(query) {
1266 if let Some(file_name) = cap.get(1) {
1267 specific_files.push(format!("**/{}", file_name.as_str()));
1268 }
1269 }
1270
1271 if !specific_files.is_empty() {
1273 for pattern in &specific_files {
1274 if let Ok(matches) = SearchTools::glob_search(pattern) {
1275 for path in matches {
1276 if !results.contains(&path) && filter_gitignore(&path) {
1277 results.push(path);
1278 if results.len() >= max_files {
1279 return Ok(results);
1280 }
1281 }
1282 }
1283 }
1284 }
1285 }
1286
1287 let search_terms = self.extract_search_terms(query);
1289 if !search_terms.is_empty() {
1290 let top_terms: Vec<String> = search_terms.into_iter().take(3).collect();
1292
1293 for term in top_terms {
1294 if let Ok(grep_matches) = SearchTools::grep_search(&term, None, Some(root_dir)) {
1295 for (path, _, _) in grep_matches.into_iter().take(5) {
1297 if !results.contains(&path) && filter_gitignore(&path) {
1298 results.push(path);
1299 if results.len() >= max_files {
1300 return Ok(results);
1301 }
1302 }
1303 }
1304 }
1305 }
1306 }
1307
1308 if results.len() < max_files {
1310 let patterns = self.determine_relevant_files(query);
1312 let targeted_patterns: Vec<&String> = patterns.iter().take(5).collect();
1313
1314 for pattern in targeted_patterns {
1315 if let Ok(matches) = SearchTools::glob_search(pattern) {
1316 for path in matches.into_iter().take(5) {
1317 if !results.contains(&path) && filter_gitignore(&path) {
1318 results.push(path);
1319 if results.len() >= max_files {
1320 return Ok(results);
1321 }
1322 }
1323 }
1324 }
1325 }
1326 }
1327
1328 if results.len() < 5 {
1330 let key_project_files = vec![
1331 "**/lib.rs",
1332 "**/main.rs",
1333 "**/mod.rs",
1334 "**/Cargo.toml",
1335 "**/package.json",
1336 "**/README.md",
1337 ];
1338
1339 for pattern in key_project_files {
1340 if let Ok(matches) = SearchTools::glob_search(pattern) {
1341 for path in matches {
1342 if !results.contains(&path) && filter_gitignore(&path) {
1343 results.push(path);
1344 if results.len() >= max_files {
1345 return Ok(results);
1346 }
1347 }
1348 }
1349 }
1350 }
1351 }
1352
1353 results.sort_by(|a, b| {
1355 let a_modified = std::fs::metadata(a).and_then(|m| m.modified()).ok();
1356 let b_modified = std::fs::metadata(b).and_then(|m| m.modified()).ok();
1357 b_modified.cmp(&a_modified)
1358 });
1359
1360 Ok(results)
1361 }
1362
1363 pub fn extract_search_terms(&self, query: &str) -> Vec<String> {
1371 let mut terms = Vec::new();
1372
1373 let words: Vec<&str> = query
1375 .split_whitespace()
1376 .filter(|w| w.len() > 3) .collect();
1378
1379 for word in words {
1380 let clean_word = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '_');
1382
1383 if clean_word.len() > 3
1385 && clean_word.chars().all(|c| c.is_alphanumeric() || c == '_')
1386 && !clean_word.chars().all(|c| c.is_numeric())
1387 {
1388 let common_words = [
1390 "this",
1391 "that",
1392 "from",
1393 "what",
1394 "when",
1395 "where",
1396 "which",
1397 "find",
1398 "function",
1399 "class",
1400 "struct",
1401 "impl",
1402 "type",
1403 "interface",
1404 "const",
1405 "static",
1406 "public",
1407 "private",
1408 "protected",
1409 "export",
1410 "import",
1411 ];
1412
1413 if !common_words.contains(&clean_word.to_lowercase().as_str()) {
1414 terms.push(clean_word.to_string());
1415 }
1416 }
1417 }
1418
1419 terms
1420 }
1421
1422 pub fn parse_codebase(&mut self, root_dir: &Path, query: &str) -> Result<Vec<CodeAST>> {
1431 let relevant_files = self.find_relevant_files(root_dir, query)?;
1433
1434 let asts: Vec<Result<CodeAST>> = relevant_files
1436 .par_iter()
1437 .map(|path| {
1438 let mut local_parser = CodeParser::new()?;
1439 local_parser.parse_file(path)
1440 })
1441 .collect();
1442
1443 let valid_asts: Vec<CodeAST> = asts
1445 .into_iter()
1446 .filter_map(|ast_result| {
1447 ast_result.ok()
1450 })
1451 .collect();
1452
1453 Ok(valid_asts)
1454 }
1455
1456 pub fn generate_llm_friendly_ast(&mut self, root_dir: &Path, query: &str) -> Result<String> {
1465 let mut asts = if root_dir.is_file() {
1467 let ast = self.parse_file(root_dir)?;
1469 vec![ast]
1470 } else {
1471 self.parse_codebase(root_dir, query)?
1473 };
1474
1475 if asts.is_empty() {
1477 return Ok(String::from("No relevant code structures found for the query. Try to be more specific about what code you're looking for."));
1478 }
1479
1480 asts.sort_by(|a, b| {
1482 let a_path = Path::new(&a.path);
1483 let b_path = Path::new(&b.path);
1484
1485 let a_modified = std::fs::metadata(a_path).and_then(|m| m.modified()).ok();
1486 let b_modified = std::fs::metadata(b_path).and_then(|m| m.modified()).ok();
1487
1488 b_modified.cmp(&a_modified)
1489 });
1490
1491 if asts.len() > 10 {
1493 asts.truncate(10);
1494 }
1495
1496 let mut structured_output = String::new();
1498 structured_output.push_str(&format!(
1499 "# Code Structure Analysis for Query: \"{}\"
1500
1501",
1502 query
1503 ));
1504
1505 structured_output.push_str(&format!(
1507 "## Codebase Structure Overview
1508
1509{} relevant files found. Showing hierarchical breakdown:
1510
1511",
1512 asts.len()
1513 ));
1514
1515 for ast in &asts {
1517 structured_output.push_str(&format!("### File: {}\n", ast.path));
1519 structured_output.push_str(&format!("Language: {}\n\n", ast.language));
1520
1521 let mut ordered_children = ast.children.clone();
1523 ordered_children.sort_by_key(|child| child.range.start_row);
1524
1525 let mut seen_types = HashSet::new();
1527
1528 for child in &ordered_children {
1530 let name = child.name.as_deref().unwrap_or("anonymous");
1531
1532 let type_name_key = format!("{}:{}", child.kind, name);
1534 if seen_types.contains(&type_name_key) {
1535 continue;
1536 }
1537 seen_types.insert(type_name_key);
1538
1539 structured_output.push_str(&format!(
1540 "- {} `{}` (line {})\n",
1541 child.kind,
1542 name,
1543 child.range.start_row + 1
1544 ));
1545
1546 if let Some(content) = &child.content {
1548 let preview = content.lines().next().unwrap_or("");
1550 if !preview.is_empty() {
1551 structured_output
1552 .push_str(&format!(" ```{}\n {}\n ```\n", ast.language, preview));
1553 }
1554 }
1555
1556 if !child.children.is_empty() {
1558 for nested_child in &child.children {
1559 if let Some(nested_name) = &nested_child.name {
1560 structured_output.push_str(&format!(
1561 " - {} `{}` (line {})\n",
1562 nested_child.kind,
1563 nested_name,
1564 nested_child.range.start_row + 1
1565 ));
1566 }
1567 }
1568 }
1569 }
1570
1571 structured_output.push('\n');
1572 }
1573
1574 structured_output.push_str("## Symbol Table\n\n");
1576 structured_output.push_str("| Type | Name | File | Line |\n");
1577 structured_output.push_str("|------|------|------|------|\n");
1578
1579 let mut all_symbols = Vec::new();
1581 for ast in &asts {
1582 for child in &ast.children {
1583 if let Some(name) = &child.name {
1584 if name == "anonymous" || name.is_empty() {
1586 continue;
1587 }
1588
1589 all_symbols.push((
1590 child.kind.clone(),
1591 name.clone(),
1592 ast.path.clone(),
1593 child.range.start_row + 1,
1594 ));
1595 }
1596 }
1597 }
1598
1599 all_symbols.sort_by(|a, b| {
1601 let type_cmp = a.0.cmp(&b.0);
1602 if type_cmp == std::cmp::Ordering::Equal {
1603 a.1.cmp(&b.1)
1604 } else {
1605 type_cmp
1606 }
1607 });
1608
1609 for (kind, name, file, line) in all_symbols {
1611 let file_name = Path::new(&file)
1613 .file_name()
1614 .and_then(|n| n.to_str())
1615 .unwrap_or("unknown");
1616
1617 structured_output.push_str(&format!(
1618 "| {} | `{}` | {} | {} |\n",
1619 kind, name, file_name, line
1620 ));
1621 }
1622
1623 structured_output.push_str("\n## Symbol Relationships\n\n");
1625 structured_output.push_str("This section shows relationships between code elements:\n\n");
1626
1627 let mut relationships = Vec::new();
1629
1630 for ast in &asts {
1631 if ast.language == "rust" {
1633 for child in &ast.children {
1634 if child.kind == "impl" {
1635 if let Some(name) = &child.name {
1636 relationships.push(format!(
1637 "- `{}` implements trait/functionality for type `{}`",
1638 ast.path, name
1639 ));
1640 }
1641 }
1642 }
1643 }
1644
1645 }
1648
1649 if !relationships.is_empty() {
1650 for relationship in relationships {
1651 structured_output.push_str(&format!("{}\n", relationship));
1652 }
1653 } else {
1654 structured_output.push_str("No clear relationships detected between symbols.\n");
1655 }
1656
1657 structured_output.push_str("\n## AST Summary\n\n");
1660 structured_output.push_str(&format!(
1662 "Analyzed {} files containing {} total code structures.\n",
1663 asts.len(),
1664 asts.iter().map(|ast| ast.children.len()).sum::<usize>()
1665 ));
1666
1667 Ok(structured_output)
1668 }
1669
1670 pub fn determine_relevant_files(&self, query: &str) -> Vec<String> {
1678 let mut patterns = Vec::new();
1679
1680 let file_regex = regex::Regex::new(r#"['"](\S+\.\w+)['"]"#).unwrap();
1682 for cap in file_regex.captures_iter(query) {
1683 if let Some(file_match) = cap.get(1) {
1684 let file_pattern = format!("**/{}", file_match.as_str());
1685 patterns.push(file_pattern);
1686 }
1687 }
1688
1689 let query_lower = query.to_lowercase();
1691
1692 if query_lower.contains("rust") || query_lower.contains(".rs") {
1694 patterns.push("**/*.rs".to_string());
1695 patterns.push("**/src/**/*.rs".to_string());
1696 patterns.push("**/lib.rs".to_string());
1697 patterns.push("**/main.rs".to_string());
1698 }
1699
1700 if query_lower.contains("javascript")
1702 || query_lower.contains("js")
1703 || query_lower.contains("node")
1704 || query_lower.contains("react")
1705 {
1706 patterns.push("**/*.js".to_string());
1707 patterns.push("**/*.jsx".to_string());
1708 patterns.push("**/src/**/*.js".to_string());
1709 patterns.push("**/src/**/*.jsx".to_string());
1710 }
1711
1712 if query_lower.contains("typescript")
1714 || query_lower.contains("ts")
1715 || query_lower.contains("angular")
1716 || query_lower.contains("next")
1717 {
1718 patterns.push("**/*.ts".to_string());
1719 patterns.push("**/*.tsx".to_string());
1720 patterns.push("**/src/**/*.ts".to_string());
1721 patterns.push("**/src/**/*.tsx".to_string());
1722 }
1723
1724 if query_lower.contains("python")
1726 || query_lower.contains("py")
1727 || query_lower.contains("django")
1728 || query_lower.contains("flask")
1729 {
1730 patterns.push("**/*.py".to_string());
1731 patterns.push("**/src/**/*.py".to_string());
1732 }
1733
1734 if query_lower.contains("go") || query_lower.contains("golang") {
1736 patterns.push("**/*.go".to_string());
1737 patterns.push("**/src/**/*.go".to_string());
1738 }
1739
1740 if query_lower.contains("c++")
1742 || query_lower.contains("cpp")
1743 || query_lower.contains(" c ")
1744 || query_lower.contains(".c")
1745 {
1746 patterns.push("**/*.c".to_string());
1747 patterns.push("**/*.h".to_string());
1748 patterns.push("**/*.cpp".to_string());
1749 patterns.push("**/*.hpp".to_string());
1750 patterns.push("**/*.cc".to_string());
1751 }
1752
1753 if query_lower.contains("java") && !query_lower.contains("javascript") {
1755 patterns.push("**/*.java".to_string());
1756 patterns.push("**/src/**/*.java".to_string());
1757 }
1758
1759 if patterns.is_empty() || !patterns.iter().any(|p| p.starts_with("**/src/")) {
1761 patterns.push("**/src/**/*.rs".to_string());
1762 patterns.push("**/src/**/*.ts".to_string());
1763 patterns.push("**/src/**/*.js".to_string());
1764 patterns.push("**/src/**/*.py".to_string());
1765 }
1766
1767 if !patterns.iter().any(|p| p.ends_with(".rs")) {
1769 patterns.push("**/*.rs".to_string());
1770 }
1771
1772 patterns
1773 }
1774}