qex_core/chunk/languages/
mod.rs1pub mod c;
2pub mod cpp;
3pub mod csharp;
4pub mod go;
5pub mod java;
6pub mod javascript;
7pub mod markdown;
8pub mod python;
9pub mod rust_lang;
10pub mod typescript;
11
12use crate::chunk::ChunkType;
13
/// Descriptive details gathered for a single syntax node during chunking.
///
/// The type derives `Default`, so a producer can set only the fields that
/// apply and leave the rest at their empty values.
#[derive(Debug, Clone, Default)]
pub struct NodeMetadata {
    /// Name of the node, when one could be determined.
    pub name: Option<String>,
    /// Documentation text associated with the node, if any.
    pub docstring: Option<String>,
    /// Decorator strings attached to the node; empty when there are none.
    pub decorators: Vec<String>,
    /// Whether the node was identified as asynchronous.
    pub is_async: bool,
    /// Whether the node was identified as a generator.
    pub is_generator: bool,
    /// Receiver type for the node, if any (presumably used for method-like
    /// nodes; confirm against the per-language chunkers).
    pub receiver_type: Option<String>,
}
24
25pub trait LanguageChunker: Send + Sync {
27 fn tree_sitter_language(&self) -> tree_sitter::Language;
29
30 fn language_name(&self) -> &str;
32
33 fn file_extensions(&self) -> &[&str];
35
36 fn is_splittable(&self, node_type: &str) -> bool;
38
39 fn has_nested_chunks(&self, node_type: &str) -> bool;
41
42 fn classify_node(&self, node_type: &str, parent_name: Option<&str>) -> ChunkType;
44
45 fn extract_metadata(&self, node: tree_sitter::Node, source: &str) -> NodeMetadata;
47}
48
49pub fn find_child_text<'a>(
51 node: tree_sitter::Node<'a>,
52 source: &'a str,
53 child_type: &str,
54) -> Option<String> {
55 let mut cursor = node.walk();
56 for child in node.children(&mut cursor) {
57 if child.kind() == child_type {
58 let text = &source[child.start_byte()..child.end_byte()];
59 return Some(text.to_string());
60 }
61 }
62 None
63}
64
65pub fn find_name(node: tree_sitter::Node, source: &str) -> Option<String> {
67 find_child_text(node, source, "identifier")
68 .or_else(|| find_child_text(node, source, "type_identifier"))
69 .or_else(|| find_child_text(node, source, "property_identifier"))
70}
71
72pub fn extract_docstring_from_body(node: tree_sitter::Node, source: &str) -> Option<String> {
74 let mut cursor = node.walk();
76 for child in node.children(&mut cursor) {
77 let kind = child.kind();
78 if kind == "block" || kind == "body" || kind == "class_body" || kind == "declaration_list" {
79 let first_stmt = child.child(0);
80 if let Some(first_stmt) = first_stmt {
81 if first_stmt.kind() == "expression_statement" {
82 let expr = first_stmt.child(0);
83 if let Some(expr) = expr {
84 if expr.kind() == "string" || expr.kind() == "string_literal" {
85 let text = &source[expr.start_byte()..expr.end_byte()];
86 return Some(text.trim_matches('"').trim_matches('\'').to_string());
87 }
88 }
89 }
90 }
91 }
92 if kind == "comment" || kind == "line_comment" || kind == "block_comment" {
94 let text = &source[child.start_byte()..child.end_byte()];
95 return Some(text.to_string());
96 }
97 }
98 None
99}
100
101pub fn extract_preceding_comments(node: tree_sitter::Node, source: &str) -> Option<String> {
103 let mut comments = Vec::new();
104 let mut sibling = node.prev_sibling();
105
106 while let Some(sib) = sibling {
107 let kind = sib.kind();
108 if kind == "comment" || kind == "line_comment" || kind == "block_comment" {
109 let text = &source[sib.start_byte()..sib.end_byte()];
110 comments.push(text.to_string());
111 sibling = sib.prev_sibling();
112 } else {
113 break;
114 }
115 }
116
117 if comments.is_empty() {
118 None
119 } else {
120 comments.reverse();
121 Some(comments.join("\n"))
122 }
123}
124
125pub fn all_chunkers() -> Vec<Box<dyn LanguageChunker>> {
127 vec![
128 Box::new(python::PythonChunker),
129 Box::new(javascript::JavaScriptChunker),
130 Box::new(typescript::TypeScriptChunker),
131 Box::new(typescript::TsxChunker),
132 Box::new(rust_lang::RustChunker),
133 Box::new(go::GoChunker),
134 Box::new(java::JavaChunker),
135 Box::new(c::CChunker),
136 Box::new(cpp::CppChunker),
137 Box::new(csharp::CSharpChunker),
138 Box::new(markdown::MarkdownChunker),
139 ]
140}