Skip to main content

qex_core/chunk/languages/
mod.rs

1pub mod c;
2pub mod cpp;
3pub mod csharp;
4pub mod go;
5pub mod java;
6pub mod javascript;
7pub mod markdown;
8pub mod python;
9pub mod rust_lang;
10pub mod typescript;
11
12use crate::chunk::ChunkType;
13
/// Metadata extracted from a tree-sitter node.
///
/// Every field defaults to an "absent" value (`None`, an empty vector, or `false`),
/// so a value can be built with struct-update syntax:
/// `NodeMetadata { name: Some(n), ..Default::default() }`.
///
/// The type derives `PartialEq`/`Eq`, so two values can be compared field by field
/// (useful in tests and when de-duplicating results).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct NodeMetadata {
    /// Name found for the node, if any.
    pub name: Option<String>,
    /// Documentation text found for the node, if any.
    pub docstring: Option<String>,
    /// Decorator texts attached to the node; empty when there are none.
    pub decorators: Vec<String>,
    /// Whether the node was recognized as asynchronous.
    pub is_async: bool,
    /// Whether the node was recognized as a generator.
    pub is_generator: bool,
    /// Receiver type for the node, if any.
    // NOTE(review): presumably set for method-like nodes with an explicit receiver
    // (e.g. Go methods) — confirm against the individual language chunkers.
    pub receiver_type: Option<String>,
}
24
25/// Trait for language-specific code chunking behavior
26pub trait LanguageChunker: Send + Sync {
27    /// Return the tree-sitter Language for this chunker
28    fn tree_sitter_language(&self) -> tree_sitter::Language;
29
30    /// Language name identifier
31    fn language_name(&self) -> &str;
32
33    /// File extensions this chunker handles
34    fn file_extensions(&self) -> &[&str];
35
36    /// Whether this node type should be extracted as a chunk
37    fn is_splittable(&self, node_type: &str) -> bool;
38
39    /// Whether this node type can contain nested splittable nodes (e.g., class body)
40    fn has_nested_chunks(&self, node_type: &str) -> bool;
41
42    /// Classify a node type into a ChunkType
43    fn classify_node(&self, node_type: &str, parent_name: Option<&str>) -> ChunkType;
44
45    /// Extract language-specific metadata from a node
46    fn extract_metadata(&self, node: tree_sitter::Node, source: &str) -> NodeMetadata;
47}
48
49/// Helper to find the first named child of a given type and extract its text
50pub fn find_child_text<'a>(
51    node: tree_sitter::Node<'a>,
52    source: &'a str,
53    child_type: &str,
54) -> Option<String> {
55    let mut cursor = node.walk();
56    for child in node.children(&mut cursor) {
57        if child.kind() == child_type {
58            let text = &source[child.start_byte()..child.end_byte()];
59            return Some(text.to_string());
60        }
61    }
62    None
63}
64
65/// Helper to find identifier name from a node
66pub fn find_name(node: tree_sitter::Node, source: &str) -> Option<String> {
67    find_child_text(node, source, "identifier")
68        .or_else(|| find_child_text(node, source, "type_identifier"))
69        .or_else(|| find_child_text(node, source, "property_identifier"))
70}
71
72/// Extract a docstring from the node (looks for preceding comment or first string child)
73pub fn extract_docstring_from_body(node: tree_sitter::Node, source: &str) -> Option<String> {
74    // Look for a comment or string_content in the first child of the body
75    let mut cursor = node.walk();
76    for child in node.children(&mut cursor) {
77        let kind = child.kind();
78        if kind == "block" || kind == "body" || kind == "class_body" || kind == "declaration_list" {
79            let first_stmt = child.child(0);
80            if let Some(first_stmt) = first_stmt {
81                if first_stmt.kind() == "expression_statement" {
82                    let expr = first_stmt.child(0);
83                    if let Some(expr) = expr {
84                        if expr.kind() == "string" || expr.kind() == "string_literal" {
85                            let text = &source[expr.start_byte()..expr.end_byte()];
86                            return Some(text.trim_matches('"').trim_matches('\'').to_string());
87                        }
88                    }
89                }
90            }
91        }
92        // Check for doc comments preceding the node
93        if kind == "comment" || kind == "line_comment" || kind == "block_comment" {
94            let text = &source[child.start_byte()..child.end_byte()];
95            return Some(text.to_string());
96        }
97    }
98    None
99}
100
101/// Extract preceding doc comments for a node
102pub fn extract_preceding_comments(node: tree_sitter::Node, source: &str) -> Option<String> {
103    let mut comments = Vec::new();
104    let mut sibling = node.prev_sibling();
105
106    while let Some(sib) = sibling {
107        let kind = sib.kind();
108        if kind == "comment" || kind == "line_comment" || kind == "block_comment" {
109            let text = &source[sib.start_byte()..sib.end_byte()];
110            comments.push(text.to_string());
111            sibling = sib.prev_sibling();
112        } else {
113            break;
114        }
115    }
116
117    if comments.is_empty() {
118        None
119    } else {
120        comments.reverse();
121        Some(comments.join("\n"))
122    }
123}
124
125/// Get all supported language chunkers
126pub fn all_chunkers() -> Vec<Box<dyn LanguageChunker>> {
127    vec![
128        Box::new(python::PythonChunker),
129        Box::new(javascript::JavaScriptChunker),
130        Box::new(typescript::TypeScriptChunker),
131        Box::new(typescript::TsxChunker),
132        Box::new(rust_lang::RustChunker),
133        Box::new(go::GoChunker),
134        Box::new(java::JavaChunker),
135        Box::new(c::CChunker),
136        Box::new(cpp::CppChunker),
137        Box::new(csharp::CSharpChunker),
138        Box::new(markdown::MarkdownChunker),
139    ]
140}