ck_chunk/
lib.rs

1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct Chunk {
7    pub span: Span,
8    pub text: String,
9    pub chunk_type: ChunkType,
10}
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub enum ChunkType {
14    Text,
15    Function,
16    Class,
17    Method,
18    Module,
19}
20
21pub fn chunk_text(text: &str, language: Option<&str>) -> Result<Vec<Chunk>> {
22    tracing::debug!("Chunking text with language: {:?}, length: {} chars", language, text.len());
23    
24    let result = match language {
25        Some("python") => {
26            tracing::debug!("Using Python tree-sitter parser");
27            chunk_python(text)
28        },
29        Some("typescript") | Some("javascript") => {
30            tracing::debug!("Using TypeScript/JavaScript tree-sitter parser");
31            chunk_typescript(text)
32        },
33        _ => {
34            tracing::debug!("Using generic chunking strategy");
35            chunk_generic(text)
36        },
37    };
38    
39    match &result {
40        Ok(chunks) => tracing::debug!("Successfully created {} chunks", chunks.len()),
41        Err(e) => tracing::warn!("Chunking failed: {}", e),
42    }
43    
44    result
45}
46
47fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
48    let mut chunks = Vec::new();
49    let lines: Vec<&str> = text.lines().collect();
50    let chunk_size = 20;
51    let overlap = 5;
52    
53    let mut i = 0;
54    while i < lines.len() {
55        let end = (i + chunk_size).min(lines.len());
56        let chunk_lines = &lines[i..end];
57        let chunk_text = chunk_lines.join("\n");
58        
59        let byte_start = lines[0..i].iter().map(|l| l.len() + 1).sum::<usize>();
60        let byte_end = byte_start + chunk_text.len();
61        
62        chunks.push(Chunk {
63            span: Span {
64                byte_start,
65                byte_end,
66                line_start: i + 1,
67                line_end: end,
68            },
69            text: chunk_text,
70            chunk_type: ChunkType::Text,
71        });
72        
73        i += chunk_size - overlap;
74        if i >= lines.len() {
75            break;
76        }
77    }
78    
79    Ok(chunks)
80}
81
82fn chunk_python(text: &str) -> Result<Vec<Chunk>> {
83    let mut parser = tree_sitter::Parser::new();
84    parser.set_language(tree_sitter_python::language())?;
85    
86    let tree = parser.parse(text, None).ok_or_else(|| {
87        anyhow::anyhow!("Failed to parse Python code")
88    })?;
89    
90    let mut chunks = Vec::new();
91    let mut cursor = tree.root_node().walk();
92    
93    extract_code_chunks(&mut cursor, text, &mut chunks, "python");
94    
95    if chunks.is_empty() {
96        return chunk_generic(text);
97    }
98    
99    Ok(chunks)
100}
101
102fn chunk_typescript(text: &str) -> Result<Vec<Chunk>> {
103    let mut parser = tree_sitter::Parser::new();
104    parser.set_language(tree_sitter_typescript::language_typescript())?;
105    
106    let tree = parser.parse(text, None).ok_or_else(|| {
107        anyhow::anyhow!("Failed to parse TypeScript code")
108    })?;
109    
110    let mut chunks = Vec::new();
111    let mut cursor = tree.root_node().walk();
112    
113    extract_code_chunks(&mut cursor, text, &mut chunks, "typescript");
114    
115    if chunks.is_empty() {
116        return chunk_generic(text);
117    }
118    
119    Ok(chunks)
120}
121
122fn extract_code_chunks(
123    cursor: &mut tree_sitter::TreeCursor,
124    source: &str,
125    chunks: &mut Vec<Chunk>,
126    language: &str,
127) {
128    let node = cursor.node();
129    let node_kind = node.kind();
130    
131    let is_chunk = match language {
132        "python" => matches!(node_kind, "function_definition" | "class_definition"),
133        "typescript" | "javascript" => matches!(
134            node_kind,
135            "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
136        ),
137        _ => false,
138    };
139    
140    if is_chunk {
141        let start_byte = node.start_byte();
142        let end_byte = node.end_byte();
143        let start_pos = node.start_position();
144        let end_pos = node.end_position();
145        
146        let text = &source[start_byte..end_byte];
147        
148        let chunk_type = match node_kind {
149            "function_definition" | "function_declaration" | "arrow_function" => ChunkType::Function,
150            "class_definition" | "class_declaration" => ChunkType::Class,
151            "method_definition" => ChunkType::Method,
152            _ => ChunkType::Text,
153        };
154        
155        chunks.push(Chunk {
156            span: Span {
157                byte_start: start_byte,
158                byte_end: end_byte,
159                line_start: start_pos.row + 1,
160                line_end: end_pos.row + 1,
161            },
162            text: text.to_string(),
163            chunk_type,
164        });
165    }
166    
167    if cursor.goto_first_child() {
168        loop {
169            extract_code_chunks(cursor, source, chunks, language);
170            if !cursor.goto_next_sibling() {
171                break;
172            }
173        }
174        cursor.goto_parent();
175    }
176}