1use anyhow::Result;
2use ck_core::Span;
3use serde::{Deserialize, Serialize};
4
5#[derive(Debug, Clone, Serialize, Deserialize)]
6pub struct Chunk {
7 pub span: Span,
8 pub text: String,
9 pub chunk_type: ChunkType,
10}
11
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub enum ChunkType {
14 Text,
15 Function,
16 Class,
17 Method,
18 Module,
19}
20
21pub fn chunk_text(text: &str, language: Option<&str>) -> Result<Vec<Chunk>> {
22 tracing::debug!("Chunking text with language: {:?}, length: {} chars", language, text.len());
23
24 let result = match language {
25 Some("python") => {
26 tracing::debug!("Using Python tree-sitter parser");
27 chunk_python(text)
28 },
29 Some("typescript") | Some("javascript") => {
30 tracing::debug!("Using TypeScript/JavaScript tree-sitter parser");
31 chunk_typescript(text)
32 },
33 _ => {
34 tracing::debug!("Using generic chunking strategy");
35 chunk_generic(text)
36 },
37 };
38
39 match &result {
40 Ok(chunks) => tracing::debug!("Successfully created {} chunks", chunks.len()),
41 Err(e) => tracing::warn!("Chunking failed: {}", e),
42 }
43
44 result
45}
46
47fn chunk_generic(text: &str) -> Result<Vec<Chunk>> {
48 let mut chunks = Vec::new();
49 let lines: Vec<&str> = text.lines().collect();
50 let chunk_size = 20;
51 let overlap = 5;
52
53 let mut i = 0;
54 while i < lines.len() {
55 let end = (i + chunk_size).min(lines.len());
56 let chunk_lines = &lines[i..end];
57 let chunk_text = chunk_lines.join("\n");
58
59 let byte_start = lines[0..i].iter().map(|l| l.len() + 1).sum::<usize>();
60 let byte_end = byte_start + chunk_text.len();
61
62 chunks.push(Chunk {
63 span: Span {
64 byte_start,
65 byte_end,
66 line_start: i + 1,
67 line_end: end,
68 },
69 text: chunk_text,
70 chunk_type: ChunkType::Text,
71 });
72
73 i += chunk_size - overlap;
74 if i >= lines.len() {
75 break;
76 }
77 }
78
79 Ok(chunks)
80}
81
82fn chunk_python(text: &str) -> Result<Vec<Chunk>> {
83 let mut parser = tree_sitter::Parser::new();
84 parser.set_language(tree_sitter_python::language())?;
85
86 let tree = parser.parse(text, None).ok_or_else(|| {
87 anyhow::anyhow!("Failed to parse Python code")
88 })?;
89
90 let mut chunks = Vec::new();
91 let mut cursor = tree.root_node().walk();
92
93 extract_code_chunks(&mut cursor, text, &mut chunks, "python");
94
95 if chunks.is_empty() {
96 return chunk_generic(text);
97 }
98
99 Ok(chunks)
100}
101
102fn chunk_typescript(text: &str) -> Result<Vec<Chunk>> {
103 let mut parser = tree_sitter::Parser::new();
104 parser.set_language(tree_sitter_typescript::language_typescript())?;
105
106 let tree = parser.parse(text, None).ok_or_else(|| {
107 anyhow::anyhow!("Failed to parse TypeScript code")
108 })?;
109
110 let mut chunks = Vec::new();
111 let mut cursor = tree.root_node().walk();
112
113 extract_code_chunks(&mut cursor, text, &mut chunks, "typescript");
114
115 if chunks.is_empty() {
116 return chunk_generic(text);
117 }
118
119 Ok(chunks)
120}
121
122fn extract_code_chunks(
123 cursor: &mut tree_sitter::TreeCursor,
124 source: &str,
125 chunks: &mut Vec<Chunk>,
126 language: &str,
127) {
128 let node = cursor.node();
129 let node_kind = node.kind();
130
131 let is_chunk = match language {
132 "python" => matches!(node_kind, "function_definition" | "class_definition"),
133 "typescript" | "javascript" => matches!(
134 node_kind,
135 "function_declaration" | "class_declaration" | "method_definition" | "arrow_function"
136 ),
137 _ => false,
138 };
139
140 if is_chunk {
141 let start_byte = node.start_byte();
142 let end_byte = node.end_byte();
143 let start_pos = node.start_position();
144 let end_pos = node.end_position();
145
146 let text = &source[start_byte..end_byte];
147
148 let chunk_type = match node_kind {
149 "function_definition" | "function_declaration" | "arrow_function" => ChunkType::Function,
150 "class_definition" | "class_declaration" => ChunkType::Class,
151 "method_definition" => ChunkType::Method,
152 _ => ChunkType::Text,
153 };
154
155 chunks.push(Chunk {
156 span: Span {
157 byte_start: start_byte,
158 byte_end: end_byte,
159 line_start: start_pos.row + 1,
160 line_end: end_pos.row + 1,
161 },
162 text: text.to_string(),
163 chunk_type,
164 });
165 }
166
167 if cursor.goto_first_child() {
168 loop {
169 extract_code_chunks(cursor, source, chunks, language);
170 if !cursor.goto_next_sibling() {
171 break;
172 }
173 }
174 cursor.goto_parent();
175 }
176}