1mod strategies;
7
8use crate::config::ChunkingConfig;
9use crate::error::Result;
10use crate::types::{Chunk, ChunkId, ChunkType, Document, Language};
11
/// Strategy for splitting the text of a [`Document`] into [`Chunk`]s.
///
/// Every implementor must also be `Send + Sync`.
pub trait Chunker: Send + Sync {
    /// Splits `content`, the text associated with `document`, into chunks.
    fn chunk(&self, document: &Document, content: &str) -> Result<Vec<Chunk>>;

    /// Returns a static name for this chunker implementation.
    fn name(&self) -> &'static str;
}
20
/// [`Chunker`] that selects a line-based extraction heuristic from the document's language.
pub struct DefaultChunker {
    /// Size limits (`min_chunk_size`, `max_chunk_size`) consulted while chunking.
    config: ChunkingConfig,
}
25
26impl DefaultChunker {
27 pub fn new(config: ChunkingConfig) -> Result<Self> {
29 Ok(Self { config })
30 }
31}
32
33impl Chunker for DefaultChunker {
34 fn chunk(&self, document: &Document, content: &str) -> Result<Vec<Chunk>> {
35 if content.len() < self.config.min_chunk_size {
37 return Ok(vec![]);
38 }
39
40 let chunks = extract_chunks_by_language(document, content, &self.config);
42
43 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
45 return Ok(vec![Chunk {
46 id: ChunkId::new(),
47 document_id: document.id.clone(),
48 content: content.to_string(),
49 chunk_type: ChunkType::Block,
50 start_line: 1,
51 end_line: content.lines().count() as u32,
52 start_byte: 0,
53 end_byte: content.len(),
54 symbol_name: None,
55 parent_symbol: None,
56 }]);
57 }
58
59 Ok(chunks)
60 }
61
62 fn name(&self) -> &'static str {
63 "default"
64 }
65}
66
67fn extract_chunks_by_language(
69 document: &Document,
70 content: &str,
71 config: &ChunkingConfig,
72) -> Vec<Chunk> {
73 match document.language {
74 Language::Rust => extract_rust_chunks(document, content, config),
75 Language::Python => extract_python_chunks(document, content, config),
76 Language::JavaScript | Language::TypeScript => {
77 extract_js_chunks(document, content, config)
78 }
79 Language::Go => extract_go_chunks(document, content, config),
80 Language::Java => extract_java_chunks(document, content, config),
81 _ => extract_generic_chunks(document, content, config),
82 }
83}
84
85fn extract_rust_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
87 let mut chunks = Vec::new();
88 let lines: Vec<&str> = content.lines().collect();
89
90 let mut i = 0;
91 while i < lines.len() {
92 let line = lines[i].trim();
93
94 let (chunk_type, name) = if line.starts_with("pub fn ")
96 || line.starts_with("fn ")
97 || line.starts_with("pub async fn ")
98 || line.starts_with("async fn ")
99 {
100 (ChunkType::Function, extract_name_after(line, "fn "))
101 } else if line.starts_with("pub struct ") || line.starts_with("struct ") {
102 (ChunkType::Struct, extract_name_after(line, "struct "))
103 } else if line.starts_with("pub enum ") || line.starts_with("enum ") {
104 (ChunkType::Enum, extract_name_after(line, "enum "))
105 } else if line.starts_with("impl ") {
106 (ChunkType::Implementation, extract_impl_name(line))
107 } else if line.starts_with("pub trait ") || line.starts_with("trait ") {
108 (ChunkType::Interface, extract_name_after(line, "trait "))
109 } else if line.starts_with("mod ") || line.starts_with("pub mod ") {
110 (ChunkType::Module, extract_name_after(line, "mod "))
111 } else {
112 i += 1;
113 continue;
114 };
115
116 let start_line = i;
118 let end_line = find_block_end(&lines, i);
119
120 let chunk_content: String = lines[start_line..=end_line].join("\n");
121
122 if chunk_content.len() >= config.min_chunk_size
123 && chunk_content.len() <= config.max_chunk_size
124 {
125 let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
126 let end_byte = start_byte + chunk_content.len();
127
128 chunks.push(Chunk {
129 id: ChunkId::new(),
130 document_id: document.id.clone(),
131 content: chunk_content,
132 chunk_type,
133 start_line: start_line as u32 + 1,
134 end_line: end_line as u32 + 1,
135 start_byte,
136 end_byte,
137 symbol_name: name,
138 parent_symbol: None,
139 });
140 }
141
142 i = end_line + 1;
143 }
144
145 chunks
146}
147
148fn extract_python_chunks(
150 document: &Document,
151 content: &str,
152 config: &ChunkingConfig,
153) -> Vec<Chunk> {
154 let mut chunks = Vec::new();
155 let lines: Vec<&str> = content.lines().collect();
156
157 let mut i = 0;
158 while i < lines.len() {
159 let line = lines[i];
160 let trimmed = line.trim();
161
162 let (chunk_type, name) = if trimmed.starts_with("def ")
163 || trimmed.starts_with("async def ")
164 {
165 (ChunkType::Function, extract_name_after(trimmed, "def "))
166 } else if trimmed.starts_with("class ") {
167 (ChunkType::Class, extract_name_after(trimmed, "class "))
168 } else {
169 i += 1;
170 continue;
171 };
172
173 let indent = line.len() - line.trim_start().len();
175 let start_line = i;
176 let end_line = find_python_block_end(&lines, i, indent);
177
178 let chunk_content: String = lines[start_line..=end_line].join("\n");
179
180 if chunk_content.len() >= config.min_chunk_size
181 && chunk_content.len() <= config.max_chunk_size
182 {
183 let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
184 let end_byte = start_byte + chunk_content.len();
185
186 chunks.push(Chunk {
187 id: ChunkId::new(),
188 document_id: document.id.clone(),
189 content: chunk_content,
190 chunk_type,
191 start_line: start_line as u32 + 1,
192 end_line: end_line as u32 + 1,
193 start_byte,
194 end_byte,
195 symbol_name: name,
196 parent_symbol: None,
197 });
198 }
199
200 i = end_line + 1;
201 }
202
203 chunks
204}
205
206fn extract_js_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
208 let mut chunks = Vec::new();
209 let lines: Vec<&str> = content.lines().collect();
210
211 let mut i = 0;
212 while i < lines.len() {
213 let line = lines[i].trim();
214
215 let (chunk_type, name) = if line.starts_with("function ")
216 || line.starts_with("async function ")
217 || line.starts_with("export function ")
218 || line.starts_with("export async function ")
219 {
220 (ChunkType::Function, extract_name_after(line, "function "))
221 } else if line.starts_with("class ") || line.starts_with("export class ") {
222 (ChunkType::Class, extract_name_after(line, "class "))
223 } else if line.starts_with("interface ") || line.starts_with("export interface ") {
224 (ChunkType::Interface, extract_name_after(line, "interface "))
225 } else if line.contains("=>") && (line.starts_with("const ") || line.starts_with("export const ")) {
226 (ChunkType::Function, extract_name_after(line, "const "))
227 } else {
228 i += 1;
229 continue;
230 };
231
232 let start_line = i;
233 let end_line = find_block_end(&lines, i);
234
235 let chunk_content: String = lines[start_line..=end_line].join("\n");
236
237 if chunk_content.len() >= config.min_chunk_size
238 && chunk_content.len() <= config.max_chunk_size
239 {
240 let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
241 let end_byte = start_byte + chunk_content.len();
242
243 chunks.push(Chunk {
244 id: ChunkId::new(),
245 document_id: document.id.clone(),
246 content: chunk_content,
247 chunk_type,
248 start_line: start_line as u32 + 1,
249 end_line: end_line as u32 + 1,
250 start_byte,
251 end_byte,
252 symbol_name: name,
253 parent_symbol: None,
254 });
255 }
256
257 i = end_line + 1;
258 }
259
260 chunks
261}
262
263fn extract_go_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
265 let mut chunks = Vec::new();
266 let lines: Vec<&str> = content.lines().collect();
267
268 let mut i = 0;
269 while i < lines.len() {
270 let line = lines[i].trim();
271
272 let (chunk_type, name) = if line.starts_with("func ") {
273 (ChunkType::Function, extract_go_func_name(line))
274 } else if line.starts_with("type ") && line.contains("struct") {
275 (ChunkType::Struct, extract_name_after(line, "type "))
276 } else if line.starts_with("type ") && line.contains("interface") {
277 (ChunkType::Interface, extract_name_after(line, "type "))
278 } else {
279 i += 1;
280 continue;
281 };
282
283 let start_line = i;
284 let end_line = find_block_end(&lines, i);
285
286 let chunk_content: String = lines[start_line..=end_line].join("\n");
287
288 if chunk_content.len() >= config.min_chunk_size
289 && chunk_content.len() <= config.max_chunk_size
290 {
291 let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
292 let end_byte = start_byte + chunk_content.len();
293
294 chunks.push(Chunk {
295 id: ChunkId::new(),
296 document_id: document.id.clone(),
297 content: chunk_content,
298 chunk_type,
299 start_line: start_line as u32 + 1,
300 end_line: end_line as u32 + 1,
301 start_byte,
302 end_byte,
303 symbol_name: name,
304 parent_symbol: None,
305 });
306 }
307
308 i = end_line + 1;
309 }
310
311 chunks
312}
313
314fn extract_java_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
316 let mut chunks = Vec::new();
317 let lines: Vec<&str> = content.lines().collect();
318
319 let mut i = 0;
320 while i < lines.len() {
321 let line = lines[i].trim();
322
323 let (chunk_type, name) =
324 if line.contains("class ") && (line.starts_with("public") || line.starts_with("class"))
325 {
326 (ChunkType::Class, extract_name_after(line, "class "))
327 } else if line.contains("interface ")
328 && (line.starts_with("public") || line.starts_with("interface"))
329 {
330 (ChunkType::Interface, extract_name_after(line, "interface "))
331 } else if (line.starts_with("public ")
332 || line.starts_with("private ")
333 || line.starts_with("protected "))
334 && line.contains("(")
335 && !line.contains("class")
336 {
337 (ChunkType::Function, extract_java_method_name(line))
338 } else {
339 i += 1;
340 continue;
341 };
342
343 let start_line = i;
344 let end_line = find_block_end(&lines, i);
345
346 let chunk_content: String = lines[start_line..=end_line].join("\n");
347
348 if chunk_content.len() >= config.min_chunk_size
349 && chunk_content.len() <= config.max_chunk_size
350 {
351 let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
352 let end_byte = start_byte + chunk_content.len();
353
354 chunks.push(Chunk {
355 id: ChunkId::new(),
356 document_id: document.id.clone(),
357 content: chunk_content,
358 chunk_type,
359 start_line: start_line as u32 + 1,
360 end_line: end_line as u32 + 1,
361 start_byte,
362 end_byte,
363 symbol_name: name,
364 parent_symbol: None,
365 });
366 }
367
368 i = end_line + 1;
369 }
370
371 chunks
372}
373
374fn extract_generic_chunks(
376 document: &Document,
377 content: &str,
378 config: &ChunkingConfig,
379) -> Vec<Chunk> {
380 let lines: Vec<&str> = content.lines().collect();
381 let mut chunks = Vec::new();
382
383 let mut start = 0;
385 let mut current_chunk = String::new();
386
387 for (i, line) in lines.iter().enumerate() {
388 current_chunk.push_str(line);
389 current_chunk.push('\n');
390
391 let should_split = current_chunk.len() >= config.max_chunk_size
393 || (line.is_empty() && current_chunk.len() >= config.min_chunk_size);
394
395 if should_split {
396 let start_byte: usize = lines[..start].iter().map(|l| l.len() + 1).sum();
397
398 chunks.push(Chunk {
399 id: ChunkId::new(),
400 document_id: document.id.clone(),
401 content: current_chunk.trim().to_string(),
402 chunk_type: ChunkType::Block,
403 start_line: start as u32 + 1,
404 end_line: i as u32 + 1,
405 start_byte,
406 end_byte: start_byte + current_chunk.len(),
407 symbol_name: None,
408 parent_symbol: None,
409 });
410
411 current_chunk.clear();
412 start = i + 1;
413 }
414 }
415
416 if current_chunk.len() >= config.min_chunk_size {
418 let start_byte: usize = lines[..start].iter().map(|l| l.len() + 1).sum();
419
420 chunks.push(Chunk {
421 id: ChunkId::new(),
422 document_id: document.id.clone(),
423 content: current_chunk.trim().to_string(),
424 chunk_type: ChunkType::Block,
425 start_line: start as u32 + 1,
426 end_line: lines.len() as u32,
427 start_byte,
428 end_byte: start_byte + current_chunk.len(),
429 symbol_name: None,
430 parent_symbol: None,
431 });
432 }
433
434 chunks
435}
436
/// Returns the identifier that directly follows the first occurrence of `keyword`.
///
/// The identifier is the longest run of alphanumeric characters and underscores that
/// starts immediately after `keyword`. `None` is returned when `keyword` does not occur
/// in `line` or when no identifier character follows it.
fn extract_name_after(line: &str, keyword: &str) -> Option<String> {
    let keyword_at = line.find(keyword)?;
    let tail = &line[keyword_at + keyword.len()..];

    let identifier: String = tail
        .chars()
        .take_while(|c| c.is_alphanumeric() || *c == '_')
        .collect();

    if identifier.is_empty() {
        None
    } else {
        Some(identifier)
    }
}
452
/// Returns the first type or trait name of an `impl` header.
///
/// A generic parameter list directly after the keyword (`impl<T: Bound> ...`) is skipped,
/// and generic arguments after the name are not part of the result, so both
/// `impl Wrapper<T> {` and `impl<T> Wrapper<T> {` yield `"Wrapper"`. For
/// `impl Trait for Type` the trait name is returned.
///
/// Returns `None` when `line` does not start with the `impl` keyword, when the generic
/// parameter list is not closed on this line, or when no identifier follows.
fn extract_impl_name(line: &str) -> Option<String> {
    let after_keyword = line.strip_prefix("impl")?;
    // `impl` must be a whole keyword, not the prefix of a longer identifier.
    if after_keyword.starts_with(|c: char| c.is_alphanumeric() || c == '_') {
        return None;
    }
    let mut rest = after_keyword.trim_start();

    if rest.starts_with('<') {
        // Walk to the `>` that closes the parameter list, honouring nested brackets such
        // as `impl<T: Into<String>>`.
        let mut depth = 0usize;
        let mut previous = '<';
        let mut params_end = None;
        for (idx, c) in rest.char_indices() {
            match c {
                '<' => depth += 1,
                // The `>` of a `->` return arrow does not close a bracket.
                '>' if previous != '-' => {
                    depth -= 1;
                    if depth == 0 {
                        params_end = Some(idx + 1);
                        break;
                    }
                }
                _ => {}
            }
            previous = c;
        }
        rest = rest[params_end?..].trim_start();
    }

    let end = rest
        .find(|c: char| !c.is_alphanumeric() && c != '_')
        .unwrap_or(rest.len());
    if end > 0 {
        Some(rest[..end].to_string())
    } else {
        None
    }
}
465
466fn extract_go_func_name(line: &str) -> Option<String> {
467 let rest = line.strip_prefix("func")?.trim();
469 if rest.starts_with('(') {
470 let after_receiver = rest.find(')')? + 1;
472 let name_part = rest[after_receiver..].trim();
473 extract_name_after(name_part, "")
474 } else {
475 extract_name_after(rest, "")
476 }
477}
478
/// Returns the last whitespace-separated word before the first `(` of `line`.
///
/// For a Java method or constructor declaration this word is the method name. Returns
/// `None` when the line has no `(` or when nothing but whitespace precedes it.
fn extract_java_method_name(line: &str) -> Option<String> {
    let open_paren = line.find('(')?;
    line[..open_paren]
        .split_whitespace()
        .next_back()
        .map(String::from)
}
486
/// Returns the index of the line on which the item starting at `lines[start]` ends.
///
/// Braces are counted from `start` onwards; the item ends on the first line where at
/// least one `{` has been seen and every `{` has been matched by a `}`.
///
/// A declaration without a body (`mod foo;`, `struct Unit;`, an abstract method, an
/// expression-bodied arrow function, ...) ends on the first line that finishes with `;`
/// before any `{` has been seen. A trailing `//` comment is ignored for that check.
/// Without this rule such a declaration would extend to the end of the next braced block.
///
/// When neither condition is met the last line index is returned (`0` for an empty
/// slice).
///
/// NOTE(review): braces inside string literals and comments are counted like any other
/// brace.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut brace_count: i32 = 0;
    let mut found_open = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for c in line.chars() {
            match c {
                '{' => {
                    brace_count += 1;
                    found_open = true;
                }
                '}' => brace_count -= 1,
                _ => {}
            }
        }

        if found_open {
            if brace_count == 0 {
                return i;
            }
        } else {
            // No body has been opened yet: a statement terminator ends the declaration.
            let code = line.split("//").next().unwrap_or(line).trim_end();
            if code.ends_with(';') {
                return i;
            }
        }
    }

    lines.len().saturating_sub(1)
}
508
/// Returns the index of the last line of the Python block that starts at `lines[start]`.
///
/// `base_indent` is the width, in bytes, of the leading whitespace of the defining line.
///
/// The header (the `def`/`class` line) may span several lines: it continues until its
/// brackets are balanced or a line ends with `:`. This keeps a signature such as
///
/// ```text
/// def foo(
///     a,
/// ):
/// ```
///
/// from ending at the closing `):`, which sits at the same indentation as `def`.
///
/// After the header, the block extends over every following line indented deeper than
/// `base_indent`; blank lines are skipped. The returned index is that of the last
/// non-blank line of the block, so trailing blank lines are not part of it. For an empty
/// slice `0` is returned.
fn find_python_block_end(lines: &[&str], start: usize, base_indent: usize) -> usize {
    // Locate the last line of the (possibly multi-line) header.
    let mut bracket_depth: i32 = 0;
    let mut header_end = start;
    for (i, line) in lines.iter().enumerate().skip(start) {
        for c in line.chars() {
            match c {
                '(' | '[' | '{' => bracket_depth += 1,
                ')' | ']' | '}' => bracket_depth -= 1,
                _ => {}
            }
        }
        header_end = i;
        // The `:` test bounds the scan when a bracket inside a string literal leaves the
        // count unbalanced.
        if bracket_depth <= 0 || line.trim_end().ends_with(':') {
            break;
        }
    }

    // The body is every following non-blank line indented deeper than the definition.
    let mut block_end = header_end;
    for (i, line) in lines.iter().enumerate().skip(header_end + 1) {
        if line.trim().is_empty() {
            continue;
        }
        let indent = line.len() - line.trim_start().len();
        if indent <= base_indent {
            break;
        }
        block_end = i;
    }

    block_end.min(lines.len().saturating_sub(1))
}
521
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Builds a placeholder [`Document`] whose only meaningful field is `language`.
    fn make_document(language: Language) -> Document {
        let id = crate::types::DocumentId::new();
        let relative_path = PathBuf::from("test.rs");
        let absolute_path = PathBuf::from("/test/test.rs");
        let content_hash = "test".to_string();
        let modified_at = chrono::Utc::now();

        Document {
            id,
            relative_path,
            absolute_path,
            language,
            content_hash,
            size_bytes: 0,
            modified_at,
        }
    }

    /// The Rust extractor yields at least two chunks, one of them named `hello`.
    #[test]
    fn test_rust_chunking() {
        let content = r#"
fn hello() {
    println!("Hello");
}

pub struct Point {
    x: i32,
    y: i32,
}

impl Point {
    fn new() -> Self {
        Self { x: 0, y: 0 }
    }
}
"#;
        let doc = make_document(Language::Rust);
        let config = ChunkingConfig::default();

        let chunks = extract_rust_chunks(&doc, content, &config);

        assert!(chunks.len() >= 2);
        let has_hello = chunks
            .iter()
            .any(|chunk| chunk.symbol_name.as_deref() == Some("hello"));
        assert!(has_hello);
    }

    /// The Python extractor yields at least one chunk for a function plus a class.
    #[test]
    fn test_python_chunking() {
        let content = r#"
def hello():
    print("Hello")

class MyClass:
    def __init__(self):
        pass
"#;
        let doc = make_document(Language::Python);
        let config = ChunkingConfig::default();

        let chunks = extract_python_chunks(&doc, content, &config);

        assert!(!chunks.is_empty());
    }
}