1#![allow(dead_code)]
2
3use super::{Chunk, ChunkKind, Chunker, DEFAULT_CONTEXT_LINES};
4use crate::cache::normalize_path;
5use crate::chunker::extractor::{get_extractor, LanguageExtractor};
6use crate::chunker::parser::CodeParser;
7use crate::file::Language;
8use anyhow::Result;
9use std::path::Path;
10use tree_sitter::Node;
11
12pub struct SemanticChunker {
14 parser: CodeParser,
15 max_chunk_lines: usize,
16 max_chunk_chars: usize,
17 overlap_lines: usize,
18 context_lines: usize,
19}
20
21impl SemanticChunker {
22 pub fn new(max_chunk_lines: usize, max_chunk_chars: usize, overlap_lines: usize) -> Self {
23 Self {
24 parser: CodeParser::new(),
25 max_chunk_lines,
26 max_chunk_chars,
27 overlap_lines,
28 context_lines: DEFAULT_CONTEXT_LINES,
29 }
30 }
31
32 pub fn with_context_lines(mut self, lines: usize) -> Self {
34 self.context_lines = lines;
35 self
36 }
37
38 pub fn chunk_semantic(
40 &mut self,
41 language: Language,
42 path: &Path,
43 content: &str,
44 ) -> Result<Vec<Chunk>> {
45 let extractor = match get_extractor(language) {
47 Some(ext) => ext,
48 None => {
49 return Ok(self.fallback_chunk(path, content));
51 }
52 };
53
54 let parsed = self.parser.parse(language, content)?;
56
57 let mut definition_chunks = Vec::new();
59 let mut gap_tracker = GapTracker::new(content);
60
61 let file_context = format!("File: {}", normalize_path(path));
62 self.visit_node(
63 parsed.root_node(),
64 parsed.source().as_bytes(),
65 &*extractor,
66 &[file_context],
67 &mut definition_chunks,
68 &mut gap_tracker,
69 );
70
71 let gap_chunks = gap_tracker.extract_gaps(path);
73
74 let mut all_chunks = definition_chunks;
76 all_chunks.extend(gap_chunks);
77 all_chunks.sort_by_key(|c| c.start_line);
78
79 let source_lines: Vec<&str> = content.lines().collect();
81 self.populate_context_windows(&mut all_chunks, &source_lines);
82
83 let final_chunks = all_chunks
85 .into_iter()
86 .flat_map(|c| self.split_if_needed(c))
87 .collect();
88
89 Ok(final_chunks)
90 }
91
92 fn populate_context_windows(&self, chunks: &mut [Chunk], source_lines: &[&str]) {
94 let total_lines = source_lines.len();
95
96 for chunk in chunks.iter_mut() {
97 if chunk.start_line > 0 && self.context_lines > 0 {
99 let prev_start = chunk.start_line.saturating_sub(self.context_lines);
100 let prev_end = chunk.start_line;
101 if prev_start < prev_end && prev_end <= total_lines {
102 let prev_lines = &source_lines[prev_start..prev_end];
103 let prev_content = prev_lines.join("\n");
104 if !prev_content.trim().is_empty() {
105 chunk.context_prev = Some(prev_content);
106 }
107 }
108 }
109
110 if chunk.end_line < total_lines && self.context_lines > 0 {
112 let next_start = chunk.end_line;
113 let next_end = (chunk.end_line + self.context_lines).min(total_lines);
114 if next_start < next_end {
115 let next_lines = &source_lines[next_start..next_end];
116 let next_content = next_lines.join("\n");
117 if !next_content.trim().is_empty() {
118 chunk.context_next = Some(next_content);
119 }
120 }
121 }
122 }
123 }
124
125 fn visit_node(
127 &self,
128 node: Node,
129 source: &[u8],
130 extractor: &dyn LanguageExtractor,
131 context_stack: &[String],
132 chunks: &mut Vec<Chunk>,
133 gap_tracker: &mut GapTracker,
134 ) {
135 let is_definition = extractor.definition_types().contains(&node.kind());
137
138 if is_definition {
139 gap_tracker.mark_covered(node.start_position().row, node.end_position().row);
141
142 let mut prev = node.prev_named_sibling();
145 while let Some(sibling) = prev {
146 let sib_kind = sibling.kind();
147 if sib_kind == "line_comment"
148 || sib_kind == "block_comment"
149 || sib_kind == "attribute_item"
150 || sib_kind == "attribute"
151 || sib_kind == "decorator"
152 {
153 if let Ok(text) = sibling.utf8_text(source) {
154 let text = text.trim();
155 if text.starts_with("///")
158 || text.starts_with("//!")
159 || text.starts_with("/**")
160 || text.starts_with("/*!")
161 || text.starts_with("#[")
162 || text.starts_with("@")
163 {
164 gap_tracker.mark_covered(
165 sibling.start_position().row,
166 sibling.end_position().row,
167 );
168 prev = sibling.prev_named_sibling();
169 continue;
170 }
171 }
172 break;
173 }
174 break;
175 }
176
177 let kind = extractor.classify(node);
179 let name = extractor.extract_name(node, source);
180 let signature = extractor.extract_signature(node, source);
181 let docstring = extractor.extract_docstring(node, source);
182
183 let label = extractor
185 .build_label(node, source)
186 .or_else(|| name.as_ref().map(|n| format!("{:?}: {}", kind, n)))
187 .unwrap_or_else(|| format!("{:?}", kind));
188
189 let mut new_context = context_stack.to_vec();
191 new_context.push(label);
192
193 let content = match node.utf8_text(source) {
195 Ok(text) => text.to_string(),
196 Err(_) => return, };
198
199 let path_str = context_stack
201 .first()
202 .map(|s| s.strip_prefix("File: ").unwrap_or(s))
203 .unwrap_or("")
204 .to_string();
205
206 let mut chunk = Chunk::new(
207 content,
208 node.start_position().row,
209 node.end_position().row + 1, kind,
211 path_str,
212 );
213 chunk.context = new_context.clone();
214 chunk.signature = signature;
215 chunk.docstring = docstring;
216
217 chunks.push(chunk);
218
219 let mut cursor = node.walk();
221 for child in node.named_children(&mut cursor) {
222 self.visit_node(child, source, extractor, &new_context, chunks, gap_tracker);
223 }
224 } else {
225 let mut cursor = node.walk();
227 for child in node.named_children(&mut cursor) {
228 self.visit_node(child, source, extractor, context_stack, chunks, gap_tracker);
229 }
230 }
231 }
232
233 fn fallback_chunk(&self, path: &Path, content: &str) -> Vec<Chunk> {
235 let lines: Vec<&str> = content.lines().collect();
236 let mut chunks = Vec::new();
237 let stride = (self.max_chunk_lines - self.overlap_lines).max(1);
238
239 let path_str = normalize_path(path);
240 let context = vec![format!("File: {}", path_str)];
241
242 let mut i = 0;
243 while i < lines.len() {
244 let end = (i + self.max_chunk_lines).min(lines.len());
245 let chunk_lines = &lines[i..end];
246
247 if !chunk_lines.is_empty() {
248 let content = chunk_lines.join("\n");
249 let mut chunk = Chunk::new(content, i, end, ChunkKind::Block, path_str.clone());
250 chunk.context = context.clone();
251 chunks.push(chunk);
252 }
253
254 i += stride;
255 }
256
257 chunks
258 }
259
260 fn split_if_needed(&self, chunk: Chunk) -> Vec<Chunk> {
262 let line_count = chunk.line_count();
263 let char_count = chunk.size_bytes();
264
265 if line_count <= self.max_chunk_lines && char_count <= self.max_chunk_chars {
267 return vec![chunk];
268 }
269
270 let lines: Vec<&str> = chunk.content.lines().collect();
272 let mut split_chunks = Vec::new();
273 let stride = (self.max_chunk_lines - self.overlap_lines).max(1);
274
275 let mut i = 0;
276 let mut split_index = 0;
277
278 while i < lines.len() {
279 let end = (i + self.max_chunk_lines).min(lines.len());
280 let chunk_lines = &lines[i..end];
281
282 if !chunk_lines.is_empty() {
283 let content = chunk_lines.join("\n");
284 let mut split_chunk = Chunk::new(
285 content,
286 chunk.start_line + i,
287 chunk.start_line + end,
288 chunk.kind,
289 chunk.path.clone(),
290 );
291
292 split_chunk.context = chunk.context.clone();
294 split_chunk.signature = chunk.signature.clone();
295 split_chunk.docstring = if split_index == 0 {
296 chunk.docstring.clone() } else {
298 None
299 };
300 split_chunk.is_complete = false;
301 split_chunk.split_index = Some(split_index);
302
303 split_chunks.push(split_chunk);
304 split_index += 1;
305 }
306
307 i += stride;
308 }
309
310 let total_parts = split_chunks.len();
312 for chunk in &mut split_chunks {
313 if let Some(idx) = chunk.split_index {
314 let header = format!(
315 "// [Part {}/{}] {}\n",
316 idx + 1,
317 total_parts,
318 chunk
319 .signature
320 .as_ref()
321 .unwrap_or(&"(continued)".to_string())
322 );
323 chunk.content = header + &chunk.content;
324 }
325 }
326
327 split_chunks
328 }
329}
330
331impl Chunker for SemanticChunker {
332 fn chunk_file(&self, path: &Path, content: &str) -> Result<Vec<Chunk>> {
333 let language = Language::from_path(path);
335
336 let mut temp_chunker = SemanticChunker::new(
339 self.max_chunk_lines,
340 self.max_chunk_chars,
341 self.overlap_lines,
342 );
343
344 temp_chunker.chunk_semantic(language, path, content)
345 }
346}
347
/// Records which source lines are already claimed by a definition so that the
/// remaining runs of lines ("gaps") can be emitted as chunks of their own.
struct GapTracker<'a> {
    /// Source text the tracker was built from.
    #[allow(dead_code)]
    content: &'a str,
    /// `content` split into lines.
    lines: Vec<&'a str>,
    /// One flag per entry of `lines`; `true` once that line has been claimed.
    covered: Vec<bool>,
}
355
356impl<'a> GapTracker<'a> {
357 fn new(content: &'a str) -> Self {
358 let lines: Vec<&str> = content.lines().collect();
359 let covered = vec![false; lines.len()];
360
361 Self {
362 content,
363 lines,
364 covered,
365 }
366 }
367
368 fn mark_covered(&mut self, start_line: usize, end_line: usize) {
370 for i in start_line..=end_line.min(self.covered.len().saturating_sub(1)) {
371 if i < self.covered.len() {
372 self.covered[i] = true;
373 }
374 }
375 }
376
377 fn extract_gaps(&self, path: &Path) -> Vec<Chunk> {
379 let mut gaps = Vec::new();
380 let path_str = normalize_path(path);
381 let context = vec![format!("File: {}", path_str)];
382
383 let mut gap_start: Option<usize> = None;
384
385 for (i, &is_covered) in self.covered.iter().enumerate() {
386 if !is_covered {
387 if gap_start.is_none() {
389 gap_start = Some(i);
390 }
391 } else {
392 if let Some(start) = gap_start {
394 let gap_lines = &self.lines[start..i];
396 let gap_content = gap_lines.join("\n");
397
398 if !gap_content.trim().is_empty() {
400 let kind = Self::classify_gap(&gap_content);
401 let line_count = i - start;
402 let mut chunk = Chunk::new(gap_content, start, i, kind, path_str.clone());
403 chunk.context = context.clone();
404 chunk.signature = Some(Self::gap_signature(kind, line_count));
405 gaps.push(chunk);
406 }
407
408 gap_start = None;
409 }
410 }
411 }
412
413 if let Some(start) = gap_start {
415 let gap_lines = &self.lines[start..];
416 let gap_content = gap_lines.join("\n");
417
418 if !gap_content.trim().is_empty() {
419 let kind = Self::classify_gap(&gap_content);
420 let line_count = self.lines.len() - start;
421 let mut chunk =
422 Chunk::new(gap_content, start, self.lines.len(), kind, path_str.clone());
423 chunk.context = context.clone();
424 chunk.signature = Some(Self::gap_signature(kind, line_count));
425 gaps.push(chunk);
426 }
427 }
428
429 gaps
430 }
431
432 fn gap_signature(kind: ChunkKind, line_count: usize) -> String {
434 match kind {
435 ChunkKind::Imports => format!("imports ({} lines)", line_count),
436 ChunkKind::ModuleDocs => format!("module docs ({} lines)", line_count),
437 ChunkKind::Comment => format!("comment block ({} lines)", line_count),
438 _ => format!("block ({} lines)", line_count),
439 }
440 }
441
442 fn classify_gap(content: &str) -> ChunkKind {
444 let trimmed = content.trim();
445 let total_lines = trimmed.lines().count();
446
447 let import_count = trimmed
449 .lines()
450 .filter(|line| {
451 let line = line.trim();
452 line.starts_with("import ")
453 || line.starts_with("from ")
454 || line.starts_with("use ")
455 || line.starts_with("#include")
456 })
457 .count();
458
459 if total_lines > 0 && import_count > total_lines / 2 {
460 return ChunkKind::Imports;
461 }
462
463 if trimmed.starts_with("//!") || trimmed.starts_with("/*!") {
465 return ChunkKind::ModuleDocs;
466 }
467
468 let comment_count = trimmed
470 .lines()
471 .filter(|line| {
472 let line = line.trim();
473 line.starts_with("//")
474 || line.starts_with("/*")
475 || line.starts_with("*")
476 || line.starts_with("#") || line.is_empty() })
479 .count();
480
481 if total_lines > 0 && comment_count > total_lines / 2 {
482 return ChunkKind::Comment;
483 }
484
485 ChunkKind::Block
486 }
487}
488
489#[cfg(test)]
490mod tests {
491 use super::*;
492
493 #[test]
494 fn test_semantic_chunker_creation() {
495 let chunker = SemanticChunker::new(100, 2000, 10);
496 assert_eq!(chunker.max_chunk_lines, 100);
497 assert_eq!(chunker.max_chunk_chars, 2000);
498 assert_eq!(chunker.overlap_lines, 10);
499 }
500
501 #[test]
502 fn test_chunk_rust_code() {
503 let mut chunker = SemanticChunker::new(100, 2000, 10);
504
505 let rust_code = r#"
506/// This is a doc comment
507fn hello_world() {
508 println!("Hello, world!");
509}
510
511fn add(a: i32, b: i32) -> i32 {
512 a + b
513}
514
515struct Point {
516 x: f64,
517 y: f64,
518}
519"#;
520
521 let path = Path::new("test.rs");
522 let chunks = chunker
523 .chunk_semantic(Language::Rust, path, rust_code)
524 .unwrap();
525
526 assert!(
528 chunks.len() >= 3,
529 "Expected at least 3 chunks, got {}",
530 chunks.len()
531 );
532
533 let function_chunks: Vec<_> = chunks
535 .iter()
536 .filter(|c| c.kind == ChunkKind::Function)
537 .collect();
538 assert!(
539 function_chunks.len() >= 2,
540 "Expected at least 2 function chunks"
541 );
542
543 let hello_chunk = function_chunks
545 .iter()
546 .find(|c| c.content.contains("hello_world"));
547 assert!(hello_chunk.is_some(), "Should find hello_world function");
548
549 if let Some(chunk) = hello_chunk {
550 assert!(chunk.signature.is_some(), "Should have signature");
551 assert!(chunk.signature.as_ref().unwrap().contains("fn hello_world"));
552 }
553 }
554
555 #[test]
556 fn test_chunk_python_code() {
557 let mut chunker = SemanticChunker::new(100, 2000, 10);
558
559 let python_code = r#"
560def hello():
561 """Say hello"""
562 print("Hello!")
563
564class Calculator:
565 """A simple calculator"""
566
567 def add(self, a, b):
568 """Add two numbers"""
569 return a + b
570"#;
571
572 let path = Path::new("test.py");
573 let chunks = chunker
574 .chunk_semantic(Language::Python, path, python_code)
575 .unwrap();
576
577 assert!(chunks.len() >= 2, "Expected at least 2 chunks");
579
580 let chunks_with_docs: Vec<_> = chunks.iter().filter(|c| c.docstring.is_some()).collect();
582 assert!(
583 !chunks_with_docs.is_empty(),
584 "Should have chunks with docstrings"
585 );
586 }
587
588 #[test]
589 fn test_chunk_unsupported_language() {
590 let mut chunker = SemanticChunker::new(100, 2000, 10);
591
592 let content =
593 "Some random text file\nWith multiple lines\nThat should be chunked\nAs fallback";
594 let path = Path::new("test.txt");
595
596 let chunks = chunker
597 .chunk_semantic(Language::Unknown, path, content)
598 .unwrap();
599
600 assert!(!chunks.is_empty());
602 assert!(chunks.iter().all(|c| c.kind == ChunkKind::Block));
603 }
604
605 #[test]
606 fn test_gap_tracking() {
607 let content = "line 0\nline 1\nline 2\nline 3\nline 4";
608 let mut tracker = GapTracker::new(content);
609
610 tracker.mark_covered(1, 2);
612
613 let path = Path::new("test.txt");
615 let gaps = tracker.extract_gaps(path);
616
617 assert_eq!(gaps.len(), 2, "Should have 2 gaps");
618 assert_eq!(gaps[0].start_line, 0);
619 assert_eq!(gaps[0].end_line, 1);
620 assert_eq!(gaps[1].start_line, 3);
621 assert_eq!(gaps[1].end_line, 5);
622 }
623
624 #[test]
625 fn test_chunk_splitting() {
626 let chunker = SemanticChunker::new(5, 100, 1); let large_content = (0..20)
629 .map(|i| format!("line {}", i))
630 .collect::<Vec<_>>()
631 .join("\n");
632 let chunk = Chunk::new(
633 large_content,
634 0,
635 20,
636 ChunkKind::Function,
637 "test.rs".to_string(),
638 );
639
640 let splits = chunker.split_if_needed(chunk);
641
642 assert!(splits.len() > 1, "Should split large chunk");
644
645 for split in &splits {
647 assert!(
648 !split.is_complete,
649 "Split chunks should be marked incomplete"
650 );
651 assert!(
652 split.split_index.is_some(),
653 "Split chunks should have index"
654 );
655 }
656 }
657
658 #[test]
659 fn test_context_breadcrumbs() {
660 let mut chunker = SemanticChunker::new(100, 2000, 10);
661
662 let rust_code = r#"
663impl MyStruct {
664 fn method(&self) {
665 println!("method");
666 }
667}
668"#;
669
670 let path = Path::new("test.rs");
671 let chunks = chunker
672 .chunk_semantic(Language::Rust, path, rust_code)
673 .unwrap();
674
675 let method_chunk = chunks.iter().find(|c| c.kind == ChunkKind::Method);
677
678 if let Some(chunk) = method_chunk {
679 assert!(chunk.context.len() >= 2, "Should have nested context");
681 assert!(chunk.context[0].contains("File:"));
682 }
683 }
684}