Skip to main content

codesearch/chunker/
semantic.rs

1#![allow(dead_code)]
2
3use super::{Chunk, ChunkKind, Chunker, DEFAULT_CONTEXT_LINES};
4use crate::cache::normalize_path;
5use crate::chunker::extractor::{get_extractor, LanguageExtractor};
6use crate::chunker::parser::CodeParser;
7use crate::file::Language;
8use anyhow::Result;
9use std::path::Path;
10use tree_sitter::Node;
11
12/// Smart semantic chunker using tree-sitter and language-specific extractors
13pub struct SemanticChunker {
14    parser: CodeParser,
15    max_chunk_lines: usize,
16    max_chunk_chars: usize,
17    overlap_lines: usize,
18    context_lines: usize,
19}
20
21impl SemanticChunker {
22    pub fn new(max_chunk_lines: usize, max_chunk_chars: usize, overlap_lines: usize) -> Self {
23        Self {
24            parser: CodeParser::new(),
25            max_chunk_lines,
26            max_chunk_chars,
27            overlap_lines,
28            context_lines: DEFAULT_CONTEXT_LINES,
29        }
30    }
31
32    /// Set the number of context lines to extract before/after each chunk
33    pub fn with_context_lines(mut self, lines: usize) -> Self {
34        self.context_lines = lines;
35        self
36    }
37
38    /// Chunk a file using semantic analysis
39    pub fn chunk_semantic(
40        &mut self,
41        language: Language,
42        path: &Path,
43        content: &str,
44    ) -> Result<Vec<Chunk>> {
45        // 1. Check if we have an extractor for this language
46        let extractor = match get_extractor(language) {
47            Some(ext) => ext,
48            None => {
49                // Fall back to simple chunking for unsupported languages
50                return Ok(self.fallback_chunk(path, content));
51            }
52        };
53
54        // 2. Parse the code
55        let parsed = self.parser.parse(language, content)?;
56
57        // 3. Visit AST and extract chunks
58        let mut definition_chunks = Vec::new();
59        let mut gap_tracker = GapTracker::new(content);
60
61        let file_context = format!("File: {}", normalize_path(path));
62        self.visit_node(
63            parsed.root_node(),
64            parsed.source().as_bytes(),
65            &*extractor,
66            &[file_context],
67            &mut definition_chunks,
68            &mut gap_tracker,
69        );
70
71        // 4. Extract gap chunks (code between definitions)
72        let gap_chunks = gap_tracker.extract_gaps(path);
73
74        // 5. Combine and sort all chunks by position
75        let mut all_chunks = definition_chunks;
76        all_chunks.extend(gap_chunks);
77        all_chunks.sort_by_key(|c| c.start_line);
78
79        // 6. Populate context windows (lines before/after each chunk)
80        let source_lines: Vec<&str> = content.lines().collect();
81        self.populate_context_windows(&mut all_chunks, &source_lines);
82
83        // 7. Split oversized chunks
84        let final_chunks = all_chunks
85            .into_iter()
86            .flat_map(|c| self.split_if_needed(c))
87            .collect();
88
89        Ok(final_chunks)
90    }
91
92    /// Populate context_prev and context_next for each chunk
93    fn populate_context_windows(&self, chunks: &mut [Chunk], source_lines: &[&str]) {
94        let total_lines = source_lines.len();
95
96        for chunk in chunks.iter_mut() {
97            // Extract context_prev (N lines before start_line)
98            if chunk.start_line > 0 && self.context_lines > 0 {
99                let prev_start = chunk.start_line.saturating_sub(self.context_lines);
100                let prev_end = chunk.start_line;
101                if prev_start < prev_end && prev_end <= total_lines {
102                    let prev_lines = &source_lines[prev_start..prev_end];
103                    let prev_content = prev_lines.join("\n");
104                    if !prev_content.trim().is_empty() {
105                        chunk.context_prev = Some(prev_content);
106                    }
107                }
108            }
109
110            // Extract context_next (N lines after end_line)
111            if chunk.end_line < total_lines && self.context_lines > 0 {
112                let next_start = chunk.end_line;
113                let next_end = (chunk.end_line + self.context_lines).min(total_lines);
114                if next_start < next_end {
115                    let next_lines = &source_lines[next_start..next_end];
116                    let next_content = next_lines.join("\n");
117                    if !next_content.trim().is_empty() {
118                        chunk.context_next = Some(next_content);
119                    }
120                }
121            }
122        }
123    }
124
125    /// Recursively visit AST nodes and extract chunks
126    fn visit_node(
127        &self,
128        node: Node,
129        source: &[u8],
130        extractor: &dyn LanguageExtractor,
131        context_stack: &[String],
132        chunks: &mut Vec<Chunk>,
133        gap_tracker: &mut GapTracker,
134    ) {
135        // Check if this node is a definition
136        let is_definition = extractor.definition_types().contains(&node.kind());
137
138        if is_definition {
139            // Mark this range as covered (not a gap)
140            gap_tracker.mark_covered(node.start_position().row, node.end_position().row);
141
142            // Also mark preceding doc comments and attributes as covered
143            // (they belong to this definition, not to a gap)
144            let mut prev = node.prev_named_sibling();
145            while let Some(sibling) = prev {
146                let sib_kind = sibling.kind();
147                if sib_kind == "line_comment"
148                    || sib_kind == "block_comment"
149                    || sib_kind == "attribute_item"
150                    || sib_kind == "attribute"
151                    || sib_kind == "decorator"
152                {
153                    if let Ok(text) = sibling.utf8_text(source) {
154                        let text = text.trim();
155                        // Only mark doc comments (///, //!, /**, /*!), attributes (#[...]),
156                        // and decorators (@...) as covered — not regular comments
157                        if text.starts_with("///")
158                            || text.starts_with("//!")
159                            || text.starts_with("/**")
160                            || text.starts_with("/*!")
161                            || text.starts_with("#[")
162                            || text.starts_with("@")
163                        {
164                            gap_tracker.mark_covered(
165                                sibling.start_position().row,
166                                sibling.end_position().row,
167                            );
168                            prev = sibling.prev_named_sibling();
169                            continue;
170                        }
171                    }
172                    break;
173                }
174                break;
175            }
176
177            // Extract metadata using the language extractor
178            let kind = extractor.classify(node);
179            let name = extractor.extract_name(node, source);
180            let signature = extractor.extract_signature(node, source);
181            let docstring = extractor.extract_docstring(node, source);
182
183            // Build label for context breadcrumb
184            let label = extractor
185                .build_label(node, source)
186                .or_else(|| name.as_ref().map(|n| format!("{:?}: {}", kind, n)))
187                .unwrap_or_else(|| format!("{:?}", kind));
188
189            // Build new context stack
190            let mut new_context = context_stack.to_vec();
191            new_context.push(label);
192
193            // Extract content (without docstring if we have it separate)
194            let content = match node.utf8_text(source) {
195                Ok(text) => text.to_string(),
196                Err(_) => return, // Skip if we can't extract text
197            };
198
199            // Create chunk
200            let path_str = context_stack
201                .first()
202                .map(|s| s.strip_prefix("File: ").unwrap_or(s))
203                .unwrap_or("")
204                .to_string();
205
206            let mut chunk = Chunk::new(
207                content,
208                node.start_position().row,
209                node.end_position().row + 1, // tree-sitter uses 0-based, we use line count
210                kind,
211                path_str,
212            );
213            chunk.context = new_context.clone();
214            chunk.signature = signature;
215            chunk.docstring = docstring;
216
217            chunks.push(chunk);
218
219            // Visit children with updated context
220            let mut cursor = node.walk();
221            for child in node.named_children(&mut cursor) {
222                self.visit_node(child, source, extractor, &new_context, chunks, gap_tracker);
223            }
224        } else {
225            // Not a definition, just visit children with same context
226            let mut cursor = node.walk();
227            for child in node.named_children(&mut cursor) {
228                self.visit_node(child, source, extractor, context_stack, chunks, gap_tracker);
229            }
230        }
231    }
232
233    /// Fallback chunking for unsupported languages
234    fn fallback_chunk(&self, path: &Path, content: &str) -> Vec<Chunk> {
235        let lines: Vec<&str> = content.lines().collect();
236        let mut chunks = Vec::new();
237        let stride = (self.max_chunk_lines - self.overlap_lines).max(1);
238
239        let path_str = normalize_path(path);
240        let context = vec![format!("File: {}", path_str)];
241
242        let mut i = 0;
243        while i < lines.len() {
244            let end = (i + self.max_chunk_lines).min(lines.len());
245            let chunk_lines = &lines[i..end];
246
247            if !chunk_lines.is_empty() {
248                let content = chunk_lines.join("\n");
249                let mut chunk = Chunk::new(content, i, end, ChunkKind::Block, path_str.clone());
250                chunk.context = context.clone();
251                chunks.push(chunk);
252            }
253
254            i += stride;
255        }
256
257        chunks
258    }
259
260    /// Split a chunk if it exceeds size limits
261    fn split_if_needed(&self, chunk: Chunk) -> Vec<Chunk> {
262        let line_count = chunk.line_count();
263        let char_count = chunk.size_bytes();
264
265        // Check if splitting is needed
266        if line_count <= self.max_chunk_lines && char_count <= self.max_chunk_chars {
267            return vec![chunk];
268        }
269
270        // Need to split
271        let lines: Vec<&str> = chunk.content.lines().collect();
272        let mut split_chunks = Vec::new();
273        let stride = (self.max_chunk_lines - self.overlap_lines).max(1);
274
275        let mut i = 0;
276        let mut split_index = 0;
277
278        while i < lines.len() {
279            let end = (i + self.max_chunk_lines).min(lines.len());
280            let chunk_lines = &lines[i..end];
281
282            if !chunk_lines.is_empty() {
283                let content = chunk_lines.join("\n");
284                let mut split_chunk = Chunk::new(
285                    content,
286                    chunk.start_line + i,
287                    chunk.start_line + end,
288                    chunk.kind,
289                    chunk.path.clone(),
290                );
291
292                // Preserve metadata
293                split_chunk.context = chunk.context.clone();
294                split_chunk.signature = chunk.signature.clone();
295                split_chunk.docstring = if split_index == 0 {
296                    chunk.docstring.clone() // Only first chunk gets docstring
297                } else {
298                    None
299                };
300                split_chunk.is_complete = false;
301                split_chunk.split_index = Some(split_index);
302
303                split_chunks.push(split_chunk);
304                split_index += 1;
305            }
306
307            i += stride;
308        }
309
310        // Add header to split chunks to indicate they're partial
311        let total_parts = split_chunks.len();
312        for chunk in &mut split_chunks {
313            if let Some(idx) = chunk.split_index {
314                let header = format!(
315                    "// [Part {}/{}] {}\n",
316                    idx + 1,
317                    total_parts,
318                    chunk
319                        .signature
320                        .as_ref()
321                        .unwrap_or(&"(continued)".to_string())
322                );
323                chunk.content = header + &chunk.content;
324            }
325        }
326
327        split_chunks
328    }
329}
330
331impl Chunker for SemanticChunker {
332    fn chunk_file(&self, path: &Path, content: &str) -> Result<Vec<Chunk>> {
333        // Detect language from path
334        let language = Language::from_path(path);
335
336        // Can't use &mut self in trait method, so we need a workaround
337        // Create a temporary parser for this call
338        let mut temp_chunker = SemanticChunker::new(
339            self.max_chunk_lines,
340            self.max_chunk_chars,
341            self.overlap_lines,
342        );
343
344        temp_chunker.chunk_semantic(language, path, content)
345    }
346}
347
/// Line-coverage bookkeeping used to find the code left between definitions.
struct GapTracker<'a> {
    /// Source text the tracker was built from. It is stored but not read by the
    /// methods in this file, hence the `dead_code` allowance.
    #[allow(dead_code)]
    content: &'a str,
    /// `content` split with `str::lines`, indexed by 0-based line number.
    lines: Vec<&'a str>,
    /// `covered[i]` is `true` once line `i` has been claimed by a definition.
    covered: Vec<bool>,
}
355
356impl<'a> GapTracker<'a> {
357    fn new(content: &'a str) -> Self {
358        let lines: Vec<&str> = content.lines().collect();
359        let covered = vec![false; lines.len()];
360
361        Self {
362            content,
363            lines,
364            covered,
365        }
366    }
367
368    /// Mark a range of lines as covered by a definition
369    fn mark_covered(&mut self, start_line: usize, end_line: usize) {
370        for i in start_line..=end_line.min(self.covered.len().saturating_sub(1)) {
371            if i < self.covered.len() {
372                self.covered[i] = true;
373            }
374        }
375    }
376
377    /// Extract gap chunks (uncovered regions)
378    fn extract_gaps(&self, path: &Path) -> Vec<Chunk> {
379        let mut gaps = Vec::new();
380        let path_str = normalize_path(path);
381        let context = vec![format!("File: {}", path_str)];
382
383        let mut gap_start: Option<usize> = None;
384
385        for (i, &is_covered) in self.covered.iter().enumerate() {
386            if !is_covered {
387                // Start or continue a gap
388                if gap_start.is_none() {
389                    gap_start = Some(i);
390                }
391            } else {
392                // End of gap
393                if let Some(start) = gap_start {
394                    // Extract gap content
395                    let gap_lines = &self.lines[start..i];
396                    let gap_content = gap_lines.join("\n");
397
398                    // Only create chunk if gap is not empty/whitespace
399                    if !gap_content.trim().is_empty() {
400                        let kind = Self::classify_gap(&gap_content);
401                        let line_count = i - start;
402                        let mut chunk = Chunk::new(gap_content, start, i, kind, path_str.clone());
403                        chunk.context = context.clone();
404                        chunk.signature = Some(Self::gap_signature(kind, line_count));
405                        gaps.push(chunk);
406                    }
407
408                    gap_start = None;
409                }
410            }
411        }
412
413        // Handle final gap (if file ends with gap)
414        if let Some(start) = gap_start {
415            let gap_lines = &self.lines[start..];
416            let gap_content = gap_lines.join("\n");
417
418            if !gap_content.trim().is_empty() {
419                let kind = Self::classify_gap(&gap_content);
420                let line_count = self.lines.len() - start;
421                let mut chunk =
422                    Chunk::new(gap_content, start, self.lines.len(), kind, path_str.clone());
423                chunk.context = context.clone();
424                chunk.signature = Some(Self::gap_signature(kind, line_count));
425                gaps.push(chunk);
426            }
427        }
428
429        gaps
430    }
431
432    /// Generate a descriptive signature for a gap chunk
433    fn gap_signature(kind: ChunkKind, line_count: usize) -> String {
434        match kind {
435            ChunkKind::Imports => format!("imports ({} lines)", line_count),
436            ChunkKind::ModuleDocs => format!("module docs ({} lines)", line_count),
437            ChunkKind::Comment => format!("comment block ({} lines)", line_count),
438            _ => format!("block ({} lines)", line_count),
439        }
440    }
441
442    /// Classify what kind of gap this is
443    fn classify_gap(content: &str) -> ChunkKind {
444        let trimmed = content.trim();
445        let total_lines = trimmed.lines().count();
446
447        // Check if it's mostly imports
448        let import_count = trimmed
449            .lines()
450            .filter(|line| {
451                let line = line.trim();
452                line.starts_with("import ")
453                    || line.starts_with("from ")
454                    || line.starts_with("use ")
455                    || line.starts_with("#include")
456            })
457            .count();
458
459        if total_lines > 0 && import_count > total_lines / 2 {
460            return ChunkKind::Imports;
461        }
462
463        // Check if it's module-level docs
464        if trimmed.starts_with("//!") || trimmed.starts_with("/*!") {
465            return ChunkKind::ModuleDocs;
466        }
467
468        // Check if it's mostly comments (single-line or block)
469        let comment_count = trimmed
470            .lines()
471            .filter(|line| {
472                let line = line.trim();
473                line.starts_with("//")
474                    || line.starts_with("/*")
475                    || line.starts_with("*")
476                    || line.starts_with("#")  // Python/Shell comments
477                    || line.is_empty() // Blank lines within comment blocks
478            })
479            .count();
480
481        if total_lines > 0 && comment_count > total_lines / 2 {
482            return ChunkKind::Comment;
483        }
484
485        ChunkKind::Block
486    }
487}
488
489#[cfg(test)]
490mod tests {
491    use super::*;
492
493    #[test]
494    fn test_semantic_chunker_creation() {
495        let chunker = SemanticChunker::new(100, 2000, 10);
496        assert_eq!(chunker.max_chunk_lines, 100);
497        assert_eq!(chunker.max_chunk_chars, 2000);
498        assert_eq!(chunker.overlap_lines, 10);
499    }
500
501    #[test]
502    fn test_chunk_rust_code() {
503        let mut chunker = SemanticChunker::new(100, 2000, 10);
504
505        let rust_code = r#"
506/// This is a doc comment
507fn hello_world() {
508    println!("Hello, world!");
509}
510
511fn add(a: i32, b: i32) -> i32 {
512    a + b
513}
514
515struct Point {
516    x: f64,
517    y: f64,
518}
519"#;
520
521        let path = Path::new("test.rs");
522        let chunks = chunker
523            .chunk_semantic(Language::Rust, path, rust_code)
524            .unwrap();
525
526        // Should have at least 3 definition chunks (2 functions + 1 struct)
527        assert!(
528            chunks.len() >= 3,
529            "Expected at least 3 chunks, got {}",
530            chunks.len()
531        );
532
533        // Check that we have function chunks
534        let function_chunks: Vec<_> = chunks
535            .iter()
536            .filter(|c| c.kind == ChunkKind::Function)
537            .collect();
538        assert!(
539            function_chunks.len() >= 2,
540            "Expected at least 2 function chunks"
541        );
542
543        // Check that first function has signature
544        let hello_chunk = function_chunks
545            .iter()
546            .find(|c| c.content.contains("hello_world"));
547        assert!(hello_chunk.is_some(), "Should find hello_world function");
548
549        if let Some(chunk) = hello_chunk {
550            assert!(chunk.signature.is_some(), "Should have signature");
551            assert!(chunk.signature.as_ref().unwrap().contains("fn hello_world"));
552        }
553    }
554
555    #[test]
556    fn test_chunk_python_code() {
557        let mut chunker = SemanticChunker::new(100, 2000, 10);
558
559        let python_code = r#"
560def hello():
561    """Say hello"""
562    print("Hello!")
563
564class Calculator:
565    """A simple calculator"""
566
567    def add(self, a, b):
568        """Add two numbers"""
569        return a + b
570"#;
571
572        let path = Path::new("test.py");
573        let chunks = chunker
574            .chunk_semantic(Language::Python, path, python_code)
575            .unwrap();
576
577        // Should have at least 2 chunks (function + class)
578        assert!(chunks.len() >= 2, "Expected at least 2 chunks");
579
580        // Check for docstrings
581        let chunks_with_docs: Vec<_> = chunks.iter().filter(|c| c.docstring.is_some()).collect();
582        assert!(
583            !chunks_with_docs.is_empty(),
584            "Should have chunks with docstrings"
585        );
586    }
587
588    #[test]
589    fn test_chunk_unsupported_language() {
590        let mut chunker = SemanticChunker::new(100, 2000, 10);
591
592        let content =
593            "Some random text file\nWith multiple lines\nThat should be chunked\nAs fallback";
594        let path = Path::new("test.txt");
595
596        let chunks = chunker
597            .chunk_semantic(Language::Unknown, path, content)
598            .unwrap();
599
600        // Should use fallback chunking
601        assert!(!chunks.is_empty());
602        assert!(chunks.iter().all(|c| c.kind == ChunkKind::Block));
603    }
604
605    #[test]
606    fn test_gap_tracking() {
607        let content = "line 0\nline 1\nline 2\nline 3\nline 4";
608        let mut tracker = GapTracker::new(content);
609
610        // Mark lines 1-2 as covered
611        tracker.mark_covered(1, 2);
612
613        // Should have gaps: [0], [3-4]
614        let path = Path::new("test.txt");
615        let gaps = tracker.extract_gaps(path);
616
617        assert_eq!(gaps.len(), 2, "Should have 2 gaps");
618        assert_eq!(gaps[0].start_line, 0);
619        assert_eq!(gaps[0].end_line, 1);
620        assert_eq!(gaps[1].start_line, 3);
621        assert_eq!(gaps[1].end_line, 5);
622    }
623
624    #[test]
625    fn test_chunk_splitting() {
626        let chunker = SemanticChunker::new(5, 100, 1); // Very small limit
627
628        let large_content = (0..20)
629            .map(|i| format!("line {}", i))
630            .collect::<Vec<_>>()
631            .join("\n");
632        let chunk = Chunk::new(
633            large_content,
634            0,
635            20,
636            ChunkKind::Function,
637            "test.rs".to_string(),
638        );
639
640        let splits = chunker.split_if_needed(chunk);
641
642        // Should be split into multiple chunks
643        assert!(splits.len() > 1, "Should split large chunk");
644
645        // All splits should be marked as incomplete
646        for split in &splits {
647            assert!(
648                !split.is_complete,
649                "Split chunks should be marked incomplete"
650            );
651            assert!(
652                split.split_index.is_some(),
653                "Split chunks should have index"
654            );
655        }
656    }
657
658    #[test]
659    fn test_context_breadcrumbs() {
660        let mut chunker = SemanticChunker::new(100, 2000, 10);
661
662        let rust_code = r#"
663impl MyStruct {
664    fn method(&self) {
665        println!("method");
666    }
667}
668"#;
669
670        let path = Path::new("test.rs");
671        let chunks = chunker
672            .chunk_semantic(Language::Rust, path, rust_code)
673            .unwrap();
674
675        // Find method chunk
676        let method_chunk = chunks.iter().find(|c| c.kind == ChunkKind::Method);
677
678        if let Some(chunk) = method_chunk {
679            // Should have context: File > Impl > Method
680            assert!(chunk.context.len() >= 2, "Should have nested context");
681            assert!(chunk.context[0].contains("File:"));
682        }
683    }
684}