// codesearch/chunker/mod.rs

1#![allow(dead_code)]
2
3use anyhow::Result;
4use sha2::{Digest, Sha256};
5use std::path::Path;
6
7mod dedup;
8mod extractor;
9mod fallback;
10mod grammar;
11mod parser;
12mod semantic;
13mod tree_sitter;
14
15pub use semantic::SemanticChunker;
16
/// How many context lines are kept before and after a chunk by default.
pub const DEFAULT_CONTEXT_LINES: usize = 3;
19
20/// Represents a chunk of code with metadata
21#[derive(Debug, Clone)]
22pub struct Chunk {
23    /// The actual content of the chunk
24    pub content: String,
25
26    /// Starting line number (0-indexed)
27    pub start_line: usize,
28
29    /// Ending line number (0-indexed)
30    pub end_line: usize,
31
32    /// Type of chunk
33    pub kind: ChunkKind,
34
35    /// Context breadcrumbs (e.g., ["File: main.rs", "Class: Server", "Function: handle_request"])
36    pub context: Vec<String>,
37
38    /// File path this chunk belongs to
39    pub path: String,
40
41    /// Function/method signature (if applicable)
42    /// Example: "fn sort<T: Ord>(items: Vec<T>) -> Vec<T>"
43    pub signature: Option<String>,
44
45    /// Extracted docstring/documentation comment
46    pub docstring: Option<String>,
47
48    /// Whether this chunk is complete (not split)
49    pub is_complete: bool,
50
51    /// If this chunk was split, which part is it? (0, 1, 2...)
52    pub split_index: Option<usize>,
53
54    /// Content hash for deduplication
55    pub hash: String,
56
57    /// Lines of code immediately before this chunk (for context)
58    pub context_prev: Option<String>,
59
60    /// Lines of code immediately after this chunk (for context)
61    pub context_next: Option<String>,
62}
63
64impl Chunk {
65    /// Create a new chunk with basic information
66    pub fn new(
67        content: String,
68        start_line: usize,
69        end_line: usize,
70        kind: ChunkKind,
71        path: String,
72    ) -> Self {
73        let hash = Self::compute_hash(&content);
74
75        Self {
76            content,
77            start_line,
78            end_line,
79            kind,
80            context: Vec::new(),
81            path,
82            signature: None,
83            docstring: None,
84            is_complete: true,
85            split_index: None,
86            hash,
87            context_prev: None,
88            context_next: None,
89        }
90    }
91
92    /// Compute SHA-256 hash of content for deduplication
93    pub fn compute_hash(content: &str) -> String {
94        let mut hasher = Sha256::new();
95        hasher.update(content.as_bytes());
96        format!("{:x}", hasher.finalize())
97    }
98
99    /// TEST METHOD: Estimate memory usage of this chunk in bytes
100    pub fn estimate_memory_usage(&self) -> usize {
101        let content_size = self.content.len();
102        let context_size = self.context.iter().map(|s| s.len()).sum::<usize>();
103        let signature_size = self.signature.as_ref().map_or(0, |s| s.len());
104        let docstring_size = self.docstring.as_ref().map_or(0, |s| s.len());
105        let context_prev_size = self.context_prev.as_ref().map_or(0, |s| s.len());
106        let context_next_size = self.context_next.as_ref().map_or(0, |s| s.len());
107
108        content_size
109            + context_size
110            + signature_size
111            + docstring_size
112            + context_prev_size
113            + context_next_size
114    }
115
116    /// TEST METHOD: Check if this chunk contains a specific keyword
117    pub fn contains_keyword(&self, keyword: &str) -> bool {
118        self.content.contains(keyword)
119            || self.signature.as_ref().is_some_and(|s| s.contains(keyword))
120            || self.docstring.as_ref().is_some_and(|s| s.contains(keyword))
121    }
122
123    /// Check if this chunk is likely a duplicate based on hash
124    pub fn is_duplicate_of(&self, other: &Chunk) -> bool {
125        self.hash == other.hash
126    }
127
128    /// Get the number of lines in this chunk
129    pub fn line_count(&self) -> usize {
130        self.end_line.saturating_sub(self.start_line)
131    }
132
133    /// Get the size of this chunk in bytes
134    pub fn size_bytes(&self) -> usize {
135        self.content.len()
136    }
137}
138
/// The sort of construct a [`Chunk`] holds.
///
/// Derives `Hash` in addition to the comparison traits so a kind can be used
/// directly as a `HashMap`/`HashSet` key (e.g. for per-kind grouping).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ChunkKind {
    /// Standalone function.
    Function,
    /// Class definition (non-Rust languages).
    Class,
    /// Method within a class/impl.
    Method,
    /// Struct definition (Rust).
    Struct,
    /// Enum definition.
    Enum,
    /// Trait definition (Rust).
    Trait,
    /// Interface (TypeScript, Java).
    Interface,
    /// Impl block (Rust).
    Impl,
    /// Module definition.
    Mod,
    /// Type alias.
    TypeAlias,
    /// Constant.
    Const,
    /// Static variable.
    Static,
    /// Gap/unstructured code.
    Block,
    /// File-level summary chunk.
    Anchor,
    /// Standalone comment block (gap between definitions).
    Comment,
    /// Import/use statements block.
    Imports,
    /// Module-level documentation (`//!`, `/*!`).
    ModuleDocs,
    /// Catch-all.
    Other,
}
160
161/// Trait for chunking strategies
162pub trait Chunker: Send + Sync {
163    /// Chunk a file into semantic pieces
164    fn chunk_file(&self, path: &Path, content: &str) -> Result<Vec<Chunk>>;
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a function chunk over the given content and line span.
    fn sample_chunk(content: &str, start_line: usize, end_line: usize) -> Chunk {
        Chunk::new(
            content.to_string(),
            start_line,
            end_line,
            ChunkKind::Function,
            "src/lib.rs".to_string(),
        )
    }

    #[test]
    fn new_fills_in_defaults_and_hash() {
        let chunk = sample_chunk("fn a() {}", 3, 7);

        assert_eq!(chunk.content, "fn a() {}");
        assert_eq!(chunk.start_line, 3);
        assert_eq!(chunk.end_line, 7);
        assert_eq!(chunk.kind, ChunkKind::Function);
        assert_eq!(chunk.path, "src/lib.rs");
        assert!(chunk.context.is_empty());
        assert!(chunk.signature.is_none());
        assert!(chunk.docstring.is_none());
        assert!(chunk.is_complete);
        assert!(chunk.split_index.is_none());
        assert!(chunk.context_prev.is_none());
        assert!(chunk.context_next.is_none());
        assert_eq!(chunk.hash, Chunk::compute_hash("fn a() {}"));
    }

    #[test]
    fn compute_hash_is_lowercase_hex_sha256() {
        // Well-known SHA-256 digest of the empty input.
        assert_eq!(
            Chunk::compute_hash(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );

        let hash = Chunk::compute_hash("fn a() {}");
        assert_eq!(hash.len(), 64);
        assert!(hash
            .chars()
            .all(|c| c.is_ascii_digit() || ('a'..='f').contains(&c)));
        assert_ne!(hash, Chunk::compute_hash("fn b() {}"));
    }

    #[test]
    fn duplicates_are_detected_by_content_only() {
        let first = sample_chunk("let x = 1;", 0, 1);
        let mut second = sample_chunk("let x = 1;", 40, 41);
        second.path = "src/other.rs".to_string();
        let different = sample_chunk("let x = 2;", 0, 1);

        assert!(first.is_duplicate_of(&second));
        assert!(second.is_duplicate_of(&first));
        assert!(!first.is_duplicate_of(&different));
    }

    #[test]
    fn line_count_saturates_at_zero() {
        assert_eq!(sample_chunk("", 10, 14).line_count(), 4);
        assert_eq!(sample_chunk("", 5, 5).line_count(), 0);
        assert_eq!(sample_chunk("", 5, 2).line_count(), 0);
    }

    #[test]
    fn size_bytes_counts_bytes_not_chars() {
        assert_eq!(sample_chunk("abc", 0, 1).size_bytes(), 3);
        assert_eq!(sample_chunk("é", 0, 1).size_bytes(), 2);
    }

    #[test]
    fn estimate_memory_usage_sums_string_lengths() {
        let mut chunk = sample_chunk("abcd", 0, 1);
        assert_eq!(chunk.estimate_memory_usage(), 4);

        chunk.context = vec!["File: a".to_string()];
        chunk.signature = Some("fn a()".to_string());
        chunk.docstring = Some("doc".to_string());
        chunk.context_prev = Some("p".to_string());
        chunk.context_next = Some("nn".to_string());

        // 4 (content) + 7 (context) + 6 (signature) + 3 (docstring) + 1 + 2
        assert_eq!(chunk.estimate_memory_usage(), 23);
    }

    #[test]
    fn contains_keyword_checks_content_signature_and_docstring() {
        let mut chunk = sample_chunk("fn run() {}", 0, 1);
        chunk.signature = Some("fn run() -> Outcome".to_string());
        chunk.docstring = Some("Starts the server".to_string());
        chunk.context_prev = Some("needle".to_string());

        assert!(chunk.contains_keyword("run"));
        assert!(chunk.contains_keyword("Outcome"));
        assert!(chunk.contains_keyword("server"));
        assert!(!chunk.contains_keyword("missing"));
        // Surrounding context is not part of the search.
        assert!(!chunk.contains_keyword("needle"));
    }
}
176}