// cadi_scraper/chunker.rs

1use crate::error::Result;
2use crate::types::{ChunkingStrategy, ScraperConfig};
3use sha2::{Sha256, Digest};
4
/// Chunker that splits content into semantic or fixed-size chunks,
/// dispatching on the strategy selected in `ScraperConfig::chunking_strategy`.
pub struct Chunker {
    // Supplies the chunking strategy plus max_chunk_size, include_overlap,
    // and overlap_size knobs read by the chunking methods below.
    config: ScraperConfig,
}
9
/// Represents a single chunk with metadata.
///
/// Line indices are 0-based; `start_line..end_line` is a half-open range
/// over the source's `lines()` (so `end_line` is exclusive).
#[derive(Debug, Clone)]
pub struct Chunk {
    pub id: String,                      // Content hash: "chunk:" + hex SHA-256 of `content`
    pub content: String,                 // Chunk text (may gain an overlap prefix when enabled)
    pub start_line: usize,               // First line of the chunk (0-based)
    pub end_line: usize,                 // One past the last line of the chunk
    pub language: Option<String>,        // Language tag as passed by the caller, if any
    pub concepts: Vec<String>,           // Heuristic concept tags (see extract_concepts)
    pub parent_id: Option<String>,       // Set on children produced by hierarchical chunking
    pub children: Vec<String>,           // Intended child ids; NOTE(review): never filled in by current code
    pub source_file: String,             // Identifier of the file this chunk came from
}
23
24impl Chunker {
25    pub fn new(config: ScraperConfig) -> Self {
26        Self { config }
27    }
28
29    /// Chunk content based on the configured strategy
30    pub fn chunk(
31        &self,
32        content: &str,
33        language: Option<&str>,
34        source_file: &str,
35    ) -> Result<Vec<Chunk>> {
36        match self.config.chunking_strategy {
37            ChunkingStrategy::ByFile => self.chunk_by_file(content, language, source_file),
38            ChunkingStrategy::Semantic => self.chunk_semantic(content, language, source_file),
39            ChunkingStrategy::FixedSize => self.chunk_fixed_size(content, language, source_file),
40            ChunkingStrategy::Hierarchical => {
41                self.chunk_hierarchical(content, language, source_file)
42            }
43            ChunkingStrategy::ByLineCount => self.chunk_by_lines(content, language, source_file),
44        }
45    }
46
47    /// Return entire content as single chunk
48    fn chunk_by_file(
49        &self,
50        content: &str,
51        language: Option<&str>,
52        source_file: &str,
53    ) -> Result<Vec<Chunk>> {
54        let id = self.compute_hash(content);
55        let line_count = content.lines().count();
56
57        Ok(vec![Chunk {
58            id,
59            content: content.to_string(),
60            start_line: 0,
61            end_line: line_count,
62            language: language.map(String::from),
63            concepts: extract_concepts(content, language),
64            parent_id: None,
65            children: Vec::new(),
66            source_file: source_file.to_string(),
67        }])
68    }
69
70    /// Chunk by semantic boundaries (functions, classes, etc)
71    fn chunk_semantic(
72        &self,
73        content: &str,
74        language: Option<&str>,
75        source_file: &str,
76    ) -> Result<Vec<Chunk>> {
77        let boundaries = self.find_semantic_boundaries(content, language)?;
78
79        if boundaries.is_empty() {
80            return self.chunk_by_file(content, language, source_file);
81        }
82
83        let mut chunks = Vec::new();
84        let lines: Vec<&str> = content.lines().collect();
85
86        for (i, boundary) in boundaries.iter().enumerate() {
87            let start = if i == 0 { 0 } else { boundaries[i - 1] };
88            let end = *boundary;
89
90            if start >= end {
91                continue;
92            }
93
94            let chunk_content = lines[start..end].join("\n");
95            if chunk_content.trim().is_empty() {
96                continue;
97            }
98
99            let id = self.compute_hash(&chunk_content);
100            let mut chunk = Chunk {
101                id,
102                content: chunk_content,
103                start_line: start,
104                end_line: end,
105                language: language.map(String::from),
106                concepts: extract_concepts(&lines[start..end].join("\n"), language),
107                parent_id: None,
108                children: Vec::new(),
109                source_file: source_file.to_string(),
110            };
111
112            // Add overlap if enabled
113            if self.config.include_overlap && i > 0 {
114                let overlap_start =
115                    (start.saturating_sub(self.config.overlap_size / 100)).max(0);
116                let overlap_lines = &lines[overlap_start..start];
117                let overlap_content = overlap_lines.join("\n");
118                chunk.content = format!("{}\n{}", overlap_content, chunk.content);
119            }
120
121            chunks.push(chunk);
122        }
123
124        Ok(chunks)
125    }
126
127    /// Chunk content into fixed sizes
128    fn chunk_fixed_size(
129        &self,
130        content: &str,
131        language: Option<&str>,
132        source_file: &str,
133    ) -> Result<Vec<Chunk>> {
134        let mut chunks = Vec::new();
135        let lines: Vec<&str> = content.lines().collect();
136        let lines_per_chunk = (self.config.max_chunk_size / 80).max(10); // Rough estimate
137
138        for (i, chunk_lines) in lines.chunks(lines_per_chunk).enumerate() {
139            let chunk_content = chunk_lines.join("\n");
140            let id = self.compute_hash(&chunk_content);
141
142            chunks.push(Chunk {
143                id,
144                content: chunk_content,
145                start_line: i * lines_per_chunk,
146                end_line: (i + 1) * lines_per_chunk,
147                language: language.map(String::from),
148                concepts: extract_concepts(&lines.join("\n"), language),
149                parent_id: None,
150                children: Vec::new(),
151                source_file: source_file.to_string(),
152            });
153        }
154
155        Ok(chunks)
156    }
157
158    /// Hierarchical chunking with parent-child relationships
159    fn chunk_hierarchical(
160        &self,
161        content: &str,
162        language: Option<&str>,
163        source_file: &str,
164    ) -> Result<Vec<Chunk>> {
165        // First create file-level chunk as parent
166        let parent_id = self.compute_hash(content);
167        let mut chunks = vec![Chunk {
168            id: parent_id.clone(),
169            content: content.to_string(),
170            start_line: 0,
171            end_line: content.lines().count(),
172            language: language.map(String::from),
173            concepts: extract_concepts(content, language),
174            parent_id: None,
175            children: Vec::new(),
176            source_file: source_file.to_string(),
177        }];
178
179        // Then create semantic sub-chunks as children
180        let mut children_chunks = self.chunk_semantic(content, language, source_file)?;
181        for child in &mut children_chunks {
182            child.parent_id = Some(parent_id.clone());
183        }
184
185        chunks.extend(children_chunks);
186        Ok(chunks)
187    }
188
189    /// Chunk by fixed line count
190    fn chunk_by_lines(
191        &self,
192        content: &str,
193        language: Option<&str>,
194        source_file: &str,
195    ) -> Result<Vec<Chunk>> {
196        let lines_per_chunk = 100; // Default 100 lines per chunk
197        self.chunk_by_custom_line_count(content, language, source_file, lines_per_chunk)
198    }
199
200    fn chunk_by_custom_line_count(
201        &self,
202        content: &str,
203        language: Option<&str>,
204        source_file: &str,
205        lines_per_chunk: usize,
206    ) -> Result<Vec<Chunk>> {
207        let mut chunks = Vec::new();
208        let lines: Vec<&str> = content.lines().collect();
209
210        for (i, chunk_lines) in lines.chunks(lines_per_chunk).enumerate() {
211            let chunk_content = chunk_lines.join("\n");
212            let id = self.compute_hash(&chunk_content);
213            let concepts = extract_concepts(&chunk_content, language);
214
215            chunks.push(Chunk {
216                id,
217                content: chunk_content,
218                start_line: i * lines_per_chunk,
219                end_line: std::cmp::min((i + 1) * lines_per_chunk, lines.len()),
220                language: language.map(String::from),
221                concepts,
222                parent_id: None,
223                children: Vec::new(),
224                source_file: source_file.to_string(),
225            });
226        }
227
228        Ok(chunks)
229    }
230
231    /// Find semantic boundaries in code
232    fn find_semantic_boundaries(&self, content: &str, language: Option<&str>) -> Result<Vec<usize>> {
233        let mut boundaries = Vec::new();
234        let lines: Vec<&str> = content.lines().collect();
235
236        match language {
237            Some("rust") => {
238                for (i, line) in lines.iter().enumerate() {
239                    let trimmed = line.trim();
240                    if trimmed.starts_with("fn ") || trimmed.starts_with("pub fn ")
241                        || trimmed.starts_with("struct ") || trimmed.starts_with("pub struct ")
242                        || trimmed.starts_with("impl ") || trimmed.starts_with("pub impl ")
243                        || trimmed.starts_with("trait ") || trimmed.starts_with("pub trait ")
244                    {
245                        boundaries.push(i);
246                    }
247                }
248            }
249            Some("typescript") | Some("javascript") => {
250                for (i, line) in lines.iter().enumerate() {
251                    let trimmed = line.trim();
252                    if trimmed.starts_with("function ") || trimmed.starts_with("export function ")
253                        || trimmed.starts_with("class ") || trimmed.starts_with("export class ")
254                        || trimmed.starts_with("interface ") || trimmed.starts_with("export interface ")
255                        || (trimmed.starts_with("const ") && trimmed.contains("=>"))
256                    {
257                        boundaries.push(i);
258                    }
259                }
260            }
261            Some("python") => {
262                for (i, line) in lines.iter().enumerate() {
263                    if !line.starts_with(' ') && (line.starts_with("def ") || line.starts_with("class ")) {
264                        boundaries.push(i);
265                    }
266                }
267            }
268            _ => {}
269        }
270
271        Ok(boundaries)
272    }
273
274    /// Compute SHA256 hash of content
275    pub fn compute_hash(&self, content: &str) -> String {
276        let mut hasher = Sha256::new();
277        hasher.update(content.as_bytes());
278        let result = hasher.finalize();
279        format!("chunk:{}", hex::encode(result))
280    }
281}
282
283/// Extract key concepts from content
/// Extract key concepts from content via per-language substring heuristics.
///
/// Returns tags in a fixed per-language order; each tag appears at most once.
/// Unrecognized languages (or `None`) yield an empty vector.
fn extract_concepts(content: &str, language: Option<&str>) -> Vec<String> {
    // (needle, tag) pairs checked in order. Both "react" and "React" map to
    // the same "react" tag; the dedup guard below keeps it to a single entry.
    let checks: &[(&str, &str)] = match language {
        Some("rust") => &[
            ("async", "async"),
            ("trait", "trait"),
            ("macro", "macro"),
            ("unsafe", "unsafe"),
        ],
        Some("typescript") | Some("javascript") => &[
            ("async", "async"),
            ("class", "oop"),
            ("react", "react"),
            ("React", "react"),
            ("@", "decorators"),
        ],
        Some("python") => &[
            ("async", "async"),
            ("@", "decorators"),
            ("class", "oop"),
        ],
        _ => &[],
    };

    let mut concepts: Vec<String> = Vec::new();
    for &(needle, tag) in checks {
        if content.contains(needle) && !concepts.iter().any(|c| c == tag) {
            concepts.push(tag.to_string());
        }
    }
    concepts
}
332
#[cfg(test)]
mod tests {
    use super::*;

    /// Hashing identical input twice must produce the same digest, and the
    /// digest must carry the "chunk:" namespace prefix.
    #[test]
    fn test_hash_consistency() {
        let chunker = Chunker::new(ScraperConfig::default());
        let input = "fn hello() {}";

        let first = chunker.compute_hash(input);
        assert!(first.starts_with("chunk:"));
        assert_eq!(first, chunker.compute_hash(input));
    }
}
346}