1use crate::error::Result;
2use crate::types::{ChunkingStrategy, ScraperConfig};
3use sha2::{Sha256, Digest};
4
/// Splits source text into [`Chunk`]s according to the strategy and sizing
/// knobs carried in its configuration.
pub struct Chunker {
    // Strategy selection plus max_chunk_size / overlap settings; see `chunk`.
    config: ScraperConfig,
}
9
/// One unit of chunked content, addressable by a content hash.
#[derive(Debug, Clone)]
pub struct Chunk {
    // `id` is "chunk:" + hex SHA-256 of `content` (see `Chunker::compute_hash`).
    pub id: String, pub content: String,
    // Half-open line span [start_line, end_line), 0-indexed within the source file.
    pub start_line: usize,
    pub end_line: usize,
    // Language tag used for boundary detection and concept tagging, if known.
    pub language: Option<String>,
    // Coarse labels produced by `extract_concepts` (e.g. "async", "oop").
    pub concepts: Vec<String>,
    // Hierarchical links: id of the enclosing file-level chunk, and child ids.
    pub parent_id: Option<String>,
    pub children: Vec<String>,
    // Path of the file this chunk came from, recorded verbatim.
    pub source_file: String,
}
23
24impl Chunker {
25 pub fn new(config: ScraperConfig) -> Self {
26 Self { config }
27 }
28
29 pub fn chunk(
31 &self,
32 content: &str,
33 language: Option<&str>,
34 source_file: &str,
35 ) -> Result<Vec<Chunk>> {
36 match self.config.chunking_strategy {
37 ChunkingStrategy::ByFile => self.chunk_by_file(content, language, source_file),
38 ChunkingStrategy::Semantic => self.chunk_semantic(content, language, source_file),
39 ChunkingStrategy::FixedSize => self.chunk_fixed_size(content, language, source_file),
40 ChunkingStrategy::Hierarchical => {
41 self.chunk_hierarchical(content, language, source_file)
42 }
43 ChunkingStrategy::ByLineCount => self.chunk_by_lines(content, language, source_file),
44 }
45 }
46
47 fn chunk_by_file(
49 &self,
50 content: &str,
51 language: Option<&str>,
52 source_file: &str,
53 ) -> Result<Vec<Chunk>> {
54 let id = self.compute_hash(content);
55 let line_count = content.lines().count();
56
57 Ok(vec![Chunk {
58 id,
59 content: content.to_string(),
60 start_line: 0,
61 end_line: line_count,
62 language: language.map(String::from),
63 concepts: extract_concepts(content, language),
64 parent_id: None,
65 children: Vec::new(),
66 source_file: source_file.to_string(),
67 }])
68 }
69
70 fn chunk_semantic(
72 &self,
73 content: &str,
74 language: Option<&str>,
75 source_file: &str,
76 ) -> Result<Vec<Chunk>> {
77 let boundaries = self.find_semantic_boundaries(content, language)?;
78
79 if boundaries.is_empty() {
80 return self.chunk_by_file(content, language, source_file);
81 }
82
83 let mut chunks = Vec::new();
84 let lines: Vec<&str> = content.lines().collect();
85
86 for (i, boundary) in boundaries.iter().enumerate() {
87 let start = if i == 0 { 0 } else { boundaries[i - 1] };
88 let end = *boundary;
89
90 if start >= end {
91 continue;
92 }
93
94 let chunk_content = lines[start..end].join("\n");
95 if chunk_content.trim().is_empty() {
96 continue;
97 }
98
99 let id = self.compute_hash(&chunk_content);
100 let mut chunk = Chunk {
101 id,
102 content: chunk_content,
103 start_line: start,
104 end_line: end,
105 language: language.map(String::from),
106 concepts: extract_concepts(&lines[start..end].join("\n"), language),
107 parent_id: None,
108 children: Vec::new(),
109 source_file: source_file.to_string(),
110 };
111
112 if self.config.include_overlap && i > 0 {
114 let overlap_start =
115 (start.saturating_sub(self.config.overlap_size / 100)).max(0);
116 let overlap_lines = &lines[overlap_start..start];
117 let overlap_content = overlap_lines.join("\n");
118 chunk.content = format!("{}\n{}", overlap_content, chunk.content);
119 }
120
121 chunks.push(chunk);
122 }
123
124 Ok(chunks)
125 }
126
127 fn chunk_fixed_size(
129 &self,
130 content: &str,
131 language: Option<&str>,
132 source_file: &str,
133 ) -> Result<Vec<Chunk>> {
134 let mut chunks = Vec::new();
135 let lines: Vec<&str> = content.lines().collect();
136 let lines_per_chunk = (self.config.max_chunk_size / 80).max(10); for (i, chunk_lines) in lines.chunks(lines_per_chunk).enumerate() {
139 let chunk_content = chunk_lines.join("\n");
140 let id = self.compute_hash(&chunk_content);
141
142 chunks.push(Chunk {
143 id,
144 content: chunk_content,
145 start_line: i * lines_per_chunk,
146 end_line: (i + 1) * lines_per_chunk,
147 language: language.map(String::from),
148 concepts: extract_concepts(&lines.join("\n"), language),
149 parent_id: None,
150 children: Vec::new(),
151 source_file: source_file.to_string(),
152 });
153 }
154
155 Ok(chunks)
156 }
157
158 fn chunk_hierarchical(
160 &self,
161 content: &str,
162 language: Option<&str>,
163 source_file: &str,
164 ) -> Result<Vec<Chunk>> {
165 let parent_id = self.compute_hash(content);
167 let mut chunks = vec![Chunk {
168 id: parent_id.clone(),
169 content: content.to_string(),
170 start_line: 0,
171 end_line: content.lines().count(),
172 language: language.map(String::from),
173 concepts: extract_concepts(content, language),
174 parent_id: None,
175 children: Vec::new(),
176 source_file: source_file.to_string(),
177 }];
178
179 let mut children_chunks = self.chunk_semantic(content, language, source_file)?;
181 for child in &mut children_chunks {
182 child.parent_id = Some(parent_id.clone());
183 }
184
185 chunks.extend(children_chunks);
186 Ok(chunks)
187 }
188
189 fn chunk_by_lines(
191 &self,
192 content: &str,
193 language: Option<&str>,
194 source_file: &str,
195 ) -> Result<Vec<Chunk>> {
196 let lines_per_chunk = 100; self.chunk_by_custom_line_count(content, language, source_file, lines_per_chunk)
198 }
199
200 fn chunk_by_custom_line_count(
201 &self,
202 content: &str,
203 language: Option<&str>,
204 source_file: &str,
205 lines_per_chunk: usize,
206 ) -> Result<Vec<Chunk>> {
207 let mut chunks = Vec::new();
208 let lines: Vec<&str> = content.lines().collect();
209
210 for (i, chunk_lines) in lines.chunks(lines_per_chunk).enumerate() {
211 let chunk_content = chunk_lines.join("\n");
212 let id = self.compute_hash(&chunk_content);
213 let concepts = extract_concepts(&chunk_content, language);
214
215 chunks.push(Chunk {
216 id,
217 content: chunk_content,
218 start_line: i * lines_per_chunk,
219 end_line: std::cmp::min((i + 1) * lines_per_chunk, lines.len()),
220 language: language.map(String::from),
221 concepts,
222 parent_id: None,
223 children: Vec::new(),
224 source_file: source_file.to_string(),
225 });
226 }
227
228 Ok(chunks)
229 }
230
231 fn find_semantic_boundaries(&self, content: &str, language: Option<&str>) -> Result<Vec<usize>> {
233 let mut boundaries = Vec::new();
234 let lines: Vec<&str> = content.lines().collect();
235
236 match language {
237 Some("rust") => {
238 for (i, line) in lines.iter().enumerate() {
239 let trimmed = line.trim();
240 if trimmed.starts_with("fn ") || trimmed.starts_with("pub fn ")
241 || trimmed.starts_with("struct ") || trimmed.starts_with("pub struct ")
242 || trimmed.starts_with("impl ") || trimmed.starts_with("pub impl ")
243 || trimmed.starts_with("trait ") || trimmed.starts_with("pub trait ")
244 {
245 boundaries.push(i);
246 }
247 }
248 }
249 Some("typescript") | Some("javascript") => {
250 for (i, line) in lines.iter().enumerate() {
251 let trimmed = line.trim();
252 if trimmed.starts_with("function ") || trimmed.starts_with("export function ")
253 || trimmed.starts_with("class ") || trimmed.starts_with("export class ")
254 || trimmed.starts_with("interface ") || trimmed.starts_with("export interface ")
255 || (trimmed.starts_with("const ") && trimmed.contains("=>"))
256 {
257 boundaries.push(i);
258 }
259 }
260 }
261 Some("python") => {
262 for (i, line) in lines.iter().enumerate() {
263 if !line.starts_with(' ') && (line.starts_with("def ") || line.starts_with("class ")) {
264 boundaries.push(i);
265 }
266 }
267 }
268 _ => {}
269 }
270
271 Ok(boundaries)
272 }
273
274 pub fn compute_hash(&self, content: &str) -> String {
276 let mut hasher = Sha256::new();
277 hasher.update(content.as_bytes());
278 let result = hasher.finalize();
279 format!("chunk:{}", hex::encode(result))
280 }
281}
282
/// Tags `content` with coarse concept labels via simple substring probes
/// chosen per language; unknown or unspecified languages yield no labels.
/// Each label appears at most once, in a fixed per-language order.
fn extract_concepts(content: &str, language: Option<&str>) -> Vec<String> {
    // (marker substring, concept label) probes for each language family.
    let probes: &[(&str, &str)] = match language {
        Some("rust") => &[
            ("async", "async"),
            ("trait", "trait"),
            ("macro", "macro"),
            ("unsafe", "unsafe"),
        ],
        Some("typescript") | Some("javascript") => &[
            ("async", "async"),
            ("class", "oop"),
            ("react", "react"),
            ("React", "react"),
            ("@", "decorators"),
        ],
        Some("python") => &[
            ("async", "async"),
            ("@", "decorators"),
            ("class", "oop"),
        ],
        _ => &[],
    };

    let mut concepts: Vec<String> = Vec::new();
    for &(needle, label) in probes {
        // Dedup guard: "react"/"React" both map to the same label.
        if content.contains(needle) && !concepts.iter().any(|c| c == label) {
            concepts.push(label.to_string());
        }
    }
    concepts
}
332
#[cfg(test)]
mod tests {
    use super::*;

    // Hashing identical input twice must yield the same "chunk:"-tagged digest.
    #[test]
    fn test_hash_consistency() {
        let chunker = Chunker::new(ScraperConfig::default());
        let input = "fn hello() {}";
        let first = chunker.compute_hash(input);
        assert_eq!(first, chunker.compute_hash(input));
        assert!(first.starts_with("chunk:"));
    }
}