aurora_semantic/chunker/
mod.rs

1//! Code chunking module for extracting meaningful code segments.
2//!
3//! This module provides functionality to parse source code and extract
4//! semantically meaningful chunks (functions, classes, etc.).
5
6mod strategies;
7
8use crate::config::ChunkingConfig;
9use crate::error::Result;
10use crate::types::{Chunk, ChunkId, ChunkType, Document, Language};
11
12/// Trait for code chunking implementations.
13pub trait Chunker: Send + Sync {
14    /// Extract chunks from source code content.
15    fn chunk(&self, document: &Document, content: &str) -> Result<Vec<Chunk>>;
16
17    /// Get the name of this chunker.
18    fn name(&self) -> &'static str;
19}
20
21/// Default chunker using regex-based parsing.
22pub struct DefaultChunker {
23    config: ChunkingConfig,
24}
25
26impl DefaultChunker {
27    /// Create a new default chunker with the given configuration.
28    pub fn new(config: ChunkingConfig) -> Result<Self> {
29        Ok(Self { config })
30    }
31}
32
33impl Chunker for DefaultChunker {
34    fn chunk(&self, document: &Document, content: &str) -> Result<Vec<Chunk>> {
35        // Skip if content is too small
36        if content.len() < self.config.min_chunk_size {
37            return Ok(vec![]);
38        }
39
40        // Use semantic chunking based on language patterns
41        let chunks = extract_chunks_by_language(document, content, &self.config);
42
43        // If we got no chunks, fall back to line-based chunking
44        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
45            return Ok(vec![Chunk {
46                id: ChunkId::new(),
47                document_id: document.id.clone(),
48                content: content.to_string(),
49                chunk_type: ChunkType::Block,
50                start_line: 1,
51                end_line: content.lines().count() as u32,
52                start_byte: 0,
53                end_byte: content.len(),
54                symbol_name: None,
55                parent_symbol: None,
56            }]);
57        }
58
59        Ok(chunks)
60    }
61
62    fn name(&self) -> &'static str {
63        "default"
64    }
65}
66
67/// Extract chunks based on language-specific patterns.
68fn extract_chunks_by_language(
69    document: &Document,
70    content: &str,
71    config: &ChunkingConfig,
72) -> Vec<Chunk> {
73    match document.language {
74        Language::Rust => extract_rust_chunks(document, content, config),
75        Language::Python => extract_python_chunks(document, content, config),
76        Language::JavaScript | Language::TypeScript => {
77            extract_js_chunks(document, content, config)
78        }
79        Language::Go => extract_go_chunks(document, content, config),
80        Language::Java => extract_java_chunks(document, content, config),
81        _ => extract_generic_chunks(document, content, config),
82    }
83}
84
85/// Extract Rust functions/structs/impls.
86fn extract_rust_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
87    let mut chunks = Vec::new();
88    let lines: Vec<&str> = content.lines().collect();
89
90    let mut i = 0;
91    while i < lines.len() {
92        let line = lines[i].trim();
93
94        // Detect function, struct, enum, impl, trait, mod
95        let (chunk_type, name) = if line.starts_with("pub fn ")
96            || line.starts_with("fn ")
97            || line.starts_with("pub async fn ")
98            || line.starts_with("async fn ")
99        {
100            (ChunkType::Function, extract_name_after(line, "fn "))
101        } else if line.starts_with("pub struct ") || line.starts_with("struct ") {
102            (ChunkType::Struct, extract_name_after(line, "struct "))
103        } else if line.starts_with("pub enum ") || line.starts_with("enum ") {
104            (ChunkType::Enum, extract_name_after(line, "enum "))
105        } else if line.starts_with("impl ") {
106            (ChunkType::Implementation, extract_impl_name(line))
107        } else if line.starts_with("pub trait ") || line.starts_with("trait ") {
108            (ChunkType::Interface, extract_name_after(line, "trait "))
109        } else if line.starts_with("mod ") || line.starts_with("pub mod ") {
110            (ChunkType::Module, extract_name_after(line, "mod "))
111        } else {
112            i += 1;
113            continue;
114        };
115
116        // Find the end of the block
117        let start_line = i;
118        let end_line = find_block_end(&lines, i);
119
120        let chunk_content: String = lines[start_line..=end_line].join("\n");
121
122        if chunk_content.len() >= config.min_chunk_size
123            && chunk_content.len() <= config.max_chunk_size
124        {
125            let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
126            let end_byte = start_byte + chunk_content.len();
127
128            chunks.push(Chunk {
129                id: ChunkId::new(),
130                document_id: document.id.clone(),
131                content: chunk_content,
132                chunk_type,
133                start_line: start_line as u32 + 1,
134                end_line: end_line as u32 + 1,
135                start_byte,
136                end_byte,
137                symbol_name: name,
138                parent_symbol: None,
139            });
140        }
141
142        i = end_line + 1;
143    }
144
145    chunks
146}
147
148/// Extract Python functions/classes.
149fn extract_python_chunks(
150    document: &Document,
151    content: &str,
152    config: &ChunkingConfig,
153) -> Vec<Chunk> {
154    let mut chunks = Vec::new();
155    let lines: Vec<&str> = content.lines().collect();
156
157    let mut i = 0;
158    while i < lines.len() {
159        let line = lines[i];
160        let trimmed = line.trim();
161
162        let (chunk_type, name) = if trimmed.starts_with("def ")
163            || trimmed.starts_with("async def ")
164        {
165            (ChunkType::Function, extract_name_after(trimmed, "def "))
166        } else if trimmed.starts_with("class ") {
167            (ChunkType::Class, extract_name_after(trimmed, "class "))
168        } else {
169            i += 1;
170            continue;
171        };
172
173        // Find indentation level
174        let indent = line.len() - line.trim_start().len();
175        let start_line = i;
176        let end_line = find_python_block_end(&lines, i, indent);
177
178        let chunk_content: String = lines[start_line..=end_line].join("\n");
179
180        if chunk_content.len() >= config.min_chunk_size
181            && chunk_content.len() <= config.max_chunk_size
182        {
183            let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
184            let end_byte = start_byte + chunk_content.len();
185
186            chunks.push(Chunk {
187                id: ChunkId::new(),
188                document_id: document.id.clone(),
189                content: chunk_content,
190                chunk_type,
191                start_line: start_line as u32 + 1,
192                end_line: end_line as u32 + 1,
193                start_byte,
194                end_byte,
195                symbol_name: name,
196                parent_symbol: None,
197            });
198        }
199
200        i = end_line + 1;
201    }
202
203    chunks
204}
205
206/// Extract JavaScript/TypeScript functions/classes.
207fn extract_js_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
208    let mut chunks = Vec::new();
209    let lines: Vec<&str> = content.lines().collect();
210
211    let mut i = 0;
212    while i < lines.len() {
213        let line = lines[i].trim();
214
215        let (chunk_type, name) = if line.starts_with("function ")
216            || line.starts_with("async function ")
217            || line.starts_with("export function ")
218            || line.starts_with("export async function ")
219        {
220            (ChunkType::Function, extract_name_after(line, "function "))
221        } else if line.starts_with("class ") || line.starts_with("export class ") {
222            (ChunkType::Class, extract_name_after(line, "class "))
223        } else if line.starts_with("interface ") || line.starts_with("export interface ") {
224            (ChunkType::Interface, extract_name_after(line, "interface "))
225        } else if line.contains("=>") && (line.starts_with("const ") || line.starts_with("export const ")) {
226            (ChunkType::Function, extract_name_after(line, "const "))
227        } else {
228            i += 1;
229            continue;
230        };
231
232        let start_line = i;
233        let end_line = find_block_end(&lines, i);
234
235        let chunk_content: String = lines[start_line..=end_line].join("\n");
236
237        if chunk_content.len() >= config.min_chunk_size
238            && chunk_content.len() <= config.max_chunk_size
239        {
240            let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
241            let end_byte = start_byte + chunk_content.len();
242
243            chunks.push(Chunk {
244                id: ChunkId::new(),
245                document_id: document.id.clone(),
246                content: chunk_content,
247                chunk_type,
248                start_line: start_line as u32 + 1,
249                end_line: end_line as u32 + 1,
250                start_byte,
251                end_byte,
252                symbol_name: name,
253                parent_symbol: None,
254            });
255        }
256
257        i = end_line + 1;
258    }
259
260    chunks
261}
262
263/// Extract Go functions/structs.
264fn extract_go_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
265    let mut chunks = Vec::new();
266    let lines: Vec<&str> = content.lines().collect();
267
268    let mut i = 0;
269    while i < lines.len() {
270        let line = lines[i].trim();
271
272        let (chunk_type, name) = if line.starts_with("func ") {
273            (ChunkType::Function, extract_go_func_name(line))
274        } else if line.starts_with("type ") && line.contains("struct") {
275            (ChunkType::Struct, extract_name_after(line, "type "))
276        } else if line.starts_with("type ") && line.contains("interface") {
277            (ChunkType::Interface, extract_name_after(line, "type "))
278        } else {
279            i += 1;
280            continue;
281        };
282
283        let start_line = i;
284        let end_line = find_block_end(&lines, i);
285
286        let chunk_content: String = lines[start_line..=end_line].join("\n");
287
288        if chunk_content.len() >= config.min_chunk_size
289            && chunk_content.len() <= config.max_chunk_size
290        {
291            let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
292            let end_byte = start_byte + chunk_content.len();
293
294            chunks.push(Chunk {
295                id: ChunkId::new(),
296                document_id: document.id.clone(),
297                content: chunk_content,
298                chunk_type,
299                start_line: start_line as u32 + 1,
300                end_line: end_line as u32 + 1,
301                start_byte,
302                end_byte,
303                symbol_name: name,
304                parent_symbol: None,
305            });
306        }
307
308        i = end_line + 1;
309    }
310
311    chunks
312}
313
314/// Extract Java methods/classes.
315fn extract_java_chunks(document: &Document, content: &str, config: &ChunkingConfig) -> Vec<Chunk> {
316    let mut chunks = Vec::new();
317    let lines: Vec<&str> = content.lines().collect();
318
319    let mut i = 0;
320    while i < lines.len() {
321        let line = lines[i].trim();
322
323        let (chunk_type, name) =
324            if line.contains("class ") && (line.starts_with("public") || line.starts_with("class"))
325            {
326                (ChunkType::Class, extract_name_after(line, "class "))
327            } else if line.contains("interface ")
328                && (line.starts_with("public") || line.starts_with("interface"))
329            {
330                (ChunkType::Interface, extract_name_after(line, "interface "))
331            } else if (line.starts_with("public ")
332                || line.starts_with("private ")
333                || line.starts_with("protected "))
334                && line.contains("(")
335                && !line.contains("class")
336            {
337                (ChunkType::Function, extract_java_method_name(line))
338            } else {
339                i += 1;
340                continue;
341            };
342
343        let start_line = i;
344        let end_line = find_block_end(&lines, i);
345
346        let chunk_content: String = lines[start_line..=end_line].join("\n");
347
348        if chunk_content.len() >= config.min_chunk_size
349            && chunk_content.len() <= config.max_chunk_size
350        {
351            let start_byte = lines[..start_line].iter().map(|l| l.len() + 1).sum();
352            let end_byte = start_byte + chunk_content.len();
353
354            chunks.push(Chunk {
355                id: ChunkId::new(),
356                document_id: document.id.clone(),
357                content: chunk_content,
358                chunk_type,
359                start_line: start_line as u32 + 1,
360                end_line: end_line as u32 + 1,
361                start_byte,
362                end_byte,
363                symbol_name: name,
364                parent_symbol: None,
365            });
366        }
367
368        i = end_line + 1;
369    }
370
371    chunks
372}
373
374/// Generic chunking for unknown languages.
375fn extract_generic_chunks(
376    document: &Document,
377    content: &str,
378    config: &ChunkingConfig,
379) -> Vec<Chunk> {
380    let lines: Vec<&str> = content.lines().collect();
381    let mut chunks = Vec::new();
382
383    // Split into chunks by blank lines or size limits
384    let mut start = 0;
385    let mut current_chunk = String::new();
386
387    for (i, line) in lines.iter().enumerate() {
388        current_chunk.push_str(line);
389        current_chunk.push('\n');
390
391        // Check if we should create a chunk
392        let should_split = current_chunk.len() >= config.max_chunk_size
393            || (line.is_empty() && current_chunk.len() >= config.min_chunk_size);
394
395        if should_split {
396            let start_byte: usize = lines[..start].iter().map(|l| l.len() + 1).sum();
397
398            chunks.push(Chunk {
399                id: ChunkId::new(),
400                document_id: document.id.clone(),
401                content: current_chunk.trim().to_string(),
402                chunk_type: ChunkType::Block,
403                start_line: start as u32 + 1,
404                end_line: i as u32 + 1,
405                start_byte,
406                end_byte: start_byte + current_chunk.len(),
407                symbol_name: None,
408                parent_symbol: None,
409            });
410
411            current_chunk.clear();
412            start = i + 1;
413        }
414    }
415
416    // Don't forget the last chunk
417    if current_chunk.len() >= config.min_chunk_size {
418        let start_byte: usize = lines[..start].iter().map(|l| l.len() + 1).sum();
419
420        chunks.push(Chunk {
421            id: ChunkId::new(),
422            document_id: document.id.clone(),
423            content: current_chunk.trim().to_string(),
424            chunk_type: ChunkType::Block,
425            start_line: start as u32 + 1,
426            end_line: lines.len() as u32,
427            start_byte,
428            end_byte: start_byte + current_chunk.len(),
429            symbol_name: None,
430            parent_symbol: None,
431        });
432    }
433
434    chunks
435}
436
437// Helper functions
438
/// Returns the identifier that directly follows the first occurrence of
/// `keyword` in `line`.
///
/// The identifier is the longest run of alphanumeric characters and
/// underscores starting immediately after `keyword`. Returns `None` when
/// `keyword` does not occur or no identifier character follows it. An empty
/// `keyword` matches at the start of `line`.
fn extract_name_after(line: &str, keyword: &str) -> Option<String> {
    let name_start = line.find(keyword)? + keyword.len();

    let name: String = line[name_start..]
        .chars()
        .take_while(|c| c.is_alphanumeric() || *c == '_')
        .collect();

    if name.is_empty() {
        None
    } else {
        Some(name)
    }
}
452
/// Extracts the first type or trait name from a Rust `impl` header.
///
/// * `impl Point {` and `impl Foo<T> {` yield `Point` and `Foo`; generic
///   arguments are not part of the name.
/// * A leading generic parameter list is skipped, so `impl<T> Foo<T> {`
///   yields `Foo`.
/// * For `impl Trait for Type`, the name is the first identifier after
///   `impl`, i.e. the trait.
///
/// Returns `None` when `line` does not start with an `impl` header or no
/// identifier follows it.
fn extract_impl_name(line: &str) -> Option<String> {
    let after_keyword = line.strip_prefix("impl")?;

    // `impl` must be a whole word (`impl Foo`, `impl<T> Foo`), not the start
    // of a longer identifier such as `implement`.
    if !after_keyword.starts_with(|c: char| c.is_whitespace() || c == '<') {
        return None;
    }

    let mut rest = after_keyword.trim_start();

    if rest.starts_with('<') {
        // Skip the balanced `<...>` parameter list. A `>` that is part of
        // `->` (as in `F: Fn() -> T`) does not close a bracket.
        let mut depth = 0usize;
        let mut prev = ' ';
        let mut close = None;
        for (idx, c) in rest.char_indices() {
            match c {
                '<' => depth += 1,
                '>' if prev != '-' => {
                    depth -= 1;
                    if depth == 0 {
                        close = Some(idx);
                        break;
                    }
                }
                _ => {}
            }
            prev = c;
        }
        rest = rest[close? + 1..].trim_start();
    }

    let name: String = rest
        .chars()
        .take_while(|c| c.is_alphanumeric() || *c == '_')
        .collect();

    if name.is_empty() {
        None
    } else {
        Some(name)
    }
}
465
466fn extract_go_func_name(line: &str) -> Option<String> {
467    // func Name(...) or func (r Receiver) Name(...)
468    let rest = line.strip_prefix("func")?.trim();
469    if rest.starts_with('(') {
470        // Method with receiver
471        let after_receiver = rest.find(')')? + 1;
472        let name_part = rest[after_receiver..].trim();
473        extract_name_after(name_part, "")
474    } else {
475        extract_name_after(rest, "")
476    }
477}
478
/// Extracts a Java method name: the last whitespace-separated word before
/// the first `(` in `line` (e.g. `public void methodName(...)`).
///
/// Returns `None` when `line` contains no `(` or nothing precedes it.
fn extract_java_method_name(line: &str) -> Option<String> {
    let (signature, _params) = line.split_once('(')?;
    signature
        .split_whitespace()
        .next_back()
        .map(str::to_owned)
}
486
/// Finds the index of the line on which the brace-delimited block that
/// starts at `lines[start]` ends.
///
/// Braces are counted from `start` onwards and the result is the first line
/// at whose end every opened `{` has been closed. Closing braces seen
/// before the first `{` are ignored.
///
/// A declaration without a body ends on its own terminating line: if a line
/// ends with `;` (optionally followed by a `//` comment) before any `{` has
/// been seen, that line is returned. This keeps items such as `mod foo;`,
/// `struct Unit;` or an abstract method from running on into the next
/// block.
///
/// If no end is found, the index of the last line is returned (`0` for an
/// empty slice).
///
/// This is a textual heuristic: braces inside string literals or comments
/// are counted like any other.
fn find_block_end(lines: &[&str], start: usize) -> usize {
    let mut depth = 0usize;
    let mut opened = false;

    for (i, line) in lines.iter().enumerate().skip(start) {
        for c in line.chars() {
            match c {
                '{' => {
                    depth += 1;
                    opened = true;
                }
                // Only closers that follow an opener belong to this block;
                // saturate so a surplus `}` cannot unbalance the count.
                '}' if opened => depth = depth.saturating_sub(1),
                _ => {}
            }
        }

        if opened {
            if depth == 0 {
                return i;
            }
        } else {
            // No body yet: a terminating `;` means there will not be one.
            let code = line.split("//").next().unwrap_or(line);
            if code.trim_end().ends_with(';') {
                return i;
            }
        }
    }

    lines.len().saturating_sub(1)
}
508
/// Finds the index of the last line of the Python block whose header starts
/// at `lines[start]` and is indented by `base_indent` bytes.
///
/// The header may span several lines: it lasts until every `(`, `[` and `{`
/// opened on it has been closed, so a signature whose closing `):` sits at
/// the header's own indentation does not end the block early. After the
/// header, the block continues until the first non-blank line indented by
/// `base_indent` or less.
///
/// The result is the last non-blank line of the block, so trailing blank
/// lines are excluded. If `start` is out of range, the index of the last
/// line is returned (`0` for an empty slice).
///
/// This is a textual heuristic: brackets inside string literals or comments
/// on the header are counted like any other.
fn find_python_block_end(lines: &[&str], start: usize, base_indent: usize) -> usize {
    // Locate the end of the (possibly multi-line) header.
    let mut depth = 0i32;
    let mut header_end = start;
    for (i, line) in lines.iter().enumerate().skip(start) {
        for c in line.chars() {
            match c {
                '(' | '[' | '{' => depth += 1,
                ')' | ']' | '}' => depth -= 1,
                _ => {}
            }
        }
        header_end = i;
        if depth <= 0 {
            break;
        }
    }

    // Walk the body, remembering the last line that has content.
    let mut last_content = header_end;
    for (i, line) in lines.iter().enumerate().skip(header_end + 1) {
        if line.trim().is_empty() {
            continue;
        }
        let indent = line.len() - line.trim_start().len();
        if indent <= base_indent {
            break;
        }
        last_content = i;
    }

    last_content.min(lines.len().saturating_sub(1))
}
521
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Rust sample containing a function, a struct and an impl block.
    const RUST_SAMPLE: &str = r#"
fn hello() {
    println!("Hello");
}

pub struct Point {
    x: i32,
    y: i32,
}

impl Point {
    fn new() -> Self {
        Self { x: 0, y: 0 }
    }
}
"#;

    /// Python sample containing a function and a class with one method.
    const PYTHON_SAMPLE: &str = r#"
def hello():
    print("Hello")

class MyClass:
    def __init__(self):
        pass
"#;

    /// Builds a placeholder document tagged with `language`.
    fn make_document(language: Language) -> Document {
        Document {
            id: crate::types::DocumentId::new(),
            relative_path: PathBuf::from("test.rs"),
            absolute_path: PathBuf::from("/test/test.rs"),
            language,
            content_hash: "test".to_string(),
            size_bytes: 0,
            modified_at: chrono::Utc::now(),
        }
    }

    /// At least two Rust items are extracted, one of them named `hello`.
    #[test]
    fn test_rust_chunking() {
        let document = make_document(Language::Rust);
        let chunks = extract_rust_chunks(&document, RUST_SAMPLE, &ChunkingConfig::default());

        assert!(chunks.len() >= 2);

        let has_hello = chunks
            .iter()
            .any(|chunk| chunk.symbol_name.as_deref() == Some("hello"));
        assert!(has_hello);
    }

    /// The Python extractor produces at least one chunk for the sample.
    #[test]
    fn test_python_chunking() {
        let document = make_document(Language::Python);
        let chunks = extract_python_chunks(&document, PYTHON_SAMPLE, &ChunkingConfig::default());

        assert!(!chunks.is_empty());
    }
}
581}