cadi_core/
smart_chunker.rs

//! Smart Chunker - Intelligent code analysis and chunking
//!
//! The SmartChunker analyzes code to determine the optimal way to atomize it:
//! - Identifies natural boundaries (functions, types, modules)
//! - Detects reusability patterns
//! - Finds shared utilities that should be extracted
//! - Determines if code should be one chunk or many
//! - Handles composition detection
9
10use crate::atomic::{
11    AtomicChunk, ChunkCategory, ChunkGranularity,
12    ChunkMetrics, SourceLocation,
13};
14use serde::{Deserialize, Serialize};
15use sha2::{Digest, Sha256};
16use std::path::{Path, PathBuf};
17
18/// Configuration for smart chunking
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct SmartChunkerConfig {
21    /// Minimum lines for a function to be its own chunk
22    #[serde(default = "default_min_function_lines")]
23    pub min_function_lines: usize,
24
25    /// Minimum lines for a file to be chunked vs kept atomic
26    #[serde(default = "default_min_file_lines_to_split")]
27    pub min_file_lines_to_split: usize,
28
29    /// Maximum lines before forcing a split
30    #[serde(default = "default_max_chunk_lines")]
31    pub max_chunk_lines: usize,
32
33    /// Whether to extract utility functions as separate chunks
34    #[serde(default = "default_true")]
35    pub extract_utilities: bool,
36
37    /// Whether to create type-level chunks (structs, classes)
38    #[serde(default = "default_true")]
39    pub extract_types: bool,
40
41    /// Whether to group related functions together
42    #[serde(default = "default_true")]
43    pub group_related: bool,
44
45    /// Prefer atomic (single) chunks when possible
46    #[serde(default)]
47    pub prefer_atomic: bool,
48
49    /// Namespace for generated aliases
50    #[serde(skip_serializing_if = "Option::is_none")]
51    pub namespace: Option<String>,
52}
53
/// Serde default for `SmartChunkerConfig::min_function_lines`.
fn default_min_function_lines() -> usize {
    const MIN_FUNCTION_LINES: usize = 10;
    MIN_FUNCTION_LINES
}
57
/// Serde default for `SmartChunkerConfig::min_file_lines_to_split`.
fn default_min_file_lines_to_split() -> usize {
    const MIN_FILE_LINES_TO_SPLIT: usize = 50;
    MIN_FILE_LINES_TO_SPLIT
}
61
/// Serde default for `SmartChunkerConfig::max_chunk_lines`.
fn default_max_chunk_lines() -> usize {
    const MAX_CHUNK_LINES: usize = 500;
    MAX_CHUNK_LINES
}
65
/// Serde default for the boolean options that are switched on unless the
/// configuration says otherwise.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
69
70impl Default for SmartChunkerConfig {
71    fn default() -> Self {
72        Self {
73            min_function_lines: default_min_function_lines(),
74            min_file_lines_to_split: default_min_file_lines_to_split(),
75            max_chunk_lines: default_max_chunk_lines(),
76            extract_utilities: true,
77            extract_types: true,
78            group_related: true,
79            prefer_atomic: false,
80            namespace: None,
81        }
82    }
83}
84
85/// A detected code entity
86#[derive(Debug, Clone, Serialize, Deserialize)]
87pub struct CodeEntity {
88    pub name: String,
89    pub kind: EntityKind,
90    pub start_line: usize,
91    pub end_line: usize,
92    pub visibility: Visibility,
93    pub doc_comment: Option<String>,
94    pub imports: Vec<String>,
95    pub exports: Vec<String>,
96    pub calls: Vec<String>,
97    pub complexity: u32,
98}
99
100/// Kind of code entity
101#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
102pub enum EntityKind {
103    Function,
104    AsyncFunction,
105    Method,
106    Struct,
107    Class,
108    Trait,
109    Interface,
110    Enum,
111    Constant,
112    Module,
113    Type,
114    Test,
115    Macro,
116}
117
118/// Visibility of entity
119#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
120pub enum Visibility {
121    Public,
122    Private,
123    Internal,
124}
125
126/// Analysis result for a file
127#[derive(Debug, Clone, Serialize, Deserialize)]
128pub struct FileAnalysis {
129    pub path: PathBuf,
130    pub language: String,
131    pub total_lines: usize,
132    pub entities: Vec<CodeEntity>,
133    pub imports: Vec<String>,
134    pub exports: Vec<String>,
135    pub is_entrypoint: bool,
136    pub is_test: bool,
137    pub is_config: bool,
138    pub framework_hints: Vec<String>,
139    pub category: ChunkCategory,
140}
141
142/// Result of smart chunking analysis
143#[derive(Debug, Clone, Serialize, Deserialize)]
144pub struct ChunkingDecision {
145    pub file_path: PathBuf,
146    pub strategy: ChunkingStrategy,
147    pub suggested_chunks: Vec<SuggestedChunk>,
148    pub reasoning: String,
149}
150
151/// How to chunk a file
152#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
153pub enum ChunkingStrategy {
154    /// Keep as single atomic chunk
155    Atomic,
156    /// Split by entities (functions, types)
157    ByEntity,
158    /// Split by logical sections
159    BySections,
160    /// Hierarchical (file + entity chunks)
161    Hierarchical,
162    /// Skip this file (not useful as a chunk)
163    Skip,
164}
165
166/// A suggested chunk from analysis
167#[derive(Debug, Clone, Serialize, Deserialize)]
168pub struct SuggestedChunk {
169    pub name: String,
170    pub alias: String,
171    pub start_line: usize,
172    pub end_line: usize,
173    pub granularity: ChunkGranularity,
174    pub category: ChunkCategory,
175    pub concepts: Vec<String>,
176    pub requires: Vec<String>,
177    pub provides: Vec<String>,
178}
179
180/// The SmartChunker - intelligent code analyzer and chunker
181pub struct SmartChunker {
182    config: SmartChunkerConfig,
183}
184
185impl SmartChunker {
186    /// Create a new SmartChunker with configuration
187    pub fn new(config: SmartChunkerConfig) -> Self {
188        Self { config }
189    }
190
191    /// Create with default configuration
192    pub fn default() -> Self {
193        Self::new(SmartChunkerConfig::default())
194    }
195
196    /// Analyze a file and determine chunking strategy
197    pub fn analyze_file(&self, path: &Path, content: &str) -> FileAnalysis {
198        let language = self.detect_language(path);
199        let lines: Vec<&str> = content.lines().collect();
200        let total_lines = lines.len();
201
202        let entities = self.extract_entities(content, &language);
203        let imports = self.extract_imports(content, &language);
204        let exports = self.extract_exports(content, &language, &entities);
205        let is_entrypoint = self.is_entrypoint(path, content, &language);
206        let is_test = self.is_test_file(path, content, &language);
207        let is_config = self.is_config_file(path);
208        let framework_hints = self.detect_frameworks(content, &language);
209        let category = self.categorize_file(path, &entities, is_test, is_config);
210
211        FileAnalysis {
212            path: path.to_path_buf(),
213            language,
214            total_lines,
215            entities,
216            imports,
217            exports,
218            is_entrypoint,
219            is_test,
220            is_config,
221            framework_hints,
222            category,
223        }
224    }
225
226    /// Decide how to chunk a file based on analysis
227    pub fn decide_chunking(&self, analysis: &FileAnalysis) -> ChunkingDecision {
228        let strategy;
229        let reasoning;
230        let mut suggested_chunks = Vec::new();
231
232        // Very small files -> atomic
233        if analysis.total_lines < self.config.min_file_lines_to_split {
234            strategy = ChunkingStrategy::Atomic;
235            reasoning = format!(
236                "File has {} lines (< {}), keeping atomic",
237                analysis.total_lines, self.config.min_file_lines_to_split
238            );
239            suggested_chunks.push(self.create_file_chunk(analysis));
240        }
241        // Config files -> atomic
242        else if analysis.is_config {
243            strategy = ChunkingStrategy::Atomic;
244            reasoning = "Configuration file, keeping atomic".to_string();
245            suggested_chunks.push(self.create_file_chunk(analysis));
246        }
247        // Single large entity -> atomic
248        else if analysis.entities.len() == 1 {
249            strategy = ChunkingStrategy::Atomic;
250            reasoning = "Single entity file, keeping atomic".to_string();
251            suggested_chunks.push(self.create_file_chunk(analysis));
252        }
253        // Has clear entity structure -> split by entity
254        else if analysis.entities.len() > 1 && self.has_clear_structure(&analysis.entities) {
255            strategy = ChunkingStrategy::ByEntity;
256            reasoning = format!(
257                "Found {} distinct entities, splitting by entity",
258                analysis.entities.len()
259            );
260            suggested_chunks = self.create_entity_chunks(analysis);
261        }
262        // Large file with mixed content -> hierarchical
263        else if analysis.total_lines > self.config.max_chunk_lines {
264            strategy = ChunkingStrategy::Hierarchical;
265            reasoning = format!(
266                "Large file ({} lines), creating hierarchical chunks",
267                analysis.total_lines
268            );
269            // Parent chunk
270            suggested_chunks.push(self.create_file_chunk(analysis));
271            // Child chunks
272            suggested_chunks.extend(self.create_entity_chunks(analysis));
273        }
274        // Default: atomic if prefer_atomic, otherwise by entity
275        else if self.config.prefer_atomic {
276            strategy = ChunkingStrategy::Atomic;
277            reasoning = "Preferring atomic chunks".to_string();
278            suggested_chunks.push(self.create_file_chunk(analysis));
279        } else {
280            strategy = ChunkingStrategy::ByEntity;
281            reasoning = "Splitting by code entities".to_string();
282            suggested_chunks = self.create_entity_chunks(analysis);
283        }
284
285        ChunkingDecision {
286            file_path: analysis.path.clone(),
287            strategy,
288            suggested_chunks,
289            reasoning,
290        }
291    }
292
293    /// Generate atomic chunks from content
294    pub fn generate_chunks(
295        &self,
296        path: &Path,
297        content: &str,
298        decision: &ChunkingDecision,
299    ) -> Vec<AtomicChunk> {
300        let mut chunks = Vec::new();
301        let lines: Vec<&str> = content.lines().collect();
302
303        for suggested in &decision.suggested_chunks {
304            let chunk_content = if suggested.start_line == 0 && suggested.end_line >= lines.len() {
305                content.to_string()
306            } else {
307                let start = suggested.start_line.saturating_sub(1);
308                let end = suggested.end_line.min(lines.len());
309                lines[start..end].join("\n")
310            };
311
312            let content_hash = compute_hash(&chunk_content);
313            let chunk_id = format!("chunk:sha256:{}", content_hash);
314
315            let analysis = self.analyze_file(path, content);
316
317            let mut chunk = AtomicChunk::new(
318                chunk_id,
319                suggested.name.clone(),
320                analysis.language.clone(),
321                content_hash,
322                chunk_content.len(),
323            )
324            .with_alias(&suggested.alias)
325            .with_granularity(suggested.granularity)
326            .with_categories(vec![suggested.category.clone()])
327            .with_concepts(suggested.concepts.clone());
328
329            chunk.provides = suggested.provides.clone();
330            chunk.requires = suggested.requires.clone();
331            chunk.sources = vec![SourceLocation {
332                file: path.to_string_lossy().to_string(),
333                start_line: Some(suggested.start_line),
334                end_line: Some(suggested.end_line),
335                start_col: None,
336                end_col: None,
337            }];
338            chunk.metrics = ChunkMetrics {
339                loc: suggested.end_line - suggested.start_line + 1,
340                ..Default::default()
341            };
342
343            chunks.push(chunk);
344        }
345
346        chunks
347    }
348
    // ========================================================================
    // Private helper methods
    // ========================================================================
352
353    fn detect_language(&self, path: &Path) -> String {
354        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
355        match ext {
356            "rs" => "rust".to_string(),
357            "ts" | "tsx" => "typescript".to_string(),
358            "js" | "jsx" | "mjs" | "cjs" => "javascript".to_string(),
359            "py" | "pyi" => "python".to_string(),
360            "go" => "go".to_string(),
361            "c" | "h" => "c".to_string(),
362            "cpp" | "cc" | "cxx" | "hpp" | "hh" => "cpp".to_string(),
363            "java" => "java".to_string(),
364            "rb" => "ruby".to_string(),
365            "swift" => "swift".to_string(),
366            "kt" | "kts" => "kotlin".to_string(),
367            "cs" => "csharp".to_string(),
368            "php" => "php".to_string(),
369            "scala" => "scala".to_string(),
370            "zig" => "zig".to_string(),
371            "md" | "markdown" => "markdown".to_string(),
372            "json" => "json".to_string(),
373            "yaml" | "yml" => "yaml".to_string(),
374            "toml" => "toml".to_string(),
375            "html" | "htm" => "html".to_string(),
376            "css" => "css".to_string(),
377            "scss" | "sass" => "scss".to_string(),
378            "sql" => "sql".to_string(),
379            "sh" | "bash" | "zsh" => "shell".to_string(),
380            "dockerfile" => "dockerfile".to_string(),
381            _ => "unknown".to_string(),
382        }
383    }
384
385    fn extract_entities(&self, content: &str, language: &str) -> Vec<CodeEntity> {
386        let mut entities = Vec::new();
387        let lines: Vec<&str> = content.lines().collect();
388
389        match language {
390            "rust" => self.extract_rust_entities(&lines, &mut entities),
391            "typescript" | "javascript" => self.extract_ts_entities(&lines, &mut entities),
392            "python" => self.extract_python_entities(&lines, &mut entities),
393            "go" => self.extract_go_entities(&lines, &mut entities),
394            _ => {}
395        }
396
397        entities
398    }
399
400    fn extract_rust_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
401        let mut current_entity: Option<(String, EntityKind, usize, Visibility, Option<String>)> =
402            None;
403        let mut brace_depth = 0;
404        let mut doc_comment = String::new();
405
406        for (i, line) in lines.iter().enumerate() {
407            let trimmed = line.trim();
408
409            // Collect doc comments
410            if trimmed.starts_with("///") || trimmed.starts_with("//!") {
411                doc_comment.push_str(trimmed.trim_start_matches('/').trim());
412                doc_comment.push('\n');
413                continue;
414            }
415
416            // Detect entity starts
417            let (is_pub, rest) = if trimmed.starts_with("pub ") {
418                (true, &trimmed[4..])
419            } else {
420                (false, trimmed)
421            };
422
423            let visibility = if is_pub {
424                Visibility::Public
425            } else {
426                Visibility::Private
427            };
428
429            if rest.starts_with("fn ")
430                || rest.starts_with("async fn ")
431                || rest.starts_with("const fn ")
432                || rest.starts_with("unsafe fn ")
433            {
434                let is_async = rest.starts_with("async");
435                if let Some(name) = self.extract_rust_fn_name(rest) {
436                    if current_entity.is_some() {
437                        self.close_entity(current_entity.take(), i, entities);
438                    }
439                    let kind = if is_async {
440                        EntityKind::AsyncFunction
441                    } else {
442                        EntityKind::Function
443                    };
444                    let doc = if doc_comment.is_empty() {
445                        None
446                    } else {
447                        Some(doc_comment.trim().to_string())
448                    };
449                    current_entity = Some((name, kind, i + 1, visibility, doc));
450                    brace_depth = 0;
451                }
452            } else if rest.starts_with("struct ") {
453                if let Some(name) = self.extract_rust_type_name(rest, "struct ") {
454                    if current_entity.is_some() {
455                        self.close_entity(current_entity.take(), i, entities);
456                    }
457                    let doc = if doc_comment.is_empty() {
458                        None
459                    } else {
460                        Some(doc_comment.trim().to_string())
461                    };
462                    current_entity = Some((name, EntityKind::Struct, i + 1, visibility, doc));
463                    brace_depth = 0;
464                }
465            } else if rest.starts_with("enum ") {
466                if let Some(name) = self.extract_rust_type_name(rest, "enum ") {
467                    if current_entity.is_some() {
468                        self.close_entity(current_entity.take(), i, entities);
469                    }
470                    let doc = if doc_comment.is_empty() {
471                        None
472                    } else {
473                        Some(doc_comment.trim().to_string())
474                    };
475                    current_entity = Some((name, EntityKind::Enum, i + 1, visibility, doc));
476                    brace_depth = 0;
477                }
478            } else if rest.starts_with("trait ") {
479                if let Some(name) = self.extract_rust_type_name(rest, "trait ") {
480                    if current_entity.is_some() {
481                        self.close_entity(current_entity.take(), i, entities);
482                    }
483                    let doc = if doc_comment.is_empty() {
484                        None
485                    } else {
486                        Some(doc_comment.trim().to_string())
487                    };
488                    current_entity = Some((name, EntityKind::Trait, i + 1, visibility, doc));
489                    brace_depth = 0;
490                }
491            } else if rest.starts_with("impl ") || rest.starts_with("impl<") {
492                if let Some(name) = self.extract_impl_name(rest) {
493                    if current_entity.is_some() {
494                        self.close_entity(current_entity.take(), i, entities);
495                    }
496                    let doc = if doc_comment.is_empty() {
497                        None
498                    } else {
499                        Some(doc_comment.trim().to_string())
500                    };
501                    current_entity =
502                        Some((format!("impl_{}", name), EntityKind::Module, i + 1, visibility, doc));
503                    brace_depth = 0;
504                }
505            }
506
507            // Track braces for entity end
508            brace_depth += trimmed.matches('{').count() as i32;
509            brace_depth -= trimmed.matches('}').count() as i32;
510
511            if brace_depth <= 0 && current_entity.is_some() {
512                self.close_entity(current_entity.take(), i + 1, entities);
513            }
514
515            // Clear doc comment if not immediately followed by entity
516            if !trimmed.starts_with("///")
517                && !trimmed.starts_with("//!")
518                && !trimmed.starts_with("#[")
519                && !trimmed.is_empty()
520            {
521                doc_comment.clear();
522            }
523        }
524
525        // Close any remaining entity
526        if let Some(entity) = current_entity {
527            self.close_entity(Some(entity), lines.len(), entities);
528        }
529    }
530
531    fn extract_rust_fn_name(&self, line: &str) -> Option<String> {
532        let rest = line
533            .trim_start_matches("async ")
534            .trim_start_matches("const ")
535            .trim_start_matches("unsafe ")
536            .trim_start_matches("fn ");
537        let name_end = rest.find('(').or_else(|| rest.find('<'))?;
538        Some(rest[..name_end].trim().to_string())
539    }
540
541    fn extract_rust_type_name(&self, line: &str, prefix: &str) -> Option<String> {
542        let rest = line.trim_start_matches(prefix);
543        let name_end = rest
544            .find(|c: char| !c.is_alphanumeric() && c != '_')
545            .unwrap_or(rest.len());
546        if name_end > 0 {
547            Some(rest[..name_end].to_string())
548        } else {
549            None
550        }
551    }
552
553    fn extract_impl_name(&self, line: &str) -> Option<String> {
554        // Handle "impl Trait for Type" and "impl Type"
555        let rest = line.trim_start_matches("impl").trim_start_matches('<');
556        // Skip generic params
557        let rest = if let Some(idx) = rest.find('>') {
558            &rest[idx + 1..]
559        } else {
560            rest
561        };
562        let rest = rest.trim();
563        
564        if let Some(idx) = rest.find(" for ") {
565            // impl Trait for Type
566            let type_name = rest[idx + 5..].split_whitespace().next()?;
567            Some(type_name.to_string())
568        } else {
569            // impl Type
570            let type_name = rest.split_whitespace().next()?;
571            Some(type_name.to_string())
572        }
573    }
574
575    fn extract_ts_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
576        let mut current_entity: Option<(String, EntityKind, usize, Visibility, Option<String>)> =
577            None;
578        let mut brace_depth = 0;
579
580        for (i, line) in lines.iter().enumerate() {
581            let trimmed = line.trim();
582
583            // Detect entity starts
584            let is_export = trimmed.starts_with("export ");
585            let rest = if is_export {
586                &trimmed[7..]
587            } else {
588                trimmed
589            };
590
591            let visibility = if is_export {
592                Visibility::Public
593            } else {
594                Visibility::Private
595            };
596
597            if rest.starts_with("function ")
598                || rest.starts_with("async function ")
599                || (rest.starts_with("const ") && rest.contains("=>"))
600            {
601                let is_async = rest.contains("async");
602                if let Some(name) = self.extract_ts_fn_name(rest) {
603                    if current_entity.is_some() {
604                        self.close_entity(current_entity.take(), i, entities);
605                    }
606                    let kind = if is_async {
607                        EntityKind::AsyncFunction
608                    } else {
609                        EntityKind::Function
610                    };
611                    current_entity = Some((name, kind, i + 1, visibility, None));
612                    brace_depth = 0;
613                }
614            } else if rest.starts_with("class ") {
615                if let Some(name) = self.extract_ts_class_name(rest) {
616                    if current_entity.is_some() {
617                        self.close_entity(current_entity.take(), i, entities);
618                    }
619                    current_entity = Some((name, EntityKind::Class, i + 1, visibility, None));
620                    brace_depth = 0;
621                }
622            } else if rest.starts_with("interface ") {
623                if let Some(name) = self.extract_ts_interface_name(rest) {
624                    if current_entity.is_some() {
625                        self.close_entity(current_entity.take(), i, entities);
626                    }
627                    current_entity = Some((name, EntityKind::Interface, i + 1, visibility, None));
628                    brace_depth = 0;
629                }
630            } else if rest.starts_with("type ") {
631                if let Some(name) = self.extract_ts_type_name(rest) {
632                    if current_entity.is_some() {
633                        self.close_entity(current_entity.take(), i, entities);
634                    }
635                    current_entity = Some((name, EntityKind::Type, i + 1, visibility, None));
636                    brace_depth = 0;
637                }
638            } else if rest.starts_with("enum ") {
639                if let Some(name) = self.extract_ts_enum_name(rest) {
640                    if current_entity.is_some() {
641                        self.close_entity(current_entity.take(), i, entities);
642                    }
643                    current_entity = Some((name, EntityKind::Enum, i + 1, visibility, None));
644                    brace_depth = 0;
645                }
646            }
647
648            // Track braces
649            brace_depth += trimmed.matches('{').count() as i32;
650            brace_depth -= trimmed.matches('}').count() as i32;
651
652            if brace_depth <= 0 && current_entity.is_some() && trimmed.contains('}') {
653                self.close_entity(current_entity.take(), i + 1, entities);
654            }
655        }
656
657        if let Some(entity) = current_entity {
658            self.close_entity(Some(entity), lines.len(), entities);
659        }
660    }
661
662    fn extract_ts_fn_name(&self, line: &str) -> Option<String> {
663        if line.starts_with("const ") {
664            // Arrow function: const name = ...
665            let rest = line.trim_start_matches("const ");
666            let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
667            return Some(rest[..name_end].to_string());
668        }
669        let rest = line
670            .trim_start_matches("async ")
671            .trim_start_matches("function ");
672        let name_end = rest.find('(')?;
673        Some(rest[..name_end].trim().to_string())
674    }
675
676    fn extract_ts_class_name(&self, line: &str) -> Option<String> {
677        let rest = line.trim_start_matches("class ");
678        let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
679        Some(rest[..name_end].to_string())
680    }
681
682    fn extract_ts_interface_name(&self, line: &str) -> Option<String> {
683        let rest = line.trim_start_matches("interface ");
684        let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
685        Some(rest[..name_end].to_string())
686    }
687
688    fn extract_ts_type_name(&self, line: &str) -> Option<String> {
689        let rest = line.trim_start_matches("type ");
690        let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
691        Some(rest[..name_end].to_string())
692    }
693
694    fn extract_ts_enum_name(&self, line: &str) -> Option<String> {
695        let rest = line.trim_start_matches("enum ");
696        let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
697        Some(rest[..name_end].to_string())
698    }
699
700    fn extract_python_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
701        let mut current_entity: Option<(String, EntityKind, usize, usize, Visibility)> = None;
702
703        for (i, line) in lines.iter().enumerate() {
704            let trimmed = line.trim();
705            let indent = line.len() - line.trim_start().len();
706
707            // Top-level definitions only (no indentation)
708            if indent == 0 {
709                // Close previous if exists
710                if let Some((name, kind, start, _, vis)) = current_entity.take() {
711                    entities.push(CodeEntity {
712                        name,
713                        kind,
714                        start_line: start,
715                        end_line: i,
716                        visibility: vis,
717                        doc_comment: None,
718                        imports: Vec::new(),
719                        exports: Vec::new(),
720                        calls: Vec::new(),
721                        complexity: 1,
722                    });
723                }
724
725                if trimmed.starts_with("def ") || trimmed.starts_with("async def ") {
726                    let is_async = trimmed.starts_with("async");
727                    if let Some(name) = self.extract_python_fn_name(trimmed) {
728                        let visibility = if name.starts_with('_') {
729                            Visibility::Private
730                        } else {
731                            Visibility::Public
732                        };
733                        let kind = if is_async {
734                            EntityKind::AsyncFunction
735                        } else {
736                            EntityKind::Function
737                        };
738                        current_entity = Some((name, kind, i + 1, indent, visibility));
739                    }
740                } else if trimmed.starts_with("class ") {
741                    if let Some(name) = self.extract_python_class_name(trimmed) {
742                        let visibility = if name.starts_with('_') {
743                            Visibility::Private
744                        } else {
745                            Visibility::Public
746                        };
747                        current_entity = Some((name, EntityKind::Class, i + 1, indent, visibility));
748                    }
749                }
750            }
751        }
752
753        // Close final entity
754        if let Some((name, kind, start, _, vis)) = current_entity {
755            entities.push(CodeEntity {
756                name,
757                kind,
758                start_line: start,
759                end_line: lines.len(),
760                visibility: vis,
761                doc_comment: None,
762                imports: Vec::new(),
763                exports: Vec::new(),
764                calls: Vec::new(),
765                complexity: 1,
766            });
767        }
768    }
769
770    fn extract_python_fn_name(&self, line: &str) -> Option<String> {
771        let rest = line
772            .trim_start_matches("async ")
773            .trim_start_matches("def ");
774        let name_end = rest.find('(')?;
775        Some(rest[..name_end].trim().to_string())
776    }
777
778    fn extract_python_class_name(&self, line: &str) -> Option<String> {
779        let rest = line.trim_start_matches("class ");
780        let name_end = rest.find(['(', ':'])?;
781        Some(rest[..name_end].trim().to_string())
782    }
783
784    fn extract_go_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
785        let mut current_entity: Option<(String, EntityKind, usize, Visibility)> = None;
786        let mut brace_depth = 0;
787
788        for (i, line) in lines.iter().enumerate() {
789            let trimmed = line.trim();
790
791            if trimmed.starts_with("func ") {
792                if current_entity.is_some() {
793                    self.close_entity_simple(current_entity.take(), i, entities);
794                }
795                if let Some(name) = self.extract_go_fn_name(trimmed) {
796                    let visibility = if name.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
797                    {
798                        Visibility::Public
799                    } else {
800                        Visibility::Private
801                    };
802                    current_entity = Some((name, EntityKind::Function, i + 1, visibility));
803                    brace_depth = 0;
804                }
805            } else if trimmed.starts_with("type ") && trimmed.contains("struct") {
806                if current_entity.is_some() {
807                    self.close_entity_simple(current_entity.take(), i, entities);
808                }
809                if let Some(name) = self.extract_go_type_name(trimmed) {
810                    let visibility = if name.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
811                    {
812                        Visibility::Public
813                    } else {
814                        Visibility::Private
815                    };
816                    current_entity = Some((name, EntityKind::Struct, i + 1, visibility));
817                    brace_depth = 0;
818                }
819            } else if trimmed.starts_with("type ") && trimmed.contains("interface") {
820                if current_entity.is_some() {
821                    self.close_entity_simple(current_entity.take(), i, entities);
822                }
823                if let Some(name) = self.extract_go_type_name(trimmed) {
824                    let visibility = if name.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
825                    {
826                        Visibility::Public
827                    } else {
828                        Visibility::Private
829                    };
830                    current_entity = Some((name, EntityKind::Interface, i + 1, visibility));
831                    brace_depth = 0;
832                }
833            }
834
835            brace_depth += trimmed.matches('{').count() as i32;
836            brace_depth -= trimmed.matches('}').count() as i32;
837
838            if brace_depth <= 0 && current_entity.is_some() && trimmed.contains('}') {
839                self.close_entity_simple(current_entity.take(), i + 1, entities);
840            }
841        }
842
843        if let Some(entity) = current_entity {
844            self.close_entity_simple(Some(entity), lines.len(), entities);
845        }
846    }
847
848    fn extract_go_fn_name(&self, line: &str) -> Option<String> {
849        let rest = line.trim_start_matches("func ");
850        // Handle method receivers: (r *Receiver)
851        let rest = if rest.starts_with('(') {
852            if let Some(idx) = rest.find(')') {
853                &rest[idx + 1..]
854            } else {
855                rest
856            }
857        } else {
858            rest
859        };
860        let rest = rest.trim();
861        let name_end = rest.find('(')?;
862        Some(rest[..name_end].trim().to_string())
863    }
864
865    fn extract_go_type_name(&self, line: &str) -> Option<String> {
866        let rest = line.trim_start_matches("type ");
867        let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
868        Some(rest[..name_end].to_string())
869    }
870
871    fn close_entity(
872        &self,
873        entity: Option<(String, EntityKind, usize, Visibility, Option<String>)>,
874        end_line: usize,
875        entities: &mut Vec<CodeEntity>,
876    ) {
877        if let Some((name, kind, start, visibility, doc)) = entity {
878            entities.push(CodeEntity {
879                name,
880                kind,
881                start_line: start,
882                end_line,
883                visibility,
884                doc_comment: doc,
885                imports: Vec::new(),
886                exports: Vec::new(),
887                calls: Vec::new(),
888                complexity: 1,
889            });
890        }
891    }
892
893    fn close_entity_simple(
894        &self,
895        entity: Option<(String, EntityKind, usize, Visibility)>,
896        end_line: usize,
897        entities: &mut Vec<CodeEntity>,
898    ) {
899        if let Some((name, kind, start, visibility)) = entity {
900            entities.push(CodeEntity {
901                name,
902                kind,
903                start_line: start,
904                end_line,
905                visibility,
906                doc_comment: None,
907                imports: Vec::new(),
908                exports: Vec::new(),
909                calls: Vec::new(),
910                complexity: 1,
911            });
912        }
913    }
914
915    fn extract_imports(&self, content: &str, language: &str) -> Vec<String> {
916        let mut imports = Vec::new();
917
918        match language {
919            "rust" => {
920                for line in content.lines() {
921                    let trimmed = line.trim();
922                    if trimmed.starts_with("use ") {
923                        let import = trimmed.trim_start_matches("use ").trim_end_matches(';');
924                        imports.push(import.to_string());
925                    }
926                }
927            }
928            "typescript" | "javascript" => {
929                for line in content.lines() {
930                    let trimmed = line.trim();
931                    if trimmed.starts_with("import ") {
932                        imports.push(trimmed.to_string());
933                    }
934                }
935            }
936            "python" => {
937                for line in content.lines() {
938                    let trimmed = line.trim();
939                    if trimmed.starts_with("import ") || trimmed.starts_with("from ") {
940                        imports.push(trimmed.to_string());
941                    }
942                }
943            }
944            "go" => {
945                for line in content.lines() {
946                    let trimmed = line.trim();
947                    if trimmed.starts_with("import ") || trimmed.starts_with("\"") {
948                        imports.push(trimmed.to_string());
949                    }
950                }
951            }
952            _ => {}
953        }
954
955        imports
956    }
957
958    fn extract_exports(&self, _content: &str, _language: &str, entities: &[CodeEntity]) -> Vec<String> {
959        entities
960            .iter()
961            .filter(|e| e.visibility == Visibility::Public)
962            .map(|e| e.name.clone())
963            .collect()
964    }
965
966    fn is_entrypoint(&self, path: &Path, content: &str, language: &str) -> bool {
967        let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
968
969        match language {
970            "rust" => file_name == "main.rs" || content.contains("fn main()"),
971            "typescript" | "javascript" => {
972                file_name == "index.ts"
973                    || file_name == "index.tsx"
974                    || file_name == "index.js"
975                    || file_name == "main.ts"
976            }
977            "python" => {
978                file_name == "__main__.py"
979                    || file_name == "main.py"
980                    || content.contains("if __name__")
981            }
982            "go" => file_name == "main.go" || content.contains("func main()"),
983            _ => false,
984        }
985    }
986
987    fn is_test_file(&self, path: &Path, content: &str, language: &str) -> bool {
988        let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
989        let path_str = path.to_string_lossy();
990
991        match language {
992            "rust" => {
993                path_str.contains("/tests/")
994                    || content.contains("#[test]")
995                    || content.contains("#[cfg(test)]")
996            }
997            "typescript" | "javascript" => {
998                file_name.contains(".test.")
999                    || file_name.contains(".spec.")
1000                    || path_str.contains("__tests__")
1001            }
1002            "python" => {
1003                file_name.starts_with("test_")
1004                    || file_name.ends_with("_test.py")
1005                    || path_str.contains("/tests/")
1006            }
1007            "go" => file_name.ends_with("_test.go"),
1008            _ => false,
1009        }
1010    }
1011
1012    fn is_config_file(&self, path: &Path) -> bool {
1013        let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
1014        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
1015
1016        matches!(
1017            file_name,
1018            "Cargo.toml"
1019                | "package.json"
1020                | "tsconfig.json"
1021                | "pyproject.toml"
1022                | "setup.py"
1023                | "go.mod"
1024                | "CMakeLists.txt"
1025                | "Makefile"
1026                | ".eslintrc.json"
1027                | ".prettierrc"
1028                | "webpack.config.js"
1029                | "vite.config.ts"
1030                | "tailwind.config.js"
1031        ) || matches!(ext, "toml" | "yaml" | "yml" | "json")
1032            && (file_name.contains("config") || file_name.contains("settings"))
1033    }
1034
1035    fn detect_frameworks(&self, content: &str, language: &str) -> Vec<String> {
1036        let mut frameworks = Vec::new();
1037
1038        match language {
1039            "typescript" | "javascript" => {
1040                if content.contains("react") || content.contains("React") {
1041                    frameworks.push("react".to_string());
1042                }
1043                if content.contains("@angular") {
1044                    frameworks.push("angular".to_string());
1045                }
1046                if content.contains("vue") {
1047                    frameworks.push("vue".to_string());
1048                }
1049                if content.contains("next") {
1050                    frameworks.push("nextjs".to_string());
1051                }
1052                if content.contains("express") {
1053                    frameworks.push("express".to_string());
1054                }
1055            }
1056            "rust" => {
1057                if content.contains("actix") {
1058                    frameworks.push("actix".to_string());
1059                }
1060                if content.contains("tokio") {
1061                    frameworks.push("tokio".to_string());
1062                }
1063                if content.contains("axum") {
1064                    frameworks.push("axum".to_string());
1065                }
1066                if content.contains("serde") {
1067                    frameworks.push("serde".to_string());
1068                }
1069            }
1070            "python" => {
1071                if content.contains("django") {
1072                    frameworks.push("django".to_string());
1073                }
1074                if content.contains("flask") {
1075                    frameworks.push("flask".to_string());
1076                }
1077                if content.contains("fastapi") {
1078                    frameworks.push("fastapi".to_string());
1079                }
1080            }
1081            _ => {}
1082        }
1083
1084        frameworks
1085    }
1086
1087    fn categorize_file(
1088        &self,
1089        path: &Path,
1090        entities: &[CodeEntity],
1091        is_test: bool,
1092        is_config: bool,
1093    ) -> ChunkCategory {
1094        if is_test {
1095            return ChunkCategory::Test;
1096        }
1097        if is_config {
1098            return ChunkCategory::Config;
1099        }
1100
1101        let path_str = path.to_string_lossy().to_lowercase();
1102        let file_name = path
1103            .file_name()
1104            .and_then(|n| n.to_str())
1105            .unwrap_or("")
1106            .to_lowercase();
1107
1108        // Path-based categorization
1109        if path_str.contains("/api/") || path_str.contains("/routes/") {
1110            return ChunkCategory::Api;
1111        }
1112        if path_str.contains("/ui/")
1113            || path_str.contains("/components/")
1114            || path_str.contains("/views/")
1115        {
1116            return ChunkCategory::Ui;
1117        }
1118        if path_str.contains("/utils/") || path_str.contains("/helpers/") {
1119            return ChunkCategory::Utility;
1120        }
1121        if path_str.contains("/models/") || path_str.contains("/types/") {
1122            return ChunkCategory::Data;
1123        }
1124        if path_str.contains("/db/") || path_str.contains("/database/") {
1125            return ChunkCategory::Database;
1126        }
1127
1128        // Name-based categorization
1129        if file_name.contains("util") || file_name.contains("helper") {
1130            return ChunkCategory::Utility;
1131        }
1132        if file_name.contains("type") || file_name.contains("model") {
1133            return ChunkCategory::Data;
1134        }
1135
1136        // Entity-based categorization
1137        let type_count = entities
1138            .iter()
1139            .filter(|e| {
1140                matches!(
1141                    e.kind,
1142                    EntityKind::Struct
1143                        | EntityKind::Class
1144                        | EntityKind::Interface
1145                        | EntityKind::Enum
1146                        | EntityKind::Type
1147                )
1148            })
1149            .count();
1150
1151        if type_count > entities.len() / 2 {
1152            return ChunkCategory::Data;
1153        }
1154
1155        ChunkCategory::Logic
1156    }
1157
1158    fn has_clear_structure(&self, entities: &[CodeEntity]) -> bool {
1159        // Has multiple significant entities
1160        let significant = entities
1161            .iter()
1162            .filter(|e| e.end_line - e.start_line >= self.config.min_function_lines)
1163            .count();
1164
1165        significant >= 2
1166    }
1167
1168    fn create_file_chunk(&self, analysis: &FileAnalysis) -> SuggestedChunk {
1169        let file_name = analysis
1170            .path
1171            .file_stem()
1172            .and_then(|n| n.to_str())
1173            .unwrap_or("unknown");
1174
1175        let alias = self.generate_alias(file_name, &analysis.path);
1176
1177        SuggestedChunk {
1178            name: file_name.to_string(),
1179            alias,
1180            start_line: 1,
1181            end_line: analysis.total_lines,
1182            granularity: ChunkGranularity::Module,
1183            category: analysis.category.clone(),
1184            concepts: analysis.exports.clone(),
1185            requires: analysis.imports.clone(),
1186            provides: analysis.exports.clone(),
1187        }
1188    }
1189
1190    fn create_entity_chunks(&self, analysis: &FileAnalysis) -> Vec<SuggestedChunk> {
1191        let file_stem = analysis
1192            .path
1193            .file_stem()
1194            .and_then(|n| n.to_str())
1195            .unwrap_or("unknown");
1196
1197        analysis
1198            .entities
1199            .iter()
1200            .filter(|e| e.end_line - e.start_line >= self.config.min_function_lines)
1201            .map(|entity| {
1202                let granularity = match entity.kind {
1203                    EntityKind::Function | EntityKind::AsyncFunction | EntityKind::Method => {
1204                        ChunkGranularity::Function
1205                    }
1206                    EntityKind::Struct
1207                    | EntityKind::Class
1208                    | EntityKind::Trait
1209                    | EntityKind::Interface
1210                    | EntityKind::Enum => ChunkGranularity::Type,
1211                    EntityKind::Module => ChunkGranularity::Module,
1212                    _ => ChunkGranularity::Function,
1213                };
1214
1215                let alias = format!("{}/{}", file_stem, to_kebab_case(&entity.name));
1216
1217                SuggestedChunk {
1218                    name: entity.name.clone(),
1219                    alias,
1220                    start_line: entity.start_line,
1221                    end_line: entity.end_line,
1222                    granularity,
1223                    category: analysis.category.clone(),
1224                    concepts: vec![entity.name.clone()],
1225                    requires: entity.imports.clone(),
1226                    provides: if entity.visibility == Visibility::Public {
1227                        vec![entity.name.clone()]
1228                    } else {
1229                        Vec::new()
1230                    },
1231                }
1232            })
1233            .collect()
1234    }
1235
1236    fn generate_alias(&self, name: &str, path: &Path) -> String {
1237        let parent = path
1238            .parent()
1239            .and_then(|p| p.file_name())
1240            .and_then(|n| n.to_str())
1241            .unwrap_or("");
1242
1243        if parent.is_empty() || parent == "src" {
1244            to_kebab_case(name)
1245        } else {
1246            format!("{}/{}", to_kebab_case(parent), to_kebab_case(name))
1247        }
1248    }
1249}
1250
/// Convert string to kebab-case.
///
/// - Every character is lower-cased (Unicode-aware).
/// - An uppercase character that is not the first character starts a new
///   word and is preceded by `-`.
/// - `_`, space and `-` are all word separators; any run of separators
///   (including one followed by an uppercase letter) collapses to a single
///   `-`.
///
/// Leading or trailing separators are kept as a single `-`.
fn to_kebab_case(s: &str) -> String {
    let mut result = String::with_capacity(s.len() + 4);

    for (i, c) in s.chars().enumerate() {
        if matches!(c, '_' | ' ' | '-') {
            // Never emit two dashes in a row.
            if !result.ends_with('-') {
                result.push('-');
            }
            continue;
        }

        if c.is_uppercase() && i > 0 && !result.ends_with('-') {
            result.push('-');
        }
        // `to_lowercase` may yield more than one char for some code points.
        result.extend(c.to_lowercase());
    }

    result
}
1264
1265/// Compute SHA256 hash
1266fn compute_hash(content: &str) -> String {
1267    let mut hasher = Sha256::new();
1268    hasher.update(content.as_bytes());
1269    let result = hasher.finalize();
1270    hex::encode(result)
1271}
1272
#[cfg(test)]
mod tests {
    use super::*;

    /// File extensions map onto the expected language identifiers.
    #[test]
    fn test_language_detection() {
        let chunker = SmartChunker::default();
        let expectations = [
            ("test.rs", "rust"),
            ("test.ts", "typescript"),
            ("test.py", "python"),
        ];

        for (file, language) in expectations {
            assert_eq!(chunker.detect_language(Path::new(file)), language);
        }
    }

    /// Camel case, snake case and already-kebab input all end up kebab-cased.
    #[test]
    fn test_kebab_case() {
        let cases = [
            ("HelloWorld", "hello-world"),
            ("hello_world", "hello-world"),
            ("my-file", "my-file"),
        ];

        for (input, expected) in cases {
            assert_eq!(to_kebab_case(input), expected);
        }
    }

    /// A small Rust snippet with a function, a struct and an impl block
    /// yields at least two entities.
    #[test]
    fn test_rust_entity_extraction() {
        let source = r#"
pub fn hello_world() {
    println!("Hello!");
}

struct MyStruct {
    field: i32,
}

impl MyStruct {
    fn new() -> Self {
        Self { field: 0 }
    }
}
"#;
        let chunker = SmartChunker::default();
        let source_lines: Vec<&str> = source.lines().collect();

        let mut found = Vec::new();
        chunker.extract_rust_entities(&source_lines, &mut found);

        assert!(found.len() >= 2);
    }
}