1use crate::atomic::{
11 AtomicChunk, ChunkCategory, ChunkGranularity,
12 ChunkMetrics, SourceLocation,
13};
14use serde::{Deserialize, Serialize};
15use sha2::{Digest, Sha256};
16use std::path::{Path, PathBuf};
17
/// Tunable thresholds and switches that control how `SmartChunker` splits a file.
///
/// Every field has a serde default (or is an `Option`), so a partial
/// configuration document can be deserialized.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SmartChunkerConfig {
    /// Minimum span (`end_line - start_line`) an entity must cover to count as
    /// significant and to be emitted as its own chunk. Defaults to 10.
    #[serde(default = "default_min_function_lines")]
    pub min_function_lines: usize,

    /// Files with fewer lines than this are kept as a single atomic chunk.
    /// Defaults to 50.
    #[serde(default = "default_min_file_lines_to_split")]
    pub min_file_lines_to_split: usize,

    /// Files with more lines than this that were not handled by an earlier rule
    /// are chunked hierarchically. Defaults to 500.
    #[serde(default = "default_max_chunk_lines")]
    pub max_chunk_lines: usize,

    /// Defaults to `true`.
    /// NOTE(review): not read anywhere in this file — confirm where it is consumed.
    #[serde(default = "default_true")]
    pub extract_utilities: bool,

    /// Defaults to `true`.
    /// NOTE(review): not read anywhere in this file — confirm where it is consumed.
    #[serde(default = "default_true")]
    pub extract_types: bool,

    /// Defaults to `true`.
    /// NOTE(review): not read anywhere in this file — confirm where it is consumed.
    #[serde(default = "default_true")]
    pub group_related: bool,

    /// When set, files that match no other rule are kept atomic instead of being
    /// split by entity. Defaults to `false`.
    #[serde(default)]
    pub prefer_atomic: bool,

    /// Optional namespace; omitted from serialized output when `None`.
    /// NOTE(review): not read anywhere in this file — confirm its meaning with callers.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub namespace: Option<String>,
}
53
/// Serde fallback for `SmartChunkerConfig::min_function_lines`.
fn default_min_function_lines() -> usize {
    const MIN_FUNCTION_LINES: usize = 10;
    MIN_FUNCTION_LINES
}
57
/// Serde fallback for `SmartChunkerConfig::min_file_lines_to_split`.
fn default_min_file_lines_to_split() -> usize {
    const MIN_FILE_LINES_TO_SPLIT: usize = 50;
    MIN_FILE_LINES_TO_SPLIT
}
61
/// Serde fallback for `SmartChunkerConfig::max_chunk_lines`.
fn default_max_chunk_lines() -> usize {
    const MAX_CHUNK_LINES: usize = 500;
    MAX_CHUNK_LINES
}
65
/// Serde fallback for boolean switches that are enabled unless configured otherwise.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
69
70impl Default for SmartChunkerConfig {
71 fn default() -> Self {
72 Self {
73 min_function_lines: default_min_function_lines(),
74 min_file_lines_to_split: default_min_file_lines_to_split(),
75 max_chunk_lines: default_max_chunk_lines(),
76 extract_utilities: true,
77 extract_types: true,
78 group_related: true,
79 prefer_atomic: false,
80 namespace: None,
81 }
82 }
83}
84
/// A named definition (function, type, impl block, ...) located inside a source file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CodeEntity {
    /// Identifier of the entity as it was parsed from its declaration line.
    pub name: String,
    /// What sort of definition this is.
    pub kind: EntityKind,
    /// 1-based line on which the declaration starts.
    pub start_line: usize,
    /// 1-based line on which the entity ends (inclusive).
    pub end_line: usize,
    /// Visibility inferred from the declaration.
    pub visibility: Visibility,
    /// Documentation attached to the entity, when the extractor collected any.
    pub doc_comment: Option<String>,
    /// Imports attributed to this entity; the extractors in this file leave it empty.
    pub imports: Vec<String>,
    /// Exports attributed to this entity; the extractors in this file leave it empty.
    pub exports: Vec<String>,
    /// Calls made by this entity; the extractors in this file leave it empty.
    pub calls: Vec<String>,
    /// Complexity score; the extractors in this file always set it to 1.
    pub complexity: u32,
}
99
/// The kind of definition a [`CodeEntity`] represents.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum EntityKind {
    /// A free function.
    Function,
    /// A function declared `async`.
    AsyncFunction,
    /// A method. NOTE(review): not produced by the extractors in this file.
    Method,
    /// A struct definition.
    Struct,
    /// A class definition.
    Class,
    /// A trait definition.
    Trait,
    /// An interface definition.
    Interface,
    /// An enum definition.
    Enum,
    /// A constant. NOTE(review): not produced by the extractors in this file.
    Constant,
    /// A module-like grouping; the Rust extractor uses it for `impl` blocks.
    Module,
    /// A type alias.
    Type,
    /// A test. NOTE(review): not produced by the extractors in this file.
    Test,
    /// A macro. NOTE(review): not produced by the extractors in this file.
    Macro,
}
117
/// How widely a [`CodeEntity`] is visible.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Visibility {
    /// Exported; entities with this visibility are listed as file exports.
    Public,
    /// Not exported.
    Private,
    /// Presumably visible within a restricted scope (crate/package) — TODO confirm
    /// the intended meaning with the consumers of this type.
    Internal,
}
125
/// Everything the chunker learned about a single file; produced by
/// `SmartChunker::analyze_file` and consumed by `SmartChunker::decide_chunking`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileAnalysis {
    /// Path of the analysed file.
    pub path: PathBuf,
    /// Language identifier derived from the file extension (e.g. `"rust"`), or `"unknown"`.
    pub language: String,
    /// Number of lines in the file content.
    pub total_lines: usize,
    /// Definitions found in the file.
    pub entities: Vec<CodeEntity>,
    /// Import statements found in the file.
    pub imports: Vec<String>,
    /// Names of the entities with public visibility.
    pub exports: Vec<String>,
    /// Whether the file looks like a program entry point.
    pub is_entrypoint: bool,
    /// Whether the file looks like a test file.
    pub is_test: bool,
    /// Whether the file looks like a configuration file.
    pub is_config: bool,
    /// Names of frameworks whose markers appear in the content.
    pub framework_hints: Vec<String>,
    /// Category assigned to the file as a whole.
    pub category: ChunkCategory,
}
141
/// The outcome of `SmartChunker::decide_chunking` for one file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingDecision {
    /// Path of the file the decision applies to.
    pub file_path: PathBuf,
    /// The strategy that was selected.
    pub strategy: ChunkingStrategy,
    /// The chunks that should be generated under that strategy.
    pub suggested_chunks: Vec<SuggestedChunk>,
    /// Human-readable explanation of why the strategy was chosen.
    pub reasoning: String,
}
150
/// How a file is divided into chunks.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChunkingStrategy {
    /// The whole file becomes a single chunk.
    Atomic,
    /// One chunk per sufficiently large entity.
    ByEntity,
    /// NOTE(review): never selected by the decision logic in this file — confirm intended use.
    BySections,
    /// A whole-file chunk plus one chunk per sufficiently large entity.
    Hierarchical,
    /// NOTE(review): never selected by the decision logic in this file — confirm intended use.
    Skip,
}
165
/// A proposed chunk: a line range of a file plus the metadata to attach to it.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SuggestedChunk {
    /// Chunk name (the file stem or the entity name).
    pub name: String,
    /// Kebab-cased alias, optionally prefixed with a directory or file stem.
    pub alias: String,
    /// 1-based first line of the chunk.
    pub start_line: usize,
    /// 1-based last line of the chunk (inclusive).
    pub end_line: usize,
    /// Granularity level of the chunk.
    pub granularity: ChunkGranularity,
    /// Category inherited from the file analysis.
    pub category: ChunkCategory,
    /// Concept labels attached to the chunk.
    pub concepts: Vec<String>,
    /// What the chunk depends on.
    pub requires: Vec<String>,
    /// What the chunk makes available to others.
    pub provides: Vec<String>,
}
179
180pub struct SmartChunker {
182 config: SmartChunkerConfig,
183}
184
185impl SmartChunker {
186 pub fn new(config: SmartChunkerConfig) -> Self {
188 Self { config }
189 }
190
191 pub fn default() -> Self {
193 Self::new(SmartChunkerConfig::default())
194 }
195
196 pub fn analyze_file(&self, path: &Path, content: &str) -> FileAnalysis {
198 let language = self.detect_language(path);
199 let lines: Vec<&str> = content.lines().collect();
200 let total_lines = lines.len();
201
202 let entities = self.extract_entities(content, &language);
203 let imports = self.extract_imports(content, &language);
204 let exports = self.extract_exports(content, &language, &entities);
205 let is_entrypoint = self.is_entrypoint(path, content, &language);
206 let is_test = self.is_test_file(path, content, &language);
207 let is_config = self.is_config_file(path);
208 let framework_hints = self.detect_frameworks(content, &language);
209 let category = self.categorize_file(path, &entities, is_test, is_config);
210
211 FileAnalysis {
212 path: path.to_path_buf(),
213 language,
214 total_lines,
215 entities,
216 imports,
217 exports,
218 is_entrypoint,
219 is_test,
220 is_config,
221 framework_hints,
222 category,
223 }
224 }
225
226 pub fn decide_chunking(&self, analysis: &FileAnalysis) -> ChunkingDecision {
228 let strategy;
229 let reasoning;
230 let mut suggested_chunks = Vec::new();
231
232 if analysis.total_lines < self.config.min_file_lines_to_split {
234 strategy = ChunkingStrategy::Atomic;
235 reasoning = format!(
236 "File has {} lines (< {}), keeping atomic",
237 analysis.total_lines, self.config.min_file_lines_to_split
238 );
239 suggested_chunks.push(self.create_file_chunk(analysis));
240 }
241 else if analysis.is_config {
243 strategy = ChunkingStrategy::Atomic;
244 reasoning = "Configuration file, keeping atomic".to_string();
245 suggested_chunks.push(self.create_file_chunk(analysis));
246 }
247 else if analysis.entities.len() == 1 {
249 strategy = ChunkingStrategy::Atomic;
250 reasoning = "Single entity file, keeping atomic".to_string();
251 suggested_chunks.push(self.create_file_chunk(analysis));
252 }
253 else if analysis.entities.len() > 1 && self.has_clear_structure(&analysis.entities) {
255 strategy = ChunkingStrategy::ByEntity;
256 reasoning = format!(
257 "Found {} distinct entities, splitting by entity",
258 analysis.entities.len()
259 );
260 suggested_chunks = self.create_entity_chunks(analysis);
261 }
262 else if analysis.total_lines > self.config.max_chunk_lines {
264 strategy = ChunkingStrategy::Hierarchical;
265 reasoning = format!(
266 "Large file ({} lines), creating hierarchical chunks",
267 analysis.total_lines
268 );
269 suggested_chunks.push(self.create_file_chunk(analysis));
271 suggested_chunks.extend(self.create_entity_chunks(analysis));
273 }
274 else if self.config.prefer_atomic {
276 strategy = ChunkingStrategy::Atomic;
277 reasoning = "Preferring atomic chunks".to_string();
278 suggested_chunks.push(self.create_file_chunk(analysis));
279 } else {
280 strategy = ChunkingStrategy::ByEntity;
281 reasoning = "Splitting by code entities".to_string();
282 suggested_chunks = self.create_entity_chunks(analysis);
283 }
284
285 ChunkingDecision {
286 file_path: analysis.path.clone(),
287 strategy,
288 suggested_chunks,
289 reasoning,
290 }
291 }
292
293 pub fn generate_chunks(
295 &self,
296 path: &Path,
297 content: &str,
298 decision: &ChunkingDecision,
299 ) -> Vec<AtomicChunk> {
300 let mut chunks = Vec::new();
301 let lines: Vec<&str> = content.lines().collect();
302
303 for suggested in &decision.suggested_chunks {
304 let chunk_content = if suggested.start_line == 0 && suggested.end_line >= lines.len() {
305 content.to_string()
306 } else {
307 let start = suggested.start_line.saturating_sub(1);
308 let end = suggested.end_line.min(lines.len());
309 lines[start..end].join("\n")
310 };
311
312 let content_hash = compute_hash(&chunk_content);
313 let chunk_id = format!("chunk:sha256:{}", content_hash);
314
315 let analysis = self.analyze_file(path, content);
316
317 let mut chunk = AtomicChunk::new(
318 chunk_id,
319 suggested.name.clone(),
320 analysis.language.clone(),
321 content_hash,
322 chunk_content.len(),
323 )
324 .with_alias(&suggested.alias)
325 .with_granularity(suggested.granularity)
326 .with_categories(vec![suggested.category.clone()])
327 .with_concepts(suggested.concepts.clone());
328
329 chunk.provides = suggested.provides.clone();
330 chunk.requires = suggested.requires.clone();
331 chunk.sources = vec![SourceLocation {
332 file: path.to_string_lossy().to_string(),
333 start_line: Some(suggested.start_line),
334 end_line: Some(suggested.end_line),
335 start_col: None,
336 end_col: None,
337 }];
338 chunk.metrics = ChunkMetrics {
339 loc: suggested.end_line - suggested.start_line + 1,
340 ..Default::default()
341 };
342
343 chunks.push(chunk);
344 }
345
346 chunks
347 }
348
349 fn detect_language(&self, path: &Path) -> String {
354 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
355 match ext {
356 "rs" => "rust".to_string(),
357 "ts" | "tsx" => "typescript".to_string(),
358 "js" | "jsx" | "mjs" | "cjs" => "javascript".to_string(),
359 "py" | "pyi" => "python".to_string(),
360 "go" => "go".to_string(),
361 "c" | "h" => "c".to_string(),
362 "cpp" | "cc" | "cxx" | "hpp" | "hh" => "cpp".to_string(),
363 "java" => "java".to_string(),
364 "rb" => "ruby".to_string(),
365 "swift" => "swift".to_string(),
366 "kt" | "kts" => "kotlin".to_string(),
367 "cs" => "csharp".to_string(),
368 "php" => "php".to_string(),
369 "scala" => "scala".to_string(),
370 "zig" => "zig".to_string(),
371 "md" | "markdown" => "markdown".to_string(),
372 "json" => "json".to_string(),
373 "yaml" | "yml" => "yaml".to_string(),
374 "toml" => "toml".to_string(),
375 "html" | "htm" => "html".to_string(),
376 "css" => "css".to_string(),
377 "scss" | "sass" => "scss".to_string(),
378 "sql" => "sql".to_string(),
379 "sh" | "bash" | "zsh" => "shell".to_string(),
380 "dockerfile" => "dockerfile".to_string(),
381 _ => "unknown".to_string(),
382 }
383 }
384
385 fn extract_entities(&self, content: &str, language: &str) -> Vec<CodeEntity> {
386 let mut entities = Vec::new();
387 let lines: Vec<&str> = content.lines().collect();
388
389 match language {
390 "rust" => self.extract_rust_entities(&lines, &mut entities),
391 "typescript" | "javascript" => self.extract_ts_entities(&lines, &mut entities),
392 "python" => self.extract_python_entities(&lines, &mut entities),
393 "go" => self.extract_go_entities(&lines, &mut entities),
394 _ => {}
395 }
396
397 entities
398 }
399
400 fn extract_rust_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
401 let mut current_entity: Option<(String, EntityKind, usize, Visibility, Option<String>)> =
402 None;
403 let mut brace_depth = 0;
404 let mut doc_comment = String::new();
405
406 for (i, line) in lines.iter().enumerate() {
407 let trimmed = line.trim();
408
409 if trimmed.starts_with("///") || trimmed.starts_with("//!") {
411 doc_comment.push_str(trimmed.trim_start_matches('/').trim());
412 doc_comment.push('\n');
413 continue;
414 }
415
416 let (is_pub, rest) = if trimmed.starts_with("pub ") {
418 (true, &trimmed[4..])
419 } else {
420 (false, trimmed)
421 };
422
423 let visibility = if is_pub {
424 Visibility::Public
425 } else {
426 Visibility::Private
427 };
428
429 if rest.starts_with("fn ")
430 || rest.starts_with("async fn ")
431 || rest.starts_with("const fn ")
432 || rest.starts_with("unsafe fn ")
433 {
434 let is_async = rest.starts_with("async");
435 if let Some(name) = self.extract_rust_fn_name(rest) {
436 if current_entity.is_some() {
437 self.close_entity(current_entity.take(), i, entities);
438 }
439 let kind = if is_async {
440 EntityKind::AsyncFunction
441 } else {
442 EntityKind::Function
443 };
444 let doc = if doc_comment.is_empty() {
445 None
446 } else {
447 Some(doc_comment.trim().to_string())
448 };
449 current_entity = Some((name, kind, i + 1, visibility, doc));
450 brace_depth = 0;
451 }
452 } else if rest.starts_with("struct ") {
453 if let Some(name) = self.extract_rust_type_name(rest, "struct ") {
454 if current_entity.is_some() {
455 self.close_entity(current_entity.take(), i, entities);
456 }
457 let doc = if doc_comment.is_empty() {
458 None
459 } else {
460 Some(doc_comment.trim().to_string())
461 };
462 current_entity = Some((name, EntityKind::Struct, i + 1, visibility, doc));
463 brace_depth = 0;
464 }
465 } else if rest.starts_with("enum ") {
466 if let Some(name) = self.extract_rust_type_name(rest, "enum ") {
467 if current_entity.is_some() {
468 self.close_entity(current_entity.take(), i, entities);
469 }
470 let doc = if doc_comment.is_empty() {
471 None
472 } else {
473 Some(doc_comment.trim().to_string())
474 };
475 current_entity = Some((name, EntityKind::Enum, i + 1, visibility, doc));
476 brace_depth = 0;
477 }
478 } else if rest.starts_with("trait ") {
479 if let Some(name) = self.extract_rust_type_name(rest, "trait ") {
480 if current_entity.is_some() {
481 self.close_entity(current_entity.take(), i, entities);
482 }
483 let doc = if doc_comment.is_empty() {
484 None
485 } else {
486 Some(doc_comment.trim().to_string())
487 };
488 current_entity = Some((name, EntityKind::Trait, i + 1, visibility, doc));
489 brace_depth = 0;
490 }
491 } else if rest.starts_with("impl ") || rest.starts_with("impl<") {
492 if let Some(name) = self.extract_impl_name(rest) {
493 if current_entity.is_some() {
494 self.close_entity(current_entity.take(), i, entities);
495 }
496 let doc = if doc_comment.is_empty() {
497 None
498 } else {
499 Some(doc_comment.trim().to_string())
500 };
501 current_entity =
502 Some((format!("impl_{}", name), EntityKind::Module, i + 1, visibility, doc));
503 brace_depth = 0;
504 }
505 }
506
507 brace_depth += trimmed.matches('{').count() as i32;
509 brace_depth -= trimmed.matches('}').count() as i32;
510
511 if brace_depth <= 0 && current_entity.is_some() {
512 self.close_entity(current_entity.take(), i + 1, entities);
513 }
514
515 if !trimmed.starts_with("///")
517 && !trimmed.starts_with("//!")
518 && !trimmed.starts_with("#[")
519 && !trimmed.is_empty()
520 {
521 doc_comment.clear();
522 }
523 }
524
525 if let Some(entity) = current_entity {
527 self.close_entity(Some(entity), lines.len(), entities);
528 }
529 }
530
531 fn extract_rust_fn_name(&self, line: &str) -> Option<String> {
532 let rest = line
533 .trim_start_matches("async ")
534 .trim_start_matches("const ")
535 .trim_start_matches("unsafe ")
536 .trim_start_matches("fn ");
537 let name_end = rest.find('(').or_else(|| rest.find('<'))?;
538 Some(rest[..name_end].trim().to_string())
539 }
540
541 fn extract_rust_type_name(&self, line: &str, prefix: &str) -> Option<String> {
542 let rest = line.trim_start_matches(prefix);
543 let name_end = rest
544 .find(|c: char| !c.is_alphanumeric() && c != '_')
545 .unwrap_or(rest.len());
546 if name_end > 0 {
547 Some(rest[..name_end].to_string())
548 } else {
549 None
550 }
551 }
552
553 fn extract_impl_name(&self, line: &str) -> Option<String> {
554 let rest = line.trim_start_matches("impl").trim_start_matches('<');
556 let rest = if let Some(idx) = rest.find('>') {
558 &rest[idx + 1..]
559 } else {
560 rest
561 };
562 let rest = rest.trim();
563
564 if let Some(idx) = rest.find(" for ") {
565 let type_name = rest[idx + 5..].split_whitespace().next()?;
567 Some(type_name.to_string())
568 } else {
569 let type_name = rest.split_whitespace().next()?;
571 Some(type_name.to_string())
572 }
573 }
574
575 fn extract_ts_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
576 let mut current_entity: Option<(String, EntityKind, usize, Visibility, Option<String>)> =
577 None;
578 let mut brace_depth = 0;
579
580 for (i, line) in lines.iter().enumerate() {
581 let trimmed = line.trim();
582
583 let is_export = trimmed.starts_with("export ");
585 let rest = if is_export {
586 &trimmed[7..]
587 } else {
588 trimmed
589 };
590
591 let visibility = if is_export {
592 Visibility::Public
593 } else {
594 Visibility::Private
595 };
596
597 if rest.starts_with("function ")
598 || rest.starts_with("async function ")
599 || (rest.starts_with("const ") && rest.contains("=>"))
600 {
601 let is_async = rest.contains("async");
602 if let Some(name) = self.extract_ts_fn_name(rest) {
603 if current_entity.is_some() {
604 self.close_entity(current_entity.take(), i, entities);
605 }
606 let kind = if is_async {
607 EntityKind::AsyncFunction
608 } else {
609 EntityKind::Function
610 };
611 current_entity = Some((name, kind, i + 1, visibility, None));
612 brace_depth = 0;
613 }
614 } else if rest.starts_with("class ") {
615 if let Some(name) = self.extract_ts_class_name(rest) {
616 if current_entity.is_some() {
617 self.close_entity(current_entity.take(), i, entities);
618 }
619 current_entity = Some((name, EntityKind::Class, i + 1, visibility, None));
620 brace_depth = 0;
621 }
622 } else if rest.starts_with("interface ") {
623 if let Some(name) = self.extract_ts_interface_name(rest) {
624 if current_entity.is_some() {
625 self.close_entity(current_entity.take(), i, entities);
626 }
627 current_entity = Some((name, EntityKind::Interface, i + 1, visibility, None));
628 brace_depth = 0;
629 }
630 } else if rest.starts_with("type ") {
631 if let Some(name) = self.extract_ts_type_name(rest) {
632 if current_entity.is_some() {
633 self.close_entity(current_entity.take(), i, entities);
634 }
635 current_entity = Some((name, EntityKind::Type, i + 1, visibility, None));
636 brace_depth = 0;
637 }
638 } else if rest.starts_with("enum ") {
639 if let Some(name) = self.extract_ts_enum_name(rest) {
640 if current_entity.is_some() {
641 self.close_entity(current_entity.take(), i, entities);
642 }
643 current_entity = Some((name, EntityKind::Enum, i + 1, visibility, None));
644 brace_depth = 0;
645 }
646 }
647
648 brace_depth += trimmed.matches('{').count() as i32;
650 brace_depth -= trimmed.matches('}').count() as i32;
651
652 if brace_depth <= 0 && current_entity.is_some() && trimmed.contains('}') {
653 self.close_entity(current_entity.take(), i + 1, entities);
654 }
655 }
656
657 if let Some(entity) = current_entity {
658 self.close_entity(Some(entity), lines.len(), entities);
659 }
660 }
661
662 fn extract_ts_fn_name(&self, line: &str) -> Option<String> {
663 if line.starts_with("const ") {
664 let rest = line.trim_start_matches("const ");
666 let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
667 return Some(rest[..name_end].to_string());
668 }
669 let rest = line
670 .trim_start_matches("async ")
671 .trim_start_matches("function ");
672 let name_end = rest.find('(')?;
673 Some(rest[..name_end].trim().to_string())
674 }
675
676 fn extract_ts_class_name(&self, line: &str) -> Option<String> {
677 let rest = line.trim_start_matches("class ");
678 let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
679 Some(rest[..name_end].to_string())
680 }
681
682 fn extract_ts_interface_name(&self, line: &str) -> Option<String> {
683 let rest = line.trim_start_matches("interface ");
684 let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
685 Some(rest[..name_end].to_string())
686 }
687
688 fn extract_ts_type_name(&self, line: &str) -> Option<String> {
689 let rest = line.trim_start_matches("type ");
690 let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
691 Some(rest[..name_end].to_string())
692 }
693
694 fn extract_ts_enum_name(&self, line: &str) -> Option<String> {
695 let rest = line.trim_start_matches("enum ");
696 let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
697 Some(rest[..name_end].to_string())
698 }
699
700 fn extract_python_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
701 let mut current_entity: Option<(String, EntityKind, usize, usize, Visibility)> = None;
702
703 for (i, line) in lines.iter().enumerate() {
704 let trimmed = line.trim();
705 let indent = line.len() - line.trim_start().len();
706
707 if indent == 0 {
709 if let Some((name, kind, start, _, vis)) = current_entity.take() {
711 entities.push(CodeEntity {
712 name,
713 kind,
714 start_line: start,
715 end_line: i,
716 visibility: vis,
717 doc_comment: None,
718 imports: Vec::new(),
719 exports: Vec::new(),
720 calls: Vec::new(),
721 complexity: 1,
722 });
723 }
724
725 if trimmed.starts_with("def ") || trimmed.starts_with("async def ") {
726 let is_async = trimmed.starts_with("async");
727 if let Some(name) = self.extract_python_fn_name(trimmed) {
728 let visibility = if name.starts_with('_') {
729 Visibility::Private
730 } else {
731 Visibility::Public
732 };
733 let kind = if is_async {
734 EntityKind::AsyncFunction
735 } else {
736 EntityKind::Function
737 };
738 current_entity = Some((name, kind, i + 1, indent, visibility));
739 }
740 } else if trimmed.starts_with("class ") {
741 if let Some(name) = self.extract_python_class_name(trimmed) {
742 let visibility = if name.starts_with('_') {
743 Visibility::Private
744 } else {
745 Visibility::Public
746 };
747 current_entity = Some((name, EntityKind::Class, i + 1, indent, visibility));
748 }
749 }
750 }
751 }
752
753 if let Some((name, kind, start, _, vis)) = current_entity {
755 entities.push(CodeEntity {
756 name,
757 kind,
758 start_line: start,
759 end_line: lines.len(),
760 visibility: vis,
761 doc_comment: None,
762 imports: Vec::new(),
763 exports: Vec::new(),
764 calls: Vec::new(),
765 complexity: 1,
766 });
767 }
768 }
769
770 fn extract_python_fn_name(&self, line: &str) -> Option<String> {
771 let rest = line
772 .trim_start_matches("async ")
773 .trim_start_matches("def ");
774 let name_end = rest.find('(')?;
775 Some(rest[..name_end].trim().to_string())
776 }
777
778 fn extract_python_class_name(&self, line: &str) -> Option<String> {
779 let rest = line.trim_start_matches("class ");
780 let name_end = rest.find(['(', ':'])?;
781 Some(rest[..name_end].trim().to_string())
782 }
783
784 fn extract_go_entities(&self, lines: &[&str], entities: &mut Vec<CodeEntity>) {
785 let mut current_entity: Option<(String, EntityKind, usize, Visibility)> = None;
786 let mut brace_depth = 0;
787
788 for (i, line) in lines.iter().enumerate() {
789 let trimmed = line.trim();
790
791 if trimmed.starts_with("func ") {
792 if current_entity.is_some() {
793 self.close_entity_simple(current_entity.take(), i, entities);
794 }
795 if let Some(name) = self.extract_go_fn_name(trimmed) {
796 let visibility = if name.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
797 {
798 Visibility::Public
799 } else {
800 Visibility::Private
801 };
802 current_entity = Some((name, EntityKind::Function, i + 1, visibility));
803 brace_depth = 0;
804 }
805 } else if trimmed.starts_with("type ") && trimmed.contains("struct") {
806 if current_entity.is_some() {
807 self.close_entity_simple(current_entity.take(), i, entities);
808 }
809 if let Some(name) = self.extract_go_type_name(trimmed) {
810 let visibility = if name.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
811 {
812 Visibility::Public
813 } else {
814 Visibility::Private
815 };
816 current_entity = Some((name, EntityKind::Struct, i + 1, visibility));
817 brace_depth = 0;
818 }
819 } else if trimmed.starts_with("type ") && trimmed.contains("interface") {
820 if current_entity.is_some() {
821 self.close_entity_simple(current_entity.take(), i, entities);
822 }
823 if let Some(name) = self.extract_go_type_name(trimmed) {
824 let visibility = if name.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
825 {
826 Visibility::Public
827 } else {
828 Visibility::Private
829 };
830 current_entity = Some((name, EntityKind::Interface, i + 1, visibility));
831 brace_depth = 0;
832 }
833 }
834
835 brace_depth += trimmed.matches('{').count() as i32;
836 brace_depth -= trimmed.matches('}').count() as i32;
837
838 if brace_depth <= 0 && current_entity.is_some() && trimmed.contains('}') {
839 self.close_entity_simple(current_entity.take(), i + 1, entities);
840 }
841 }
842
843 if let Some(entity) = current_entity {
844 self.close_entity_simple(Some(entity), lines.len(), entities);
845 }
846 }
847
848 fn extract_go_fn_name(&self, line: &str) -> Option<String> {
849 let rest = line.trim_start_matches("func ");
850 let rest = if rest.starts_with('(') {
852 if let Some(idx) = rest.find(')') {
853 &rest[idx + 1..]
854 } else {
855 rest
856 }
857 } else {
858 rest
859 };
860 let rest = rest.trim();
861 let name_end = rest.find('(')?;
862 Some(rest[..name_end].trim().to_string())
863 }
864
865 fn extract_go_type_name(&self, line: &str) -> Option<String> {
866 let rest = line.trim_start_matches("type ");
867 let name_end = rest.find(|c: char| !c.is_alphanumeric() && c != '_')?;
868 Some(rest[..name_end].to_string())
869 }
870
871 fn close_entity(
872 &self,
873 entity: Option<(String, EntityKind, usize, Visibility, Option<String>)>,
874 end_line: usize,
875 entities: &mut Vec<CodeEntity>,
876 ) {
877 if let Some((name, kind, start, visibility, doc)) = entity {
878 entities.push(CodeEntity {
879 name,
880 kind,
881 start_line: start,
882 end_line,
883 visibility,
884 doc_comment: doc,
885 imports: Vec::new(),
886 exports: Vec::new(),
887 calls: Vec::new(),
888 complexity: 1,
889 });
890 }
891 }
892
893 fn close_entity_simple(
894 &self,
895 entity: Option<(String, EntityKind, usize, Visibility)>,
896 end_line: usize,
897 entities: &mut Vec<CodeEntity>,
898 ) {
899 if let Some((name, kind, start, visibility)) = entity {
900 entities.push(CodeEntity {
901 name,
902 kind,
903 start_line: start,
904 end_line,
905 visibility,
906 doc_comment: None,
907 imports: Vec::new(),
908 exports: Vec::new(),
909 calls: Vec::new(),
910 complexity: 1,
911 });
912 }
913 }
914
915 fn extract_imports(&self, content: &str, language: &str) -> Vec<String> {
916 let mut imports = Vec::new();
917
918 match language {
919 "rust" => {
920 for line in content.lines() {
921 let trimmed = line.trim();
922 if trimmed.starts_with("use ") {
923 let import = trimmed.trim_start_matches("use ").trim_end_matches(';');
924 imports.push(import.to_string());
925 }
926 }
927 }
928 "typescript" | "javascript" => {
929 for line in content.lines() {
930 let trimmed = line.trim();
931 if trimmed.starts_with("import ") {
932 imports.push(trimmed.to_string());
933 }
934 }
935 }
936 "python" => {
937 for line in content.lines() {
938 let trimmed = line.trim();
939 if trimmed.starts_with("import ") || trimmed.starts_with("from ") {
940 imports.push(trimmed.to_string());
941 }
942 }
943 }
944 "go" => {
945 for line in content.lines() {
946 let trimmed = line.trim();
947 if trimmed.starts_with("import ") || trimmed.starts_with("\"") {
948 imports.push(trimmed.to_string());
949 }
950 }
951 }
952 _ => {}
953 }
954
955 imports
956 }
957
958 fn extract_exports(&self, _content: &str, _language: &str, entities: &[CodeEntity]) -> Vec<String> {
959 entities
960 .iter()
961 .filter(|e| e.visibility == Visibility::Public)
962 .map(|e| e.name.clone())
963 .collect()
964 }
965
966 fn is_entrypoint(&self, path: &Path, content: &str, language: &str) -> bool {
967 let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
968
969 match language {
970 "rust" => file_name == "main.rs" || content.contains("fn main()"),
971 "typescript" | "javascript" => {
972 file_name == "index.ts"
973 || file_name == "index.tsx"
974 || file_name == "index.js"
975 || file_name == "main.ts"
976 }
977 "python" => {
978 file_name == "__main__.py"
979 || file_name == "main.py"
980 || content.contains("if __name__")
981 }
982 "go" => file_name == "main.go" || content.contains("func main()"),
983 _ => false,
984 }
985 }
986
987 fn is_test_file(&self, path: &Path, content: &str, language: &str) -> bool {
988 let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
989 let path_str = path.to_string_lossy();
990
991 match language {
992 "rust" => {
993 path_str.contains("/tests/")
994 || content.contains("#[test]")
995 || content.contains("#[cfg(test)]")
996 }
997 "typescript" | "javascript" => {
998 file_name.contains(".test.")
999 || file_name.contains(".spec.")
1000 || path_str.contains("__tests__")
1001 }
1002 "python" => {
1003 file_name.starts_with("test_")
1004 || file_name.ends_with("_test.py")
1005 || path_str.contains("/tests/")
1006 }
1007 "go" => file_name.ends_with("_test.go"),
1008 _ => false,
1009 }
1010 }
1011
1012 fn is_config_file(&self, path: &Path) -> bool {
1013 let file_name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
1014 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
1015
1016 matches!(
1017 file_name,
1018 "Cargo.toml"
1019 | "package.json"
1020 | "tsconfig.json"
1021 | "pyproject.toml"
1022 | "setup.py"
1023 | "go.mod"
1024 | "CMakeLists.txt"
1025 | "Makefile"
1026 | ".eslintrc.json"
1027 | ".prettierrc"
1028 | "webpack.config.js"
1029 | "vite.config.ts"
1030 | "tailwind.config.js"
1031 ) || matches!(ext, "toml" | "yaml" | "yml" | "json")
1032 && (file_name.contains("config") || file_name.contains("settings"))
1033 }
1034
1035 fn detect_frameworks(&self, content: &str, language: &str) -> Vec<String> {
1036 let mut frameworks = Vec::new();
1037
1038 match language {
1039 "typescript" | "javascript" => {
1040 if content.contains("react") || content.contains("React") {
1041 frameworks.push("react".to_string());
1042 }
1043 if content.contains("@angular") {
1044 frameworks.push("angular".to_string());
1045 }
1046 if content.contains("vue") {
1047 frameworks.push("vue".to_string());
1048 }
1049 if content.contains("next") {
1050 frameworks.push("nextjs".to_string());
1051 }
1052 if content.contains("express") {
1053 frameworks.push("express".to_string());
1054 }
1055 }
1056 "rust" => {
1057 if content.contains("actix") {
1058 frameworks.push("actix".to_string());
1059 }
1060 if content.contains("tokio") {
1061 frameworks.push("tokio".to_string());
1062 }
1063 if content.contains("axum") {
1064 frameworks.push("axum".to_string());
1065 }
1066 if content.contains("serde") {
1067 frameworks.push("serde".to_string());
1068 }
1069 }
1070 "python" => {
1071 if content.contains("django") {
1072 frameworks.push("django".to_string());
1073 }
1074 if content.contains("flask") {
1075 frameworks.push("flask".to_string());
1076 }
1077 if content.contains("fastapi") {
1078 frameworks.push("fastapi".to_string());
1079 }
1080 }
1081 _ => {}
1082 }
1083
1084 frameworks
1085 }
1086
1087 fn categorize_file(
1088 &self,
1089 path: &Path,
1090 entities: &[CodeEntity],
1091 is_test: bool,
1092 is_config: bool,
1093 ) -> ChunkCategory {
1094 if is_test {
1095 return ChunkCategory::Test;
1096 }
1097 if is_config {
1098 return ChunkCategory::Config;
1099 }
1100
1101 let path_str = path.to_string_lossy().to_lowercase();
1102 let file_name = path
1103 .file_name()
1104 .and_then(|n| n.to_str())
1105 .unwrap_or("")
1106 .to_lowercase();
1107
1108 if path_str.contains("/api/") || path_str.contains("/routes/") {
1110 return ChunkCategory::Api;
1111 }
1112 if path_str.contains("/ui/")
1113 || path_str.contains("/components/")
1114 || path_str.contains("/views/")
1115 {
1116 return ChunkCategory::Ui;
1117 }
1118 if path_str.contains("/utils/") || path_str.contains("/helpers/") {
1119 return ChunkCategory::Utility;
1120 }
1121 if path_str.contains("/models/") || path_str.contains("/types/") {
1122 return ChunkCategory::Data;
1123 }
1124 if path_str.contains("/db/") || path_str.contains("/database/") {
1125 return ChunkCategory::Database;
1126 }
1127
1128 if file_name.contains("util") || file_name.contains("helper") {
1130 return ChunkCategory::Utility;
1131 }
1132 if file_name.contains("type") || file_name.contains("model") {
1133 return ChunkCategory::Data;
1134 }
1135
1136 let type_count = entities
1138 .iter()
1139 .filter(|e| {
1140 matches!(
1141 e.kind,
1142 EntityKind::Struct
1143 | EntityKind::Class
1144 | EntityKind::Interface
1145 | EntityKind::Enum
1146 | EntityKind::Type
1147 )
1148 })
1149 .count();
1150
1151 if type_count > entities.len() / 2 {
1152 return ChunkCategory::Data;
1153 }
1154
1155 ChunkCategory::Logic
1156 }
1157
1158 fn has_clear_structure(&self, entities: &[CodeEntity]) -> bool {
1159 let significant = entities
1161 .iter()
1162 .filter(|e| e.end_line - e.start_line >= self.config.min_function_lines)
1163 .count();
1164
1165 significant >= 2
1166 }
1167
1168 fn create_file_chunk(&self, analysis: &FileAnalysis) -> SuggestedChunk {
1169 let file_name = analysis
1170 .path
1171 .file_stem()
1172 .and_then(|n| n.to_str())
1173 .unwrap_or("unknown");
1174
1175 let alias = self.generate_alias(file_name, &analysis.path);
1176
1177 SuggestedChunk {
1178 name: file_name.to_string(),
1179 alias,
1180 start_line: 1,
1181 end_line: analysis.total_lines,
1182 granularity: ChunkGranularity::Module,
1183 category: analysis.category.clone(),
1184 concepts: analysis.exports.clone(),
1185 requires: analysis.imports.clone(),
1186 provides: analysis.exports.clone(),
1187 }
1188 }
1189
1190 fn create_entity_chunks(&self, analysis: &FileAnalysis) -> Vec<SuggestedChunk> {
1191 let file_stem = analysis
1192 .path
1193 .file_stem()
1194 .and_then(|n| n.to_str())
1195 .unwrap_or("unknown");
1196
1197 analysis
1198 .entities
1199 .iter()
1200 .filter(|e| e.end_line - e.start_line >= self.config.min_function_lines)
1201 .map(|entity| {
1202 let granularity = match entity.kind {
1203 EntityKind::Function | EntityKind::AsyncFunction | EntityKind::Method => {
1204 ChunkGranularity::Function
1205 }
1206 EntityKind::Struct
1207 | EntityKind::Class
1208 | EntityKind::Trait
1209 | EntityKind::Interface
1210 | EntityKind::Enum => ChunkGranularity::Type,
1211 EntityKind::Module => ChunkGranularity::Module,
1212 _ => ChunkGranularity::Function,
1213 };
1214
1215 let alias = format!("{}/{}", file_stem, to_kebab_case(&entity.name));
1216
1217 SuggestedChunk {
1218 name: entity.name.clone(),
1219 alias,
1220 start_line: entity.start_line,
1221 end_line: entity.end_line,
1222 granularity,
1223 category: analysis.category.clone(),
1224 concepts: vec![entity.name.clone()],
1225 requires: entity.imports.clone(),
1226 provides: if entity.visibility == Visibility::Public {
1227 vec![entity.name.clone()]
1228 } else {
1229 Vec::new()
1230 },
1231 }
1232 })
1233 .collect()
1234 }
1235
1236 fn generate_alias(&self, name: &str, path: &Path) -> String {
1237 let parent = path
1238 .parent()
1239 .and_then(|p| p.file_name())
1240 .and_then(|n| n.to_str())
1241 .unwrap_or("");
1242
1243 if parent.is_empty() || parent == "src" {
1244 to_kebab_case(name)
1245 } else {
1246 format!("{}/{}", to_kebab_case(parent), to_kebab_case(name))
1247 }
1248 }
1249}
1250
/// Converts an identifier or file name to kebab-case.
///
/// * `_`, space and `-` all act as separators and become `-`.
/// * An uppercase letter that is not the first character starts a new word.
/// * Runs of separators collapse to a single `-`, however long they are.
/// * Letters are lowercased with full Unicode case mapping.
fn to_kebab_case(s: &str) -> String {
    let mut result = String::with_capacity(s.len() + 4);

    for (i, c) in s.chars().enumerate() {
        let is_separator = matches!(c, '_' | ' ' | '-');
        let starts_word = c.is_uppercase() && i > 0;

        // Never emit two hyphens in a row.
        if (is_separator || starts_word) && !result.ends_with('-') {
            result.push('-');
        }
        if !is_separator {
            result.extend(c.to_lowercase());
        }
    }

    result
}
1264
1265fn compute_hash(content: &str) -> String {
1267 let mut hasher = Sha256::new();
1268 hasher.update(content.as_bytes());
1269 let result = hasher.finalize();
1270 hex::encode(result)
1271}
1272
#[cfg(test)]
mod tests {
    use super::*;

    /// Well-known extensions map to the expected language identifiers.
    #[test]
    fn test_language_detection() {
        let chunker = SmartChunker::default();
        let expectations = [
            ("test.rs", "rust"),
            ("test.ts", "typescript"),
            ("test.py", "python"),
        ];

        for &(file, language) in expectations.iter() {
            assert_eq!(chunker.detect_language(Path::new(file)), language);
        }
    }

    /// Camel case, snake case and already-kebab input all end up kebab-cased.
    #[test]
    fn test_kebab_case() {
        let cases = [
            ("HelloWorld", "hello-world"),
            ("hello_world", "hello-world"),
            ("my-file", "my-file"),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(to_kebab_case(input), expected);
        }
    }

    /// A function, a struct and an impl block yield at least two entities.
    #[test]
    fn test_rust_entity_extraction() {
        let content = r#"
pub fn hello_world() {
    println!("Hello!");
}

struct MyStruct {
    field: i32,
}

impl MyStruct {
    fn new() -> Self {
        Self { field: 0 }
    }
}
"#;
        let chunker = SmartChunker::default();
        let source_lines = content.lines().collect::<Vec<&str>>();
        let mut found = Vec::new();

        chunker.extract_rust_entities(&source_lines, &mut found);

        assert!(found.len() >= 2);
    }
}