1use std::io::Write;
21use std::path::{Path, PathBuf};
22use std::sync::{
23 atomic::{AtomicUsize, Ordering},
24 Mutex,
25};
26
27use rayon::prelude::*;
28
29use crate::parser::{parse_file_symbols, Language};
30use crate::security::SecurityScanner;
31use crate::tokenizer::{TokenModel, Tokenizer};
32use crate::types::Symbol;
33
34use super::error::EmbedError;
35use super::git_enrichment::GitMetadataCollector;
36use super::hasher::hash_content;
37use super::hierarchy::{HierarchyBuilder, HierarchyConfig};
38use super::identifiers::extract_identifiers;
39use super::limits::ResourceLimits;
40use super::progress::ProgressReporter;
41use super::type_extraction;
42use super::types::{
43 default_repr, ChunkContext, ChunkKind, ChunkPart, ChunkSource, EmbedChunk, EmbedSettings,
44 RepoIdentifier, Visibility,
45};
46
/// Aggregate counters produced by the streaming chunking pipeline.
#[derive(Debug, Clone, Default)]
pub struct StreamingStats {
    /// Number of files discovered for processing.
    pub total_files: usize,
    /// Files successfully parsed and chunked.
    pub files_processed: usize,
    /// Files skipped because of skippable (non-critical) errors.
    pub files_skipped: usize,
    /// Total chunks emitted across all batches.
    pub total_chunks: usize,
    /// Number of batches serialized and flushed to the writer.
    pub batches_processed: usize,
    /// Chunks whose declared parent container was not found.
    /// NOTE(review): not updated anywhere in the visible code — confirm whether
    /// `link_parent_children`'s orphan count was meant to feed this field.
    pub orphaned_chunks: u32,
}
63
/// Walks a repository, parses source files, and produces embedding-ready chunks.
pub struct EmbedChunker {
    // User-facing chunking configuration (token budgets, feature toggles, patterns).
    settings: EmbedSettings,
    // Hard resource ceilings (file count/size, chunk count, line length, recursion).
    limits: ResourceLimits,
    // Token counter shared by all chunking passes.
    tokenizer: Tokenizer,
    // Present only when `settings.scan_secrets` is enabled (see `new`).
    security_scanner: Option<SecurityScanner>,
    // Identity of the repository being chunked; copied into every chunk's source.
    repo_id: RepoIdentifier,
}
73
74impl EmbedChunker {
75 pub fn new(settings: EmbedSettings, limits: ResourceLimits) -> Self {
77 let security_scanner = if settings.scan_secrets {
79 Some(SecurityScanner::new())
80 } else {
81 None
82 };
83
84 Self {
85 settings,
86 limits,
87 tokenizer: Tokenizer::new(),
88 security_scanner,
89 repo_id: RepoIdentifier::default(),
90 }
91 }
92
93 pub fn with_defaults(settings: EmbedSettings) -> Self {
95 Self::new(settings, ResourceLimits::default())
96 }
97
98 pub fn with_repo_id(mut self, repo_id: RepoIdentifier) -> Self {
112 self.repo_id = repo_id;
113 self
114 }
115
116 pub fn set_repo_id(&mut self, repo_id: RepoIdentifier) {
118 self.repo_id = repo_id;
119 }
120
121 pub fn repo_id(&self) -> &RepoIdentifier {
123 &self.repo_id
124 }
125
126 pub fn chunk_repository_filtered(
136 &self,
137 repo_path: &Path,
138 only_files: &std::collections::HashSet<PathBuf>,
139 progress: &dyn ProgressReporter,
140 ) -> Result<Vec<EmbedChunk>, EmbedError> {
141 let repo_root = self.validate_repo_path(repo_path)?;
143
144 progress.set_phase("Scanning repository (filtered)...");
146 let mut files = self.discover_files(&repo_root)?;
147
148 files.retain(|f| {
150 if let Ok(rel) = f.strip_prefix(&repo_root) {
151 only_files.contains(rel)
152 } else {
153 false
154 }
155 });
156
157 files.sort(); self.chunk_files_impl(files, &repo_root, progress)
161 }
162
163 fn populate_repo_identity(&mut self, repo_path: &Path) {
169 if !self.repo_id.name.is_empty() {
171 return;
172 }
173
174 let namespace = self.settings.repo_namespace.clone();
175 let name = self
176 .settings
177 .repo_name
178 .clone()
179 .or_else(|| {
180 repo_path
181 .file_name()
182 .and_then(|n| n.to_str())
183 .map(String::from)
184 })
185 .unwrap_or_else(|| "unknown".to_owned());
186
187 let (branch, commit) = match crate::git::GitRepo::open(repo_path) {
189 Ok(git) => {
190 let branch = git.current_branch().ok();
191 let commit = git.current_commit().ok();
192 (branch, commit)
193 },
194 Err(_) => (None, None),
195 };
196
197 self.repo_id = RepoIdentifier { namespace, name, version: None, branch, commit };
198 }
199
200 pub fn chunk_repository(
209 &mut self,
210 repo_path: &Path,
211 progress: &dyn ProgressReporter,
212 ) -> Result<Vec<EmbedChunk>, EmbedError> {
213 let repo_root = self.validate_repo_path(repo_path)?;
215
216 self.populate_repo_identity(&repo_root);
218
219 progress.set_phase("Scanning repository...");
221 let mut files = self.discover_files(&repo_root)?;
222 files.sort(); self.chunk_files_impl(files, &repo_root, progress)
225 }
226
    /// Shared chunking pipeline for a pre-discovered, sorted file list.
    ///
    /// Steps: parallel per-file chunking with a global chunk-count limit,
    /// error triage (critical errors abort; skippable ones become warnings),
    /// call-graph population, parent/child linking, optional hierarchy
    /// summaries and signature chunks, deterministic sorting, and optional
    /// git metadata enrichment.
    fn chunk_files_impl(
        &self,
        files: Vec<PathBuf>,
        repo_root: &Path,
        progress: &dyn ProgressReporter,
    ) -> Result<Vec<EmbedChunk>, EmbedError> {
        progress.set_total(files.len());

        if files.is_empty() {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        if !self.limits.check_file_count(files.len()) {
            return Err(EmbedError::TooManyFiles {
                count: files.len(),
                max: self.limits.max_files,
            });
        }

        progress.set_phase("Parsing and chunking...");
        // Chunk counter shared across rayon workers; the mutex makes the limit
        // check and the increment a single atomic step.
        let chunk_count = Mutex::new(0usize);
        let processed = AtomicUsize::new(0);

        let results: Vec<Result<Vec<EmbedChunk>, (PathBuf, EmbedError)>> = files
            .par_iter()
            .map(|file| {
                let result = self.chunk_file(file, repo_root);

                // Progress counts completed files, successful or not.
                let done = processed.fetch_add(1, Ordering::Relaxed) + 1;
                progress.set_progress(done);

                match result {
                    Ok(chunks) => {
                        let chunks_to_add = chunks.len();
                        // Recover from a poisoned mutex: the counter stays
                        // usable even if another worker panicked.
                        let mut count = chunk_count.lock().unwrap_or_else(|e| e.into_inner());
                        let new_count = *count + chunks_to_add;

                        if !self.limits.check_chunk_count(new_count) {
                            return Err((
                                file.clone(),
                                EmbedError::TooManyChunks {
                                    count: new_count,
                                    max: self.limits.max_total_chunks,
                                },
                            ));
                        }

                        *count = new_count;
                        // Release the lock before returning the chunks.
                        drop(count);
                        Ok(chunks)
                    },
                    Err(e) => Err((file.clone(), e)),
                }
            })
            .collect();

        let mut all_chunks = Vec::new();
        let mut errors = Vec::new();

        for result in results {
            match result {
                Ok(chunks) => all_chunks.extend(chunks),
                Err((path, err)) => errors.push((path, err)),
            }
        }

        if !errors.is_empty() {
            // Any critical error aborts the whole run.
            let critical: Vec<_> = errors
                .iter()
                .filter(|(_, e)| e.is_critical())
                .cloned()
                .collect();

            if !critical.is_empty() {
                return Err(EmbedError::from_file_errors(critical));
            }

            // Skippable errors are surfaced as warnings only.
            for (path, err) in &errors {
                if err.is_skippable() {
                    progress.warn(&format!("Skipped {}: {}", path.display(), err));
                }
            }
        }

        if all_chunks.is_empty() {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        progress.set_phase("Building call graph...");
        self.populate_called_by(&mut all_chunks);

        progress.set_phase("Linking parent/children chunks...");
        self.link_parent_children(&mut all_chunks, progress);

        if self.settings.enable_hierarchy {
            progress.set_phase("Building hierarchy summaries...");
            let hierarchy_config = HierarchyConfig {
                min_children_for_summary: self.settings.hierarchy_min_children,
                ..Default::default()
            };
            let builder = HierarchyBuilder::with_config(hierarchy_config);

            builder.enrich_chunks(&mut all_chunks);

            let mut summaries = builder.build_hierarchy(&all_chunks);

            // Summary chunks need token counts like any other chunk.
            let token_model = self.parse_token_model(&self.settings.token_model);
            for summary in &mut summaries {
                summary.tokens = self.tokenizer.count(&summary.content, token_model);
            }

            all_chunks.extend(summaries);
        }

        if self.settings.include_signatures {
            progress.set_phase("Generating signature chunks...");
            let signature_chunks = self.generate_signature_chunks(&all_chunks);
            all_chunks.extend(signature_chunks);
        }

        progress.set_phase("Sorting chunks...");
        // Deterministic output order: file, then line range, then symbol, with
        // the chunk id as the final tiebreaker.
        all_chunks.par_sort_by(|a, b| {
            a.source
                .file
                .cmp(&b.source.file)
                .then_with(|| a.source.lines.0.cmp(&b.source.lines.0))
                .then_with(|| a.source.lines.1.cmp(&b.source.lines.1))
                .then_with(|| a.source.symbol.cmp(&b.source.symbol))
                .then_with(|| a.id.cmp(&b.id))
        });

        if self.settings.git_metadata {
            progress.set_phase("Collecting git metadata...");
            self.enrich_with_git_metadata(&mut all_chunks, repo_root);
        }

        progress.set_phase("Complete");
        Ok(all_chunks)
    }
397
398 fn enrich_with_git_metadata(&self, chunks: &mut [EmbedChunk], repo_root: &Path) {
402 let mut collector = match GitMetadataCollector::new(repo_root) {
403 Some(c) => c,
404 None => return, };
406
407 for chunk in chunks.iter_mut() {
408 let metadata = collector.get_metadata(&chunk.source.file);
409 chunk.context.git = Some(metadata);
410 }
411 }
    /// Streams chunks to `writer` as JSON Lines instead of collecting in memory.
    ///
    /// Files are processed in batches of `settings.batch_size` (default 500
    /// when 0). Each batch is chunked in parallel, post-processed — so the
    /// call graph and parent/child links are batch-local — sorted, serialized
    /// as `{"type": "chunk", "data": ...}` lines, and flushed.
    ///
    /// # Errors
    /// Critical per-file errors abort immediately; skippable ones are counted
    /// and warned. Serialization and write failures map to `IoError`.
    pub fn chunk_repository_streaming<W: Write>(
        &self,
        repo_path: &Path,
        writer: &mut W,
        progress: &dyn ProgressReporter,
    ) -> Result<StreamingStats, EmbedError> {
        let repo_root = self.validate_repo_path(repo_path)?;

        progress.set_phase("Scanning repository...");
        let mut files = self.discover_files(&repo_root)?;
        // Sort for deterministic batch composition.
        files.sort();
        progress.set_total(files.len());

        if files.is_empty() {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        if !self.limits.check_file_count(files.len()) {
            return Err(EmbedError::TooManyFiles {
                count: files.len(),
                max: self.limits.max_files,
            });
        }

        // A batch size of 0 means "use the default".
        let batch_size = if self.settings.batch_size == 0 {
            500
        } else {
            self.settings.batch_size
        };

        let mut stats = StreamingStats { total_files: files.len(), ..Default::default() };

        progress.set_phase("Parsing and chunking (streaming)...");
        // The chunk-count limit is enforced globally, across all batches.
        let total_chunk_count = Mutex::new(0usize);

        for batch_files in files.chunks(batch_size) {
            let processed_in_batch = AtomicUsize::new(0);

            let results: Vec<Result<Vec<EmbedChunk>, (PathBuf, EmbedError)>> = batch_files
                .par_iter()
                .map(|file| {
                    let result = self.chunk_file(file, &repo_root);

                    let done = processed_in_batch.fetch_add(1, Ordering::Relaxed) + 1;
                    // stats.files_processed is only mutated between batches, so
                    // reading it here gives a consistent global progress figure.
                    let global_done = stats.files_processed + done;
                    progress.set_progress(global_done);

                    match result {
                        Ok(chunks) => {
                            let chunks_to_add = chunks.len();
                            // Recover from a poisoned mutex; counter stays usable.
                            let mut count =
                                total_chunk_count.lock().unwrap_or_else(|e| e.into_inner());
                            let new_count = *count + chunks_to_add;

                            if !self.limits.check_chunk_count(new_count) {
                                return Err((
                                    file.clone(),
                                    EmbedError::TooManyChunks {
                                        count: new_count,
                                        max: self.limits.max_total_chunks,
                                    },
                                ));
                            }

                            *count = new_count;
                            // Release the lock before returning the chunks.
                            drop(count);
                            Ok(chunks)
                        },
                        Err(e) => Err((file.clone(), e)),
                    }
                })
                .collect();

            let mut batch_chunks = Vec::new();

            for result in results {
                match result {
                    Ok(chunks) => {
                        stats.files_processed += 1;
                        batch_chunks.extend(chunks);
                    },
                    Err((_path, err)) => {
                        if err.is_critical() {
                            return Err(err);
                        }
                        // NOTE(review): errors that are neither critical nor
                        // skippable are silently dropped here (not counted in
                        // files_skipped) — confirm this is intended.
                        if err.is_skippable() {
                            stats.files_skipped += 1;
                            progress.warn(&format!("Skipped: {}", err));
                        }
                    },
                }
            }

            // Post-processing is batch-local: cross-batch call/parent links are
            // not established in streaming mode.
            self.populate_called_by(&mut batch_chunks);

            self.link_parent_children(&mut batch_chunks, progress);

            batch_chunks.sort_by(|a, b| {
                a.source
                    .file
                    .cmp(&b.source.file)
                    .then_with(|| a.source.lines.0.cmp(&b.source.lines.0))
                    .then_with(|| a.source.lines.1.cmp(&b.source.lines.1))
                    .then_with(|| a.source.symbol.cmp(&b.source.symbol))
                    .then_with(|| a.id.cmp(&b.id))
            });

            // Emit one JSON object per line (JSONL envelope).
            for chunk in &batch_chunks {
                let chunk_json = serde_json::json!({
                    "type": "chunk",
                    "data": chunk,
                });
                let line = serde_json::to_string(&chunk_json).map_err(|e| EmbedError::IoError {
                    path: repo_path.to_path_buf(),
                    source: std::io::Error::other(e),
                })?;
                writeln!(writer, "{}", line).map_err(|e| EmbedError::IoError {
                    path: repo_path.to_path_buf(),
                    source: e,
                })?;
            }

            stats.total_chunks += batch_chunks.len();
            stats.batches_processed += 1;

            // Flush after every batch so consumers see chunks promptly.
            writer
                .flush()
                .map_err(|e| EmbedError::IoError { path: repo_path.to_path_buf(), source: e })?;
        }

        if stats.total_chunks == 0 {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        progress.set_phase("Complete");
        Ok(stats)
    }
    /// Fills each chunk's `called_by` list by inverting the call graph.
    ///
    /// Two layers of resolution feed the result: the import resolver's
    /// qualified reverse map (keyed by `file::symbol` and by FQN), and a plain
    /// reverse map built from the raw `calls` lists. Callers from both are
    /// merged through an ordered set (dedup + deterministic order), and the
    /// caller count is recorded as `dependents_count` when non-zero.
    fn populate_called_by(&self, chunks: &mut [EmbedChunk]) {
        use super::import_resolver::ImportResolver;
        use std::collections::{BTreeMap, BTreeSet};

        let resolver = ImportResolver::from_chunks(chunks);
        resolver.resolve_all_calls(chunks);

        let qualified_reverse = resolver.build_qualified_reverse_map(chunks);

        // Reverse map: callee name -> set of caller FQNs (falling back to the
        // bare symbol name when a chunk has no FQN).
        let mut reverse_calls: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
        for chunk in chunks.iter() {
            let caller_fqn = chunk.source.fqn.as_deref().unwrap_or(&chunk.source.symbol);
            for callee in &chunk.context.calls {
                reverse_calls
                    .entry(callee.clone())
                    .or_default()
                    .insert(caller_fqn.to_owned());
            }
        }

        for chunk in chunks.iter_mut() {
            let fqn = chunk.source.fqn.as_deref().unwrap_or("");
            let symbol = &chunk.source.symbol;
            let file = &chunk.source.file;

            // BTreeSet both deduplicates and yields a deterministic order.
            let mut called_by_set: BTreeSet<String> = BTreeSet::new();

            // Callers that referenced this chunk as "file::symbol".
            let qualified_key = format!("{}::{}", file, symbol);
            if let Some(callers) = qualified_reverse.get(&qualified_key) {
                called_by_set.extend(callers.iter().cloned());
            }

            // Callers that referenced this chunk by its full FQN.
            if !fqn.is_empty() {
                if let Some(callers) = qualified_reverse.get(fqn) {
                    called_by_set.extend(callers.iter().cloned());
                }
            }

            // Unqualified matches: by FQN, then by bare symbol name.
            if let Some(callers) = reverse_calls.get(fqn) {
                called_by_set.extend(callers.iter().cloned());
            }
            if let Some(callers) = reverse_calls.get(symbol) {
                called_by_set.extend(callers.iter().cloned());
            }

            chunk.context.called_by = called_by_set.into_iter().collect();

            let count = chunk.context.called_by.len() as u32;
            if count > 0 {
                chunk.context.dependents_count = Some(count);
            }
        }
    }
669
    /// Links chunks to their container chunks (class/struct/enum/trait/interface).
    ///
    /// A chunk whose `source.parent` names a container in the same file gets
    /// that container's id as `parent_chunk_id`, and the container collects
    /// its child ids (sorted) into `children_ids`. Chunks naming a parent with
    /// no matching container chunk are counted and reported via `progress`.
    fn link_parent_children(&self, chunks: &mut [EmbedChunk], progress: &dyn ProgressReporter) {
        use std::collections::{BTreeMap, BTreeSet};

        // (file, symbol) -> index of the container chunk within `chunks`.
        let mut container_map: BTreeMap<(String, String), usize> = BTreeMap::new();
        for (i, chunk) in chunks.iter().enumerate() {
            if matches!(
                chunk.kind,
                ChunkKind::Class
                    | ChunkKind::Struct
                    | ChunkKind::Enum
                    | ChunkKind::Trait
                    | ChunkKind::Interface
            ) {
                container_map.insert((chunk.source.file.clone(), chunk.source.symbol.clone()), i);
            }
        }

        // Parent index -> ids of its children, filled while linking.
        let mut parent_children: BTreeMap<usize, Vec<String>> = BTreeMap::new();
        let mut orphaned_count: u32 = 0;
        let mut orphaned_files: BTreeSet<String> = BTreeSet::new();

        // Index-based loop: chunks[i] is mutated while chunks[parent_idx] is read.
        for i in 0..chunks.len() {
            if let Some(ref parent_name) = chunks[i].source.parent {
                let key = (chunks[i].source.file.clone(), parent_name.clone());
                if let Some(&parent_idx) = container_map.get(&key) {
                    let parent_id = chunks[parent_idx].id.clone();
                    chunks[i].source.parent_chunk_id = Some(parent_id);

                    parent_children
                        .entry(parent_idx)
                        .or_default()
                        .push(chunks[i].id.clone());
                } else {
                    // Parent named, but no container chunk exists in this file.
                    orphaned_count += 1;
                    orphaned_files.insert(chunks[i].source.file.clone());
                }
            }
        }

        if orphaned_count > 0 {
            progress.warn(&format!(
                "{} chunks have missing parent containers across {} files",
                orphaned_count,
                orphaned_files.len()
            ));
        }

        // Sort child ids for deterministic output.
        for (parent_idx, mut child_ids) in parent_children {
            child_ids.sort();
            chunks[parent_idx].children_ids = child_ids;
        }
    }
735
736 fn generate_signature_chunks(&self, chunks: &[EmbedChunk]) -> Vec<EmbedChunk> {
748 let token_model = self.parse_token_model(&self.settings.token_model);
749
750 chunks
751 .iter()
752 .filter(|chunk| {
753 chunk.repr == "code"
755 && chunk.code_chunk_id.is_none()
756 && chunk.part.is_none() && chunk.context.signature.is_some()
758 && !matches!(chunk.kind, ChunkKind::Imports | ChunkKind::TopLevel)
759 })
760 .filter_map(|chunk| {
761 let signature = chunk.context.signature.as_ref()?;
762 let hash = hash_content(signature);
763 let tokens = self.tokenizer.count(signature, token_model);
764
765 Some(EmbedChunk {
766 id: hash.short_id,
767 full_hash: hash.full_hash,
768 content: signature.clone(),
769 tokens,
770 kind: chunk.kind,
771 source: chunk.source.clone(),
772 context: ChunkContext {
773 signature: chunk.context.signature.clone(),
774 docstring: chunk.context.docstring.clone(),
775 context_prefix: chunk.context.context_prefix.clone(),
776 ..Default::default()
777 },
778 children_ids: Vec::new(),
779 repr: "signature".to_owned(),
780 code_chunk_id: Some(chunk.id.clone()),
781 part: None,
782 })
783 })
784 .collect()
785 }
786
787 fn chunk_file(&self, path: &Path, repo_root: &Path) -> Result<Vec<EmbedChunk>, EmbedError> {
789 let metadata = std::fs::metadata(path)
791 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
792
793 if !self.limits.check_file_size(metadata.len()) {
794 return Err(EmbedError::FileTooLarge {
795 path: path.to_path_buf(),
796 size: metadata.len(),
797 max: self.limits.max_file_size,
798 });
799 }
800
801 let mut content = std::fs::read_to_string(path)
803 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
804
805 if let Some(max_line_len) = content.lines().map(|l| l.len()).max() {
808 if !self.limits.check_line_length(max_line_len) {
809 return Err(EmbedError::LineTooLong {
810 path: path.to_path_buf(),
811 length: max_line_len,
812 max: self.limits.max_line_length,
813 });
814 }
815 }
816
817 let relative_path = self.safe_relative_path(path, repo_root)?;
819
820 if let Some(ref scanner) = self.security_scanner {
822 let findings = scanner.scan(&content, &relative_path);
823 if !findings.is_empty() {
824 if self.settings.fail_on_secrets {
826 let files = findings
827 .iter()
828 .map(|f| format!(" {}:{} - {}", f.file, f.line, f.kind.name()))
829 .collect::<Vec<_>>()
830 .join("\n");
831 return Err(EmbedError::SecretsDetected { count: findings.len(), files });
832 }
833
834 if self.settings.redact_secrets {
836 content = scanner.redact_content(&content, &relative_path);
837 }
838 }
839 }
840 let language = self.detect_language(path);
841 let lang_enum = self.detect_language_enum(path);
842
843 let mut symbols = parse_file_symbols(&content, path);
845
846 symbols.sort_by(|a, b| {
848 a.start_line
849 .cmp(&b.start_line)
850 .then_with(|| a.end_line.cmp(&b.end_line))
851 .then_with(|| a.name.cmp(&b.name))
852 });
853
854 let lines: Vec<&str> = content.lines().collect();
855 let mut chunks = Vec::with_capacity(symbols.len() + 2);
856
857 for symbol in &symbols {
858 if !self.settings.include_imports
860 && matches!(symbol.kind, crate::types::SymbolKind::Import)
861 {
862 continue;
863 }
864
865 let (chunk_content, start_line, end_line) =
867 self.extract_symbol_content(&lines, symbol, self.settings.context_lines);
868
869 let token_model = self.parse_token_model(&self.settings.token_model);
871 let tokens = self.tokenizer.count(&chunk_content, token_model);
872
873 if self.settings.max_tokens > 0 && tokens > self.settings.max_tokens {
875 let split_chunks = self.split_large_symbol(
876 &chunk_content,
877 symbol,
878 &relative_path,
879 &language,
880 start_line,
881 0, lang_enum,
883 )?;
884 chunks.extend(split_chunks);
885 } else {
886 let hash = hash_content(&chunk_content);
888
889 let mut context =
891 self.extract_context(symbol, &chunk_content, &relative_path, path);
892
893 let fqn = self.compute_fqn(&relative_path, symbol);
895
896 let chunk_kind: ChunkKind = symbol.kind.into();
897 let source = ChunkSource {
898 repo: self.repo_id.clone(),
899 file: relative_path.clone(),
900 lines: (start_line, end_line),
901 symbol: symbol.name.clone(),
902 fqn: Some(fqn),
903 language: language.clone(),
904 parent: symbol.parent.clone(),
905 visibility: symbol.visibility.into(),
906 is_test: self.is_test_code(path, symbol),
907 module_path: Some(derive_module_path(&relative_path, &language)),
908 parent_chunk_id: None,
909 };
910
911 context.summary = generate_summary(chunk_kind, &source, &context);
913
914 chunks.push(EmbedChunk {
915 id: hash.short_id,
916 full_hash: hash.full_hash,
917 content: chunk_content,
918 tokens,
919 kind: chunk_kind,
920 source,
921 context,
922 children_ids: Vec::new(),
923 repr: default_repr(),
924 code_chunk_id: None,
925 part: None,
926 });
927 }
928 }
929
930 if self.settings.include_top_level && !symbols.is_empty() {
932 if let Some(top_level) =
933 self.extract_top_level(&lines, &symbols, &relative_path, &language, lang_enum)
934 {
935 chunks.push(top_level);
936 }
937 }
938
939 Ok(chunks)
940 }
941
942 fn extract_symbol_content(
944 &self,
945 lines: &[&str],
946 symbol: &Symbol,
947 context_lines: u32,
948 ) -> (String, u32, u32) {
949 let start_line = symbol.start_line.saturating_sub(1) as usize;
951 let end_line = (symbol.end_line as usize).min(lines.len());
952
953 let context_start = start_line.saturating_sub(context_lines as usize);
955 let context_end = (end_line + context_lines as usize).min(lines.len());
956
957 let content = lines[context_start..context_end].join("\n");
959
960 (content, (context_start + 1) as u32, context_end as u32)
962 }
963
    /// Splits an over-budget symbol into sequential parts with optional overlap.
    ///
    /// The per-part line budget is estimated from the symbol's overall
    /// tokens-per-line ratio; the overlap is likewise estimated from
    /// `overlap_tokens` and capped at half a part so parts still advance.
    /// Parts falling under `settings.min_tokens` are dropped (their lines are
    /// still consumed). Each emitted part links back to the parent symbol via
    /// `ChunkPart` carrying the parent content hash; `part.of` is backfilled
    /// once the final count is known.
    ///
    /// # Errors
    /// `RecursionLimitExceeded` when `depth` exceeds the configured limit.
    fn split_large_symbol(
        &self,
        content: &str,
        symbol: &Symbol,
        file: &str,
        language: &str,
        base_line: u32,
        depth: u32,
        lang_enum: Option<Language>,
    ) -> Result<Vec<EmbedChunk>, EmbedError> {
        if !self.limits.check_recursion_depth(depth) {
            return Err(EmbedError::RecursionLimitExceeded {
                depth,
                max: self.limits.max_recursion_depth,
                context: format!("splitting symbol {}", symbol.name),
            });
        }

        let lines: Vec<&str> = content.lines().collect();
        let total_lines = lines.len();

        let token_model = self.parse_token_model(&self.settings.token_model);
        let total_tokens = self.tokenizer.count(content, token_model) as usize;
        let target_tokens = self.settings.max_tokens as usize;

        // Nothing to split, or the division below would be degenerate.
        if total_tokens == 0 || target_tokens == 0 {
            return Ok(Vec::new());
        }

        // Lines per part, assuming tokens are evenly distributed over lines.
        let target_lines = ((total_lines * target_tokens) / total_tokens).max(1);

        // Overlap in lines, derived the same way and capped at half a part.
        let overlap_tokens = self.settings.overlap_tokens as usize;
        let overlap_lines = if overlap_tokens > 0 && total_tokens > 0 {
            ((total_lines * overlap_tokens) / total_tokens)
                .max(1)
                .min(target_lines / 2)
        } else {
            0
        };

        let mut chunks = Vec::new();
        let mut current_start = 0usize;
        let mut part_num = 1u32;

        // All parts share the parent's content hash as their linking id.
        let parent_hash = hash_content(content);

        while current_start < total_lines {
            // Parts after the first start early to include the overlap region.
            let content_start = if part_num > 1 && overlap_lines > 0 {
                current_start.saturating_sub(overlap_lines)
            } else {
                current_start
            };
            let content_end = (current_start + target_lines).min(total_lines);

            let part_content = lines[content_start..content_end].join("\n");

            let tokens = self.tokenizer.count(&part_content, token_model);

            // Parts under the minimum token threshold are skipped entirely.
            if tokens >= self.settings.min_tokens {
                let hash = hash_content(&part_content);
                let part_keywords = extract_keywords(&part_content);
                let part_identifiers = extract_identifiers(&part_content, lang_enum);
                let part_prefix =
                    Some(generate_context_prefix(file, Some(&symbol.name), &symbol.kind));

                // Overlap actually applied to this part (always 0 for part 1).
                let actual_overlap = if part_num > 1 {
                    current_start.saturating_sub(content_start) as u32
                } else {
                    0
                };

                let part_source = ChunkSource {
                    repo: self.repo_id.clone(),
                    file: file.to_owned(),
                    lines: (base_line + content_start as u32, base_line + content_end as u32 - 1),
                    symbol: format!("{}_part{}", symbol.name, part_num),
                    fqn: None,
                    language: language.to_owned(),
                    parent: Some(symbol.name.clone()),
                    visibility: symbol.visibility.into(),
                    is_test: false,
                    module_path: Some(derive_module_path(file, language)),
                    parent_chunk_id: None,
                };
                let mut part_context = ChunkContext {
                    signature: symbol.signature.clone(),
                    docstring: symbol.docstring.clone(),
                    keywords: part_keywords,
                    identifiers: part_identifiers,
                    context_prefix: part_prefix,
                    ..Default::default()
                };
                part_context.summary =
                    generate_summary(ChunkKind::FunctionPart, &part_source, &part_context);

                chunks.push(EmbedChunk {
                    id: hash.short_id,
                    full_hash: hash.full_hash,
                    content: part_content,
                    tokens,
                    kind: ChunkKind::FunctionPart,
                    source: part_source,
                    context: part_context,
                    children_ids: Vec::new(),
                    repr: default_repr(),
                    code_chunk_id: None,
                    part: Some(ChunkPart {
                        part: part_num,
                        of: 0, // backfilled below once the part count is known
                        parent_id: parent_hash.short_id.clone(),
                        parent_signature: symbol.signature.clone().unwrap_or_default(),
                        overlap_lines: actual_overlap,
                    }),
                });

                part_num += 1;
            }

            current_start = content_end;
        }

        // Second pass: set `of` now that the total number of parts is known.
        let total_parts = chunks.len() as u32;
        for chunk in &mut chunks {
            if let Some(ref mut part) = chunk.part {
                part.of = total_parts;
            }
        }

        Ok(chunks)
    }
1113
    /// Builds a `<top_level>` chunk from lines not covered by any symbol.
    ///
    /// Marks every line inside a symbol's range as covered, joins the rest,
    /// and emits a single chunk when the remainder meets the minimum token
    /// threshold. Returns `None` when nothing substantial is left.
    fn extract_top_level(
        &self,
        lines: &[&str],
        symbols: &[Symbol],
        file: &str,
        language: &str,
        lang_enum: Option<Language>,
    ) -> Option<EmbedChunk> {
        if lines.is_empty() || symbols.is_empty() {
            return None;
        }

        // Mark lines covered by symbols (1-based ranges -> 0-based indices).
        let mut covered = vec![false; lines.len()];
        for symbol in symbols {
            let start = symbol.start_line.saturating_sub(1) as usize;
            let end = (symbol.end_line as usize).min(lines.len());
            for i in start..end {
                covered[i] = true;
            }
        }

        let top_level_lines: Vec<&str> = lines
            .iter()
            .enumerate()
            .filter(|(i, _)| !covered[*i])
            .map(|(_, line)| *line)
            .collect();

        if top_level_lines.is_empty() {
            return None;
        }

        let content = top_level_lines.join("\n").trim().to_owned();
        if content.is_empty() {
            return None;
        }

        let token_model = self.parse_token_model(&self.settings.token_model);
        let tokens = self.tokenizer.count(&content, token_model);

        // Too small to be worth a standalone chunk.
        if tokens < self.settings.min_tokens {
            return None;
        }

        let hash = hash_content(&content);
        let keywords = extract_keywords(&content);
        let top_identifiers = extract_identifiers(&content, lang_enum);
        let context_prefix =
            Some(generate_context_prefix(file, None, &crate::types::SymbolKind::Module));

        // The reported range spans the whole file, since the content is a
        // non-contiguous selection of uncovered lines.
        let top_source = ChunkSource {
            repo: self.repo_id.clone(),
            file: file.to_owned(),
            lines: (1, lines.len() as u32),
            symbol: "<top_level>".to_owned(),
            fqn: None,
            language: language.to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: Some(derive_module_path(file, language)),
            parent_chunk_id: None,
        };
        let mut top_context = ChunkContext {
            keywords,
            identifiers: top_identifiers,
            context_prefix,
            ..Default::default()
        };
        top_context.summary = generate_summary(ChunkKind::TopLevel, &top_source, &top_context);

        Some(EmbedChunk {
            id: hash.short_id,
            full_hash: hash.full_hash,
            content,
            tokens,
            kind: ChunkKind::TopLevel,
            source: top_source,
            context: top_context,
            children_ids: Vec::new(),
            repr: default_repr(),
            code_chunk_id: None,
            part: None,
        })
    }
1202
1203 fn extract_context(
1205 &self,
1206 symbol: &Symbol,
1207 content: &str,
1208 file_path: &str,
1209 source_path: &Path,
1210 ) -> ChunkContext {
1211 let lang = source_path
1213 .extension()
1214 .and_then(|e| e.to_str())
1215 .and_then(Language::from_extension);
1216
1217 let (type_signature, parameter_types, return_type, error_types) = if matches!(
1219 symbol.kind,
1220 crate::types::SymbolKind::Function | crate::types::SymbolKind::Method
1221 ) {
1222 if let Some(lang) = lang {
1223 if let Some(type_info) = type_extraction::extract_types(content, lang) {
1224 (
1225 type_info.type_signature,
1226 type_info.parameter_types,
1227 type_info.return_type,
1228 type_info.error_types,
1229 )
1230 } else {
1231 (None, Vec::new(), None, Vec::new())
1232 }
1233 } else {
1234 (None, Vec::new(), None, Vec::new())
1235 }
1236 } else {
1237 (None, Vec::new(), None, Vec::new())
1238 };
1239
1240 ChunkContext {
1241 docstring: symbol.docstring.clone(),
1242 comments: Vec::new(), signature: symbol.signature.clone(),
1244 calls: symbol.calls.clone(),
1245 called_by: Vec::new(), imports: Vec::new(), tags: self.generate_tags(symbol),
1248 keywords: extract_keywords(content),
1249 context_prefix: Some(generate_context_prefix(
1250 file_path,
1251 symbol.parent.as_deref(),
1252 &symbol.kind,
1253 )),
1254 summary: None, qualified_calls: Vec::new(), unresolved_calls: Vec::new(), identifiers: extract_identifiers(content, lang),
1258 type_signature,
1259 parameter_types,
1260 return_type,
1261 error_types,
1262 lines_of_code: self.count_lines_of_code(content),
1263 max_nesting_depth: self.calculate_nesting_depth(content),
1264 git: None, complexity_score: lang.and_then(|l| super::complexity::compute_complexity(content, l)),
1266 dependents_count: None,
1267 }
1268 }
1269
1270 fn count_lines_of_code(&self, content: &str) -> u32 {
1272 content
1273 .lines()
1274 .filter(|line| {
1275 let trimmed = line.trim();
1276 !trimmed.is_empty()
1278 && !trimmed.starts_with("//")
1279 && !trimmed.starts_with('#')
1280 && !trimmed.starts_with("/*")
1281 && !trimmed.starts_with('*')
1282 })
1283 .count() as u32
1284 }
1285
1286 fn calculate_nesting_depth(&self, content: &str) -> u32 {
1291 let brace_depth = self.calculate_brace_depth(content);
1293
1294 if brace_depth <= 1 {
1297 let indent_depth = self.calculate_indent_depth(content);
1298 brace_depth.max(indent_depth)
1300 } else {
1301 brace_depth
1302 }
1303 }
1304
1305 fn calculate_brace_depth(&self, content: &str) -> u32 {
1307 let mut max_depth = 0u32;
1308 let mut current_depth = 0i32;
1309
1310 for ch in content.chars() {
1311 match ch {
1312 '{' | '(' | '[' => {
1313 current_depth += 1;
1314 max_depth = max_depth.max(current_depth as u32);
1315 },
1316 '}' | ')' | ']' => {
1317 current_depth = (current_depth - 1).max(0);
1318 },
1319 _ => {},
1320 }
1321 }
1322
1323 max_depth
1324 }
1325
1326 fn calculate_indent_depth(&self, content: &str) -> u32 {
1329 let mut max_depth = 0u32;
1330 let mut base_indent: Option<usize> = None;
1331
1332 for line in content.lines() {
1333 let trimmed = line.trim();
1335 if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("--") {
1336 continue;
1337 }
1338
1339 let leading_spaces = line.len() - line.trim_start().len();
1341
1342 if base_indent.is_none() {
1344 base_indent = Some(leading_spaces);
1345 }
1346
1347 let base = base_indent.unwrap_or(0);
1349 if leading_spaces >= base {
1350 let relative_indent = leading_spaces - base;
1351 let depth = (relative_indent / 4).max(relative_indent / 2) as u32;
1353 max_depth = max_depth.max(depth + 1); }
1355 }
1356
1357 max_depth
1358 }
1359
1360 fn generate_tags(&self, symbol: &Symbol) -> Vec<String> {
1365 generate_tags_for_symbol(&symbol.name, symbol.signature.as_deref())
1366 }
1367
1368 fn compute_fqn(&self, file: &str, symbol: &Symbol) -> String {
1375 let module_path = file
1377 .strip_suffix(".rs")
1378 .or_else(|| file.strip_suffix(".py"))
1379 .or_else(|| file.strip_suffix(".ts"))
1380 .or_else(|| file.strip_suffix(".tsx"))
1381 .or_else(|| file.strip_suffix(".js"))
1382 .or_else(|| file.strip_suffix(".jsx"))
1383 .or_else(|| file.strip_suffix(".go"))
1384 .or_else(|| file.strip_suffix(".java"))
1385 .or_else(|| file.strip_suffix(".c"))
1386 .or_else(|| file.strip_suffix(".cpp"))
1387 .or_else(|| file.strip_suffix(".h"))
1388 .or_else(|| file.strip_suffix(".hpp"))
1389 .or_else(|| file.strip_suffix(".rb"))
1390 .or_else(|| file.strip_suffix(".php"))
1391 .or_else(|| file.strip_suffix(".cs"))
1392 .or_else(|| file.strip_suffix(".swift"))
1393 .or_else(|| file.strip_suffix(".kt"))
1394 .or_else(|| file.strip_suffix(".scala"))
1395 .unwrap_or(file)
1396 .replace(['\\', '/'], "::"); let symbol_part = if let Some(ref parent) = symbol.parent {
1400 format!("{}::{}::{}", module_path, parent, symbol.name)
1401 } else {
1402 format!("{}::{}", module_path, symbol.name)
1403 };
1404
1405 let repo_prefix = self.repo_id.qualified_name();
1407 if repo_prefix.is_empty() {
1408 symbol_part
1409 } else {
1410 format!("{}::{}", repo_prefix, symbol_part)
1411 }
1412 }
1413
1414 fn is_test_code(&self, path: &Path, symbol: &Symbol) -> bool {
1416 let path_str = path.to_string_lossy().to_lowercase();
1417
1418 if path_str.contains("test") || path_str.contains("spec") || path_str.contains("__tests__")
1420 {
1421 return true;
1422 }
1423
1424 let name = symbol.name.to_lowercase();
1426 if name.starts_with("test_") || name.ends_with("_test") || name.contains("_test_") {
1427 return true;
1428 }
1429
1430 false
1431 }
1432
1433 fn validate_repo_path(&self, path: &Path) -> Result<PathBuf, EmbedError> {
1435 let canonical = path
1436 .canonicalize()
1437 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
1438
1439 if !canonical.is_dir() {
1441 return Err(EmbedError::NotADirectory { path: path.to_path_buf() });
1442 }
1443
1444 Ok(canonical)
1445 }
1446
1447 fn safe_relative_path(&self, path: &Path, repo_root: &Path) -> Result<String, EmbedError> {
1449 let canonical = path
1450 .canonicalize()
1451 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
1452
1453 if !canonical.starts_with(repo_root) {
1455 return Err(EmbedError::PathTraversal {
1456 path: canonical,
1457 repo_root: repo_root.to_path_buf(),
1458 });
1459 }
1460
1461 Ok(canonical
1463 .strip_prefix(repo_root)
1464 .unwrap_or(&canonical)
1465 .to_string_lossy()
1466 .replace('\\', "/"))
1467 }
1468
1469 fn discover_files(&self, repo_root: &Path) -> Result<Vec<PathBuf>, EmbedError> {
1471 use glob::Pattern;
1472 use ignore::WalkBuilder;
1473
1474 let mut files = Vec::new();
1475
1476 let mut include_patterns = Vec::new();
1478 for pattern_str in &self.settings.include_patterns {
1479 match Pattern::new(pattern_str) {
1480 Ok(pattern) => include_patterns.push(pattern),
1481 Err(e) => {
1482 return Err(EmbedError::InvalidPattern {
1483 pattern: pattern_str.clone(),
1484 reason: e.to_string(),
1485 });
1486 },
1487 }
1488 }
1489
1490 let mut exclude_patterns = Vec::new();
1492 for pattern_str in &self.settings.exclude_patterns {
1493 match Pattern::new(pattern_str) {
1494 Ok(pattern) => exclude_patterns.push(pattern),
1495 Err(e) => {
1496 return Err(EmbedError::InvalidPattern {
1497 pattern: pattern_str.clone(),
1498 reason: e.to_string(),
1499 });
1500 },
1501 }
1502 }
1503
1504 let walker = WalkBuilder::new(repo_root)
1505 .hidden(false) .git_ignore(true) .git_global(true)
1508 .git_exclude(true)
1509 .follow_links(false) .build();
1511
1512 for entry in walker {
1513 let entry = entry.map_err(|e| EmbedError::IoError {
1514 path: repo_root.to_path_buf(),
1515 source: std::io::Error::other(e.to_string()),
1516 })?;
1517
1518 let path = entry.path();
1519
1520 if !path.is_file() {
1522 continue;
1523 }
1524
1525 let relative_path = path
1527 .strip_prefix(repo_root)
1528 .unwrap_or(path)
1529 .to_string_lossy();
1530
1531 if !include_patterns.is_empty()
1533 && !include_patterns.iter().any(|p| p.matches(&relative_path))
1534 {
1535 continue;
1536 }
1537
1538 if exclude_patterns.iter().any(|p| p.matches(&relative_path)) {
1540 continue;
1541 }
1542
1543 if !self.settings.include_tests && self.is_test_file(path) {
1545 continue;
1546 }
1547
1548 let ext = match path.extension().and_then(|e| e.to_str()) {
1550 Some(e) => e,
1551 None => continue,
1552 };
1553 if Language::from_extension(ext).is_none() {
1554 continue;
1555 }
1556
1557 files.push(path.to_path_buf());
1558 }
1559
1560 Ok(files)
1561 }
1562
1563 fn is_test_file(&self, path: &Path) -> bool {
1565 let path_str = path.to_string_lossy().to_lowercase();
1566
1567 if path_str.contains("/tests/")
1569 || path_str.contains("\\tests\\")
1570 || path_str.contains("/test/")
1571 || path_str.contains("\\test\\")
1572 || path_str.contains("/__tests__/")
1573 || path_str.contains("\\__tests__\\")
1574 || path_str.contains("/spec/")
1575 || path_str.contains("\\spec\\")
1576 {
1577 return true;
1578 }
1579
1580 let filename = path
1582 .file_name()
1583 .and_then(|n| n.to_str())
1584 .unwrap_or("")
1585 .to_lowercase();
1586
1587 filename.starts_with("test_")
1588 || filename.ends_with("_test.rs")
1589 || filename.ends_with("_test.py")
1590 || filename.ends_with("_test.go")
1591 || filename.ends_with(".test.ts")
1592 || filename.ends_with(".test.js")
1593 || filename.ends_with(".test.tsx")
1594 || filename.ends_with(".test.jsx")
1595 || filename.ends_with(".spec.ts")
1596 || filename.ends_with(".spec.js")
1597 || filename.ends_with("_spec.rb")
1598 }
1599
1600 fn detect_language(&self, path: &Path) -> String {
1602 path.extension()
1603 .and_then(|e| e.to_str())
1604 .and_then(Language::from_extension)
1605 .map_or_else(|| "unknown".to_owned(), |l| l.display_name().to_owned())
1606 }
1607
1608 fn detect_language_enum(&self, path: &Path) -> Option<Language> {
1610 path.extension()
1611 .and_then(|e| e.to_str())
1612 .and_then(Language::from_extension)
1613 }
1614
1615 fn parse_token_model(&self, model: &str) -> TokenModel {
1617 TokenModel::from_model_name(model).unwrap_or(TokenModel::Claude)
1618 }
1619}
1620
1621pub(crate) fn extract_keywords(content: &str) -> Vec<String> {
1627 use std::collections::BTreeMap;
1628
1629 const STOPWORDS: &[&str] = &[
1630 "the", "a", "an", "and", "or", "not", "is", "are", "was", "were", "be", "been", "being",
1631 "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may",
1632 "might", "shall", "can", "need", "must", "let", "var", "const", "mut", "pub", "fn", "def",
1633 "class", "struct", "enum", "impl", "trait", "use", "import", "from", "return", "if",
1634 "else", "for", "while", "loop", "match", "true", "false", "none", "null", "self", "this",
1635 "super", "new", "type", "static", "async", "await", "try", "catch", "throw", "throws",
1636 "void", "int", "str", "string", "bool", "float", "double", "char", "byte",
1637 ];
1638
1639 let mut freq: BTreeMap<String, usize> = BTreeMap::new();
1640
1641 for token in content.split(|c: char| !c.is_alphanumeric() && c != '_') {
1642 let sub_tokens = split_identifier(token);
1643 for sub in &sub_tokens {
1644 let lower = sub.to_lowercase();
1645 if lower.len() >= 3 && !STOPWORDS.contains(&lower.as_str()) {
1646 *freq.entry(lower).or_insert(0) += 1;
1647 }
1648 }
1649 }
1650
1651 let mut entries: Vec<(String, usize)> = freq.into_iter().collect();
1652 entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
1653 entries.into_iter().take(10).map(|(word, _)| word).collect()
1654}
1655
1656pub(crate) fn generate_context_prefix(
1662 file_path: &str,
1663 parent: Option<&str>,
1664 kind: &crate::types::SymbolKind,
1665) -> String {
1666 let kind_name = match kind {
1667 crate::types::SymbolKind::Function => "function",
1668 crate::types::SymbolKind::Method => "method",
1669 crate::types::SymbolKind::Class => "class",
1670 crate::types::SymbolKind::Struct => "struct",
1671 crate::types::SymbolKind::Enum => "enum",
1672 crate::types::SymbolKind::Interface => "interface",
1673 crate::types::SymbolKind::Trait => "trait",
1674 crate::types::SymbolKind::Import => "import",
1675 crate::types::SymbolKind::Constant => "constant",
1676 crate::types::SymbolKind::Variable => "variable",
1677 crate::types::SymbolKind::TypeAlias => "type",
1678 crate::types::SymbolKind::Export => "export",
1679 crate::types::SymbolKind::Module => "module",
1680 crate::types::SymbolKind::Macro => "macro",
1681 };
1682
1683 match parent {
1684 Some(p) => format!("From {file_path}, in {p}, {kind_name}"),
1685 None => format!("From {file_path}, {kind_name}"),
1686 }
1687}
1688
/// Derives semantic tags from a symbol's name and (optional) signature.
///
/// Each tag is a keyword heuristic: name checks are case-insensitive
/// (lowercased once up front), signature checks are case-sensitive against
/// the raw text. Tags are appended at most once each, in a fixed order, so
/// the output is deterministic for a given input.
pub(crate) fn generate_tags_for_symbol(name: &str, sig: Option<&str>) -> Vec<String> {
    let signature = sig.unwrap_or("");
    let name_lower = name.to_lowercase();

    // Case-insensitive substring test against the symbol name.
    let name_has = |needles: &[&str]| needles.iter().any(|n| name_lower.contains(n));
    // Case-sensitive substring test against the raw signature.
    let sig_has = |needles: &[&str]| needles.iter().any(|n| signature.contains(n));

    let mut tags: Vec<String> = Vec::new();

    if sig_has(&["async", "await", "suspend"]) {
        tags.push("async".to_owned());
    }
    if name_has(&["thread", "mutex", "lock", "spawn", "parallel", "goroutine", "channel"])
        || sig_has(&["Mutex", "RwLock", "Arc", "chan ", "<-chan", "chan<-", "sync.", "WaitGroup"])
    {
        tags.push("concurrency".to_owned());
    }
    if name_has(&["password", "token", "secret", "auth", "crypt", "hash", "permission"])
        || sig_has(&["password", "token", "secret"])
    {
        tags.push("security".to_owned());
    }
    if sig_has(&["Error", "Result"]) || name_has(&["error", "exception", "panic", "unwrap"]) {
        tags.push("error-handling".to_owned());
    }
    if name_has(&["query", "sql", "database", "db_", "repository", "transaction"])
        || name_lower.starts_with("db")
    {
        tags.push("database".to_owned());
    }
    if name_has(&["http", "request", "response", "endpoint", "route", "handler", "middleware"]) {
        tags.push("http".to_owned());
    }
    if name_has(&["command", "cli", "arg", "flag", "option", "subcommand"]) {
        tags.push("cli".to_owned());
    }
    if name_has(&["config", "setting", "preference", "option", "env"]) {
        tags.push("config".to_owned());
    }
    if name_has(&["log", "trace", "debug", "warn", "info", "metric"]) {
        tags.push("logging".to_owned());
    }
    if name_has(&["cache", "memoize", "invalidate"]) {
        tags.push("cache".to_owned());
    }
    if name_has(&["valid", "check", "verify", "assert", "sanitize"]) {
        tags.push("validation".to_owned());
    }
    if name_has(&[
        "serial", "deserial", "json", "xml", "yaml", "toml", "encode", "decode", "parse", "format",
    ]) {
        tags.push("serialization".to_owned());
    }
    if name_has(&["file", "read", "write", "path", "dir", "fs", "io"]) {
        tags.push("io".to_owned());
    }
    if name_has(&["socket", "connect", "network", "tcp", "udp", "client", "server"]) {
        tags.push("network".to_owned());
    }
    if matches!(name_lower.as_str(), "new" | "init" | "setup" | "create")
        || ["new_", "init_", "create_"].iter().any(|p| name_lower.starts_with(p))
        || name_lower.ends_with("_new")
    {
        tags.push("init".to_owned());
    }
    if name_has(&["cleanup", "teardown", "close", "dispose", "shutdown"]) || name_lower == "drop" {
        tags.push("cleanup".to_owned());
    }
    // Test detection mixes case-sensitive checks ("Test") with lowercase ones.
    if name.starts_with("test_")
        || name.ends_with("_test")
        || name.contains("Test")
        || name_has(&["mock", "stub", "fixture"])
    {
        tags.push("test".to_owned());
    }
    if sig_has(&["deprecated", "Deprecated"]) {
        tags.push("deprecated".to_owned());
    }
    if ["pub fn", "pub async fn", "export"].iter().any(|p| signature.starts_with(p)) {
        tags.push("public-api".to_owned());
    }
    if name_has(&[
        "model", "train", "predict", "inference", "neural", "embedding", "classifier",
        "regressor", "optimizer", "loss", "gradient", "backprop", "forward", "layer",
        "activation", "weight", "bias", "epoch", "batch",
    ]) || sig_has(&["torch", "tensorflow", "keras", "sklearn", "nn.", "nn::"])
    {
        tags.push("ml".to_owned());
    }
    if name_has(&[
        "dataframe", "dataset", "tensor", "numpy", "pandas", "array", "matrix", "vector",
        "feature", "preprocess", "normalize", "transform", "pipeline", "etl", "aggregate",
        "groupby", "pivot",
    ]) || sig_has(&["pd.", "np.", "DataFrame", "ndarray"])
    {
        tags.push("data-science".to_owned());
    }

    tags
}
1926
1927pub(crate) fn generate_summary(
1937 kind: ChunkKind,
1938 source: &ChunkSource,
1939 context: &ChunkContext,
1940) -> Option<String> {
1941 if kind == ChunkKind::Imports {
1943 return None;
1944 }
1945
1946 if let Some(ref docstring) = context.docstring {
1948 let cleaned = strip_doc_markers(docstring);
1949 if !cleaned.is_empty() && cleaned.len() <= 400 {
1950 return Some(cleaned);
1951 }
1952 if !cleaned.is_empty() {
1954 let first_line = extract_first_sentence(&cleaned);
1955 if !first_line.is_empty() {
1956 return Some(first_line);
1957 }
1958 }
1959 }
1960
1961 let file_module = file_path_to_module(&source.file);
1963
1964 match kind {
1965 ChunkKind::TopLevel => {
1966 return Some(format!("Top-level code in {}", source.file));
1967 },
1968 ChunkKind::Imports => return None,
1969 _ => {},
1970 }
1971
1972 let visibility_prefix = format_visibility(source.visibility);
1973 let kind_label = kind.name();
1974 let symbol = &source.symbol;
1975
1976 match kind {
1977 ChunkKind::Function | ChunkKind::Method | ChunkKind::FunctionPart => {
1978 let sig_part = context
1979 .signature
1980 .as_deref()
1981 .map(|s| format!(" -- {}", truncate_signature(s, 200)))
1982 .unwrap_or_default();
1983 Some(format!(
1984 "{}{} '{}' in {}{}",
1985 visibility_prefix, kind_label, symbol, file_module, sig_part
1986 ))
1987 },
1988 ChunkKind::Class | ChunkKind::Struct | ChunkKind::ClassPart => {
1989 Some(format!("{}{} '{}' in {}", visibility_prefix, kind_label, symbol, file_module))
1990 },
1991 ChunkKind::Enum => {
1992 Some(format!("{}enum '{}' in {}", visibility_prefix, symbol, file_module))
1993 },
1994 ChunkKind::Interface | ChunkKind::Trait => {
1995 Some(format!("{}{} '{}' in {}", visibility_prefix, kind_label, symbol, file_module))
1996 },
1997 ChunkKind::Constant | ChunkKind::Variable => {
1998 Some(format!("{}{} '{}' in {}", visibility_prefix, kind_label, symbol, file_module))
1999 },
2000 ChunkKind::Module => {
2001 Some(format!("{}module '{}' in {}", visibility_prefix, symbol, file_module))
2002 },
2003 _ => None,
2004 }
2005}
2006
/// Normalizes the first line of a docstring by trimming comment syntax
/// (Rust, C-style, Python, and shell markers) from both ends.
fn strip_doc_markers(docstring: &str) -> String {
    // Checked in order, so longer markers win over their shorter prefixes
    // (e.g. "/**" before "/*", "# " before "#").
    const PREFIXES: &[&str] = &[
        "///", "//!", "/**", "/*", "*/", "* ", "*", "\"\"\"", "'''", "# ", "#",
    ];
    const SUFFIXES: &[&str] = &["\"\"\"", "'''", "*/"];

    let first_line = docstring.lines().next().unwrap_or("").trim();

    let stripped = PREFIXES
        .iter()
        .find_map(|p| first_line.strip_prefix(p))
        .unwrap_or(first_line)
        .trim();
    let stripped = SUFFIXES
        .iter()
        .find_map(|s| stripped.strip_suffix(s))
        .unwrap_or(stripped);

    stripped.trim().to_owned()
}
2039
/// Returns the first sentence of `text`'s first line, capped at 400 bytes.
///
/// A sentence ends at `.`, `!`, or `?` followed by whitespace or end-of-line
/// (so "v1.2" stays intact). Overlong results are truncated with a trailing
/// "..." at a char boundary, so multi-byte UTF-8 can never cause a panic.
fn extract_first_sentence(text: &str) -> String {
    let first_line = text.lines().next().unwrap_or(text);

    let mut end = first_line.len();
    for (i, ch) in first_line.char_indices() {
        if matches!(ch, '.' | '!' | '?') {
            let next_idx = i + ch.len_utf8();
            // Only treat the punctuation as a sentence end when followed by
            // whitespace (or nothing).
            if next_idx >= first_line.len()
                || first_line[next_idx..].starts_with(char::is_whitespace)
            {
                end = next_idx;
                break;
            }
        }
    }

    let result = first_line[..end].trim();
    if result.len() > 400 {
        // BUG FIX: slicing at the fixed byte offset 397 panicked whenever it
        // fell mid-codepoint; back the cut off to a char boundary instead.
        let mut cut = 397;
        while !result.is_char_boundary(cut) {
            cut -= 1;
        }
        format!("{}...", &result[..cut])
    } else {
        result.to_owned()
    }
}
2070
/// Converts a source file path into a rough module label: normalizes
/// separators, drops a leading src/lib/main directory, removes everything
/// after the last dot, and joins the remaining components with `::`.
fn file_path_to_module(file_path: &str) -> String {
    let normalized = file_path.replace('\\', "/");

    let mut trimmed = normalized.as_str();
    for prefix in ["src/", "lib/", "main/"] {
        if let Some(rest) = normalized.strip_prefix(prefix) {
            trimmed = rest;
            break;
        }
    }

    let without_ext = match trimmed.rsplit_once('.') {
        Some((base, _)) => base,
        None => trimmed,
    };

    without_ext.replace('/', "::")
}
2093
2094fn format_visibility(vis: Visibility) -> &'static str {
2099 match vis {
2100 Visibility::Public => "Public ",
2101 Visibility::Private => "Private ",
2102 Visibility::Protected => "Protected ",
2103 Visibility::Internal => "Internal ",
2104 }
2105}
2106
/// Collapses a (possibly multi-line) signature to one line and truncates it
/// to at most `max_len` bytes, appending "..." when shortened.
///
/// The cut point is backed off to a char boundary so multi-byte UTF-8 in a
/// signature cannot cause a slicing panic.
fn truncate_signature(sig: &str, max_len: usize) -> String {
    let oneliner: String = sig
        .lines()
        .map(str::trim)
        .filter(|l| !l.is_empty())
        .collect::<Vec<_>>()
        .join(" ");

    if oneliner.len() <= max_len {
        return oneliner;
    }

    // BUG FIX: leave room for "...", then ensure the slice lands on a char
    // boundary — the old fixed byte offset panicked mid-codepoint.
    let mut cut = max_len.saturating_sub(3);
    while !oneliner.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &oneliner[..cut])
}
2126
/// Splits an identifier into its constituent words.
///
/// Handles snake_case (underscores are separators and are dropped) and
/// camelCase/PascalCase (a lower-to-upper transition starts a new word),
/// including embedded acronyms: when a lowercase char follows a run of
/// uppercase chars, the run's final uppercase char is treated as the start of
/// the new word (e.g. "HTTPServer" -> ["HTTP", "Server"]).
///
/// NOTE: the exact statement order is load-bearing — `current` accumulates
/// the word in progress and is flushed into `tokens` at each boundary.
pub(crate) fn split_identifier(ident: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    let mut current = String::new();

    for ch in ident.chars() {
        if ch == '_' {
            // Underscore: word boundary; flush and drop the underscore itself.
            if !current.is_empty() {
                tokens.push(std::mem::take(&mut current));
            }
        } else if ch.is_uppercase() && !current.is_empty() {
            // Upper after lower starts a new word; upper after upper extends
            // a possible acronym run.
            let last_was_upper = current.chars().last().is_some_and(|c| c.is_uppercase());
            if !last_was_upper {
                tokens.push(std::mem::take(&mut current));
            }
            current.push(ch);
        } else {
            // Lowercase ending an all-uppercase run of length > 1: the run's
            // last char belongs to the new word (the 'S' in "HTTPServer").
            if ch.is_lowercase() && current.len() > 1 && current.chars().all(|c| c.is_uppercase()) {
                let last = current.pop().unwrap();
                if !current.is_empty() {
                    tokens.push(std::mem::take(&mut current));
                }
                current.push(last);
            }
            current.push(ch);
        }
    }

    // Flush the trailing word.
    if !current.is_empty() {
        tokens.push(current);
    }

    tokens
}
2168
2169pub(crate) fn derive_module_path(file_path: &str, language: &str) -> String {
2179 let path = file_path.replace('\\', "/");
2180 let lang_lower = language.to_lowercase();
2181
2182 match lang_lower.as_str() {
2183 "rust" => derive_module_path_rust(&path),
2184 "python" => derive_module_path_python(&path),
2185 "typescript" | "tsx" | "javascript" | "jsx" => derive_module_path_js(&path),
2186 "java" => derive_module_path_java(&path),
2187 "go" => derive_module_path_go(&path),
2188 _ => derive_module_path_default(&path),
2189 }
2190}
2191
/// Derives a Rust module path: strips `src/`, treats crate roots as empty,
/// drops `/mod.rs` or `.rs`, and joins components with `::`.
fn derive_module_path_rust(path: &str) -> String {
    let without_src = path.strip_prefix("src/").unwrap_or(path);

    // Crate roots have no module path of their own.
    if matches!(without_src, "lib.rs" | "main.rs") {
        return String::new();
    }

    let module = without_src
        .strip_suffix("/mod.rs")
        .or_else(|| without_src.strip_suffix(".rs"))
        .unwrap_or(without_src);

    module.replace('/', "::")
}
2214
/// Derives a Python module path: strips a leading `src/` or `lib/`, maps
/// package `__init__.py` files to their package, drops `.py`, and joins
/// components with dots.
fn derive_module_path_python(path: &str) -> String {
    let rest = ["src/", "lib/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    // A package's __init__.py represents the package directory itself.
    if let Some(package) = rest.strip_suffix("/__init__.py") {
        return package.replace('/', ".");
    }
    if rest == "__init__.py" {
        return String::new();
    }

    rest.strip_suffix(".py").unwrap_or(rest).replace('/', ".")
}
2241
/// Derives a JS/TS module path: strips a leading `src/` or `lib/`, maps
/// `index.*` barrel files to their directory, and drops the extension.
fn derive_module_path_js(path: &str) -> String {
    let rest = ["src/", "lib/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    // A directory's index file represents the directory itself.
    if let Some(dir) = ["/index.ts", "/index.tsx", "/index.js", "/index.jsx"]
        .iter()
        .find_map(|suffix| rest.strip_suffix(suffix))
    {
        return dir.to_owned();
    }
    if rest.starts_with("index.") {
        return String::new();
    }

    [".ts", ".tsx", ".js", ".jsx"]
        .iter()
        .find_map(|ext| rest.strip_suffix(ext))
        .unwrap_or(rest)
        .to_owned()
}
2273
/// Derives a Java package name: strips the Maven-style source roots, drops
/// `.java` and the class-file component, and joins the rest with dots.
fn derive_module_path_java(path: &str) -> String {
    let rest = ["src/main/java/", "src/test/java/", "src/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    let no_ext = rest.strip_suffix(".java").unwrap_or(rest);

    // The final component is the class file, not part of the package; a file
    // with no directory lives in the default (empty) package.
    match no_ext.rfind('/') {
        Some(idx) => no_ext[..idx].replace('/', "."),
        None => String::new(),
    }
}
2299
/// Derives a Go package path: strips a leading `internal/`, `pkg/`, or
/// `cmd/`, drops `.go`, and keeps only the directory portion.
fn derive_module_path_go(path: &str) -> String {
    let rest = ["internal/", "pkg/", "cmd/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    let no_ext = rest.strip_suffix(".go").unwrap_or(rest);

    // The package is the containing directory; a bare filename keeps its stem.
    match no_ext.rfind('/') {
        Some(idx) => no_ext[..idx].to_owned(),
        None => no_ext.to_owned(),
    }
}
2324
/// Generic fallback: strips a leading `src/` or `lib/`, removes everything
/// after the last dot, and joins components with `::`.
fn derive_module_path_default(path: &str) -> String {
    let rest = ["src/", "lib/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    let no_ext = match rest.rfind('.') {
        Some(idx) => &rest[..idx],
        None => rest,
    };

    no_ext.replace('/', "::")
}
2343
2344#[cfg(test)]
2345mod tests {
2346 use super::*;
2347 use crate::embedding::progress::QuietProgress;
2348 use tempfile::TempDir;
2349
2350 fn create_test_file(dir: &Path, name: &str, content: &str) {
2351 let path = dir.join(name);
2352 if let Some(parent) = path.parent() {
2353 std::fs::create_dir_all(parent).unwrap();
2354 }
2355 std::fs::write(path, content).unwrap();
2356 }
2357
2358 #[test]
2359 fn test_chunker_creation() {
2360 let settings = EmbedSettings::default();
2361 let limits = ResourceLimits::default();
2362 let chunker = EmbedChunker::new(settings, limits);
2363 assert!(chunker.settings.max_tokens > 0);
2364 }
2365
    #[test]
    fn test_chunk_single_file() {
        let temp_dir = TempDir::new().unwrap();
        let rust_code = r#"
/// A test function
fn hello() {
    println!("Hello, world!");
}

fn goodbye() {
    println!("Goodbye!");
}
"#;
        create_test_file(temp_dir.path(), "test.rs", rust_code);

        let settings = EmbedSettings::default();
        let mut chunker = EmbedChunker::with_defaults(settings);
        let progress = QuietProgress;

        let chunks = chunker
            .chunk_repository(temp_dir.path(), &progress)
            .unwrap();

        // A file with two functions must produce at least one chunk.
        assert!(!chunks.is_empty());

        // Chunks must come back ordered by file path for deterministic output.
        for i in 1..chunks.len() {
            assert!(chunks[i - 1].source.file <= chunks[i].source.file);
        }
    }
2397
    #[test]
    fn test_determinism() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(temp_dir.path(), "a.rs", "fn foo() {}");
        create_test_file(temp_dir.path(), "b.rs", "fn bar() {}");

        let settings = EmbedSettings::default();
        let progress = QuietProgress;

        // Chunk the same repository three times with fresh chunker instances.
        let results: Vec<Vec<EmbedChunk>> = (0..3)
            .map(|_| {
                let mut chunker = EmbedChunker::with_defaults(settings.clone());
                chunker
                    .chunk_repository(temp_dir.path(), &progress)
                    .unwrap()
            })
            .collect();

        // Every run must produce identical chunk counts and chunk ids.
        for i in 1..results.len() {
            assert_eq!(results[0].len(), results[i].len());
            for j in 0..results[0].len() {
                assert_eq!(results[0][j].id, results[i][j].id);
            }
        }
    }
2424
    #[test]
    fn test_file_too_large() {
        let temp_dir = TempDir::new().unwrap();
        // 200 bytes of content against a 100-byte per-file size limit.
        let large_content = "x".repeat(200);
        create_test_file(temp_dir.path(), "large.rs", &large_content);

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default().with_max_file_size(100);
        let mut chunker = EmbedChunker::new(settings, limits);
        let progress = QuietProgress;

        let result = chunker.chunk_repository(temp_dir.path(), &progress);

        // Exceeding the configured limit must surface as an error.
        assert!(result.is_err());
    }
2444
2445 #[test]
2446 fn test_empty_directory() {
2447 let temp_dir = TempDir::new().unwrap();
2448
2449 let settings = EmbedSettings::default();
2450 let mut chunker = EmbedChunker::with_defaults(settings);
2451 let progress = QuietProgress;
2452
2453 let result = chunker.chunk_repository(temp_dir.path(), &progress);
2454
2455 assert!(matches!(result, Err(EmbedError::NoChunksGenerated { .. })));
2456 }
2457
2458 #[test]
2459 fn test_language_detection() {
2460 let chunker = EmbedChunker::with_defaults(EmbedSettings::default());
2461
2462 assert_eq!(chunker.detect_language(Path::new("test.rs")), "Rust");
2463 assert_eq!(chunker.detect_language(Path::new("test.py")), "Python");
2464 assert_eq!(chunker.detect_language(Path::new("test.unknown")), "unknown");
2465 }
2466
    #[test]
    fn test_is_test_code() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // A test-style symbol name marks test code regardless of path.
        let test_symbol = Symbol::new("test_foo", crate::types::SymbolKind::Function);
        assert!(chunker.is_test_code(Path::new("foo.rs"), &test_symbol));

        let normal_symbol = Symbol::new("foo", crate::types::SymbolKind::Function);
        assert!(!chunker.is_test_code(Path::new("src/lib.rs"), &normal_symbol));

        // A test-style path marks test code regardless of symbol name.
        assert!(chunker.is_test_code(Path::new("tests/test_foo.rs"), &normal_symbol));
    }
2480
    #[test]
    fn test_generate_tags() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        let mut symbol = Symbol::new("authenticate_user", crate::types::SymbolKind::Function);
        symbol.signature = Some("async fn authenticate_user(password: &str)".to_owned());

        // "async" comes from the signature; "security" from the auth/password hints.
        let tags = chunker.generate_tags(&symbol);
        assert!(tags.contains(&"async".to_owned()));
        assert!(tags.contains(&"security".to_owned()));
    }
2492
    #[test]
    fn test_generate_tags_kotlin_suspend() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // Kotlin's "suspend" keyword is one of the async signature markers.
        let mut symbol = Symbol::new("fetchData", crate::types::SymbolKind::Function);
        symbol.signature = Some("suspend fun fetchData(): Result<Data>".to_owned());

        let tags = chunker.generate_tags(&symbol);
        assert!(tags.contains(&"async".to_owned()), "Kotlin suspend should be tagged as async");
    }
2503
    #[test]
    fn test_generate_tags_go_concurrency() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // A Go channel parameter ("chan ") is a concurrency signature marker.
        let mut symbol = Symbol::new("processMessages", crate::types::SymbolKind::Function);
        symbol.signature = Some("func processMessages(ch chan string)".to_owned());

        let tags = chunker.generate_tags(&symbol);
        assert!(
            tags.contains(&"concurrency".to_owned()),
            "Go channels should be tagged as concurrency"
        );
    }
2517
    #[test]
    fn test_generate_tags_ml() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // ML keywords in the symbol name ("train", "model").
        let mut symbol = Symbol::new("train_model", crate::types::SymbolKind::Function);
        symbol.signature = Some("def train_model(epochs: int, batch_size: int)".to_owned());
        let tags = chunker.generate_tags(&symbol);
        assert!(tags.contains(&"ml".to_owned()), "train_model should be tagged as ml");

        // ML framework references in the signature ("torch").
        let mut symbol2 = Symbol::new("forward_pass", crate::types::SymbolKind::Function);
        symbol2.signature = Some("def forward_pass(self, x: torch.Tensor)".to_owned());
        let tags2 = chunker.generate_tags(&symbol2);
        assert!(
            tags2.contains(&"ml".to_owned()),
            "torch.Tensor in signature should be tagged as ml"
        );

        // Framework base classes ("nn.") also count as ML markers.
        let mut symbol3 = Symbol::new("ImageClassifier", crate::types::SymbolKind::Class);
        symbol3.signature = Some("class ImageClassifier(nn.Module)".to_owned());
        let tags3 = chunker.generate_tags(&symbol3);
        assert!(tags3.contains(&"ml".to_owned()), "nn.Module should be tagged as ml");
    }
2543
    #[test]
    fn test_generate_tags_data_science() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // Pandas types in the signature ("DataFrame", "pd.").
        let mut symbol = Symbol::new("preprocess_dataframe", crate::types::SymbolKind::Function);
        symbol.signature = Some("def preprocess_dataframe(df: pd.DataFrame)".to_owned());
        let tags = chunker.generate_tags(&symbol);
        assert!(
            tags.contains(&"data-science".to_owned()),
            "DataFrame should be tagged as data-science"
        );

        // NumPy types in the signature ("np.", "ndarray").
        let mut symbol2 = Symbol::new("normalize_array", crate::types::SymbolKind::Function);
        symbol2.signature = Some("def normalize_array(arr: np.ndarray)".to_owned());
        let tags2 = chunker.generate_tags(&symbol2);
        assert!(
            tags2.contains(&"data-science".to_owned()),
            "np.ndarray should be tagged as data-science"
        );

        // Keyword in the name alone ("etl"), with no signature at all.
        let symbol3 = Symbol::new("run_etl_pipeline", crate::types::SymbolKind::Function);
        let tags3 = chunker.generate_tags(&symbol3);
        assert!(tags3.contains(&"data-science".to_owned()), "etl should be tagged as data-science");
    }
2571
2572 #[test]
2573 fn test_brace_nesting_depth() {
2574 let chunker = EmbedChunker::with_defaults(EmbedSettings::default());
2575
2576 let code = "fn foo() { if x { if y { } } }";
2578 assert_eq!(chunker.calculate_brace_depth(code), 3);
2579
2580 let flat = "let x = 1;";
2582 assert_eq!(chunker.calculate_brace_depth(flat), 0);
2583
2584 let deep = "fn f() { let a = vec![HashMap::new()]; }";
2586 assert!(chunker.calculate_brace_depth(deep) >= 2);
2587 }
2588
2589 #[test]
2590 fn test_indent_nesting_depth() {
2591 let chunker = EmbedChunker::with_defaults(EmbedSettings::default());
2592
2593 let python_code = r#"
2595def foo():
2596 if x:
2597 if y:
2598 do_something()
2599 else:
2600 other()
2601"#;
2602 let depth = chunker.calculate_indent_depth(python_code);
2603 assert!(depth >= 3, "Should detect indentation nesting, got {}", depth);
2604
2605 let flat = "x = 1\ny = 2\n";
2607 assert!(chunker.calculate_indent_depth(flat) <= 1);
2608 }
2609
2610 #[test]
2611 fn test_combined_nesting_depth() {
2612 let chunker = EmbedChunker::with_defaults(EmbedSettings::default());
2613
2614 let rust_code = "fn foo() { if x { match y { A => {}, B => {} } } }";
2616 let depth = chunker.calculate_nesting_depth(rust_code);
2617 assert!(depth >= 3, "Should use brace depth for Rust-like code");
2618
2619 let python_code = "def foo():\n if x:\n y()\n";
2621 let depth = chunker.calculate_nesting_depth(python_code);
2622 assert!(depth >= 1, "Should use indent depth for Python-like code");
2623 }
2624
2625 #[test]
2626 fn test_lines_of_code() {
2627 let chunker = EmbedChunker::with_defaults(EmbedSettings::default());
2628
2629 let code = r#"
2630// This is a comment
2631fn foo() {
2632 let x = 1;
2633
2634 // Another comment
2635 let y = 2;
2636}
2637"#;
2638 let loc = chunker.count_lines_of_code(code);
2639 assert!((4..=5).contains(&loc), "LOC should be ~4, got {}", loc);
2642 }
2643
2644 #[test]
2645 fn test_line_too_long_error() {
2646 let temp_dir = TempDir::new().unwrap();
2647
2648 let long_line = "x".repeat(50_000);
2650 let content = format!("fn foo() {{ {} }}", long_line);
2651 create_test_file(temp_dir.path(), "minified.rs", &content);
2652
2653 let settings = EmbedSettings::default();
2654 let limits = ResourceLimits::default().with_max_line_length(10_000);
2656 let mut chunker = EmbedChunker::new(settings, limits);
2657 let progress = QuietProgress;
2658
2659 let result = chunker.chunk_repository(temp_dir.path(), &progress);
2660
2661 assert!(result.is_err(), "Should reject files with very long lines");
2663 }
2664
2665 #[test]
2666 fn test_hierarchical_chunking_integration() {
2667 let temp_dir = TempDir::new().unwrap();
2668
2669 let rust_code = r#"
2671/// A user account
2672pub struct User {
2673 pub name: String,
2674 pub email: String,
2675}
2676
2677impl User {
2678 /// Create a new user
2679 pub fn new(name: String, email: String) -> Self {
2680 Self { name, email }
2681 }
2682
2683 /// Get the user's display name
2684 pub fn display_name(&self) -> &str {
2685 &self.name
2686 }
2687
2688 /// Validate the user's email
2689 pub fn validate_email(&self) -> bool {
2690 self.email.contains('@')
2691 }
2692}
2693"#;
2694 create_test_file(temp_dir.path(), "user.rs", rust_code);
2695
2696 let settings_no_hierarchy = EmbedSettings { enable_hierarchy: false, ..Default::default() };
2698 let mut chunker_no_hierarchy = EmbedChunker::with_defaults(settings_no_hierarchy);
2699 let progress = QuietProgress;
2700 let chunks_no_hierarchy = chunker_no_hierarchy
2701 .chunk_repository(temp_dir.path(), &progress)
2702 .unwrap();
2703
2704 let settings_with_hierarchy = EmbedSettings {
2706 enable_hierarchy: true,
2707 hierarchy_min_children: 2,
2708 ..Default::default()
2709 };
2710 let mut chunker_with_hierarchy = EmbedChunker::with_defaults(settings_with_hierarchy);
2711 let chunks_with_hierarchy = chunker_with_hierarchy
2712 .chunk_repository(temp_dir.path(), &progress)
2713 .unwrap();
2714
2715 assert!(
2717 chunks_with_hierarchy.len() >= chunks_no_hierarchy.len(),
2718 "Hierarchy should produce at least as many chunks: {} vs {}",
2719 chunks_with_hierarchy.len(),
2720 chunks_no_hierarchy.len()
2721 );
2722
2723 let summary_chunks: Vec<_> = chunks_with_hierarchy
2725 .iter()
2726 .filter(|c| matches!(c.kind, ChunkKind::Module)) .collect();
2728
2729 if !summary_chunks.is_empty() {
2732 for summary in &summary_chunks {
2734 assert!(!summary.content.is_empty(), "Summary chunk should have content");
2735 }
2736 }
2737
2738 let chunks_with_hierarchy_2 = chunker_with_hierarchy
2740 .chunk_repository(temp_dir.path(), &progress)
2741 .unwrap();
2742 assert_eq!(
2743 chunks_with_hierarchy.len(),
2744 chunks_with_hierarchy_2.len(),
2745 "Hierarchical chunking should be deterministic"
2746 );
2747 for (c1, c2) in chunks_with_hierarchy
2748 .iter()
2749 .zip(chunks_with_hierarchy_2.iter())
2750 {
2751 assert_eq!(c1.id, c2.id, "Chunk IDs should be identical across runs");
2752 }
2753 }
2754
2755 #[test]
2756 fn test_summary_from_docstring() {
2757 let source = ChunkSource {
2758 repo: RepoIdentifier::default(),
2759 file: "src/auth/jwt.rs".to_owned(),
2760 lines: (10, 20),
2761 symbol: "verify_token".to_owned(),
2762 fqn: None,
2763 language: "Rust".to_owned(),
2764 parent: None,
2765 visibility: Visibility::Public,
2766 is_test: false,
2767 module_path: None,
2768 parent_chunk_id: None,
2769 };
2770 let context = ChunkContext {
2771 docstring: Some("/// Verify a JWT token and return the claims.".to_owned()),
2772 signature: Some("pub fn verify_token(token: &str) -> Result<Claims>".to_owned()),
2773 ..Default::default()
2774 };
2775
2776 let summary = generate_summary(ChunkKind::Function, &source, &context);
2777 assert_eq!(summary, Some("Verify a JWT token and return the claims.".to_owned()));
2778 }
2779
2780 #[test]
2781 fn test_summary_heuristic_for_function() {
2782 let source = ChunkSource {
2783 repo: RepoIdentifier::default(),
2784 file: "src/auth/jwt.rs".to_owned(),
2785 lines: (10, 20),
2786 symbol: "verify_token".to_owned(),
2787 fqn: None,
2788 language: "Rust".to_owned(),
2789 parent: None,
2790 visibility: Visibility::Public,
2791 is_test: false,
2792 module_path: None,
2793 parent_chunk_id: None,
2794 };
2795 let context = ChunkContext {
2796 signature: Some("pub fn verify_token(token: &str) -> Result<Claims>".to_owned()),
2797 ..Default::default()
2798 };
2799
2800 let summary = generate_summary(ChunkKind::Function, &source, &context);
2801 assert_eq!(
2802 summary,
2803 Some(
2804 "Public function 'verify_token' in auth::jwt -- pub fn verify_token(token: &str) -> Result<Claims>"
2805 .to_owned()
2806 )
2807 );
2808 }
2809
2810 #[test]
2811 fn test_summary_heuristic_for_struct() {
2812 let source = ChunkSource {
2813 repo: RepoIdentifier::default(),
2814 file: "lib/models/user.py".to_owned(),
2815 lines: (1, 30),
2816 symbol: "User".to_owned(),
2817 fqn: None,
2818 language: "Python".to_owned(),
2819 parent: None,
2820 visibility: Visibility::Public,
2821 is_test: false,
2822 module_path: None,
2823 parent_chunk_id: None,
2824 };
2825 let context = ChunkContext::default();
2826
2827 let summary = generate_summary(ChunkKind::Class, &source, &context);
2828 assert_eq!(summary, Some("Public class 'User' in models::user".to_owned()));
2829 }
2830
2831 #[test]
2832 fn test_summary_none_for_imports() {
2833 let source = ChunkSource {
2834 repo: RepoIdentifier::default(),
2835 file: "src/lib.rs".to_owned(),
2836 lines: (1, 5),
2837 symbol: "<imports>".to_owned(),
2838 fqn: None,
2839 language: "Rust".to_owned(),
2840 parent: None,
2841 visibility: Visibility::Public,
2842 is_test: false,
2843 module_path: None,
2844 parent_chunk_id: None,
2845 };
2846 let context = ChunkContext::default();
2847
2848 let summary = generate_summary(ChunkKind::Imports, &source, &context);
2849 assert!(summary.is_none(), "Import chunks should not have a summary");
2850 }
2851
2852 #[test]
2853 fn test_summary_long_signature_truncated() {
2854 let long_sig = format!(
2855 "pub fn process({})",
2856 (0..50)
2857 .map(|i| format!("arg{}: SomeVeryLongTypeName", i))
2858 .collect::<Vec<_>>()
2859 .join(", ")
2860 );
2861 let source = ChunkSource {
2862 repo: RepoIdentifier::default(),
2863 file: "src/processor.rs".to_owned(),
2864 lines: (1, 100),
2865 symbol: "process".to_owned(),
2866 fqn: None,
2867 language: "Rust".to_owned(),
2868 parent: None,
2869 visibility: Visibility::Private,
2870 is_test: false,
2871 module_path: None,
2872 parent_chunk_id: None,
2873 };
2874 let context = ChunkContext { signature: Some(long_sig), ..Default::default() };
2875
2876 let summary = generate_summary(ChunkKind::Function, &source, &context).unwrap();
2877 assert!(summary.contains("..."), "Long signature should be truncated with ellipsis");
2879 assert!(summary.len() < 350, "Summary should be concise, got len={}", summary.len());
2881 }
2882
2883 #[test]
2884 fn test_summary_top_level() {
2885 let source = ChunkSource {
2886 repo: RepoIdentifier::default(),
2887 file: "src/main.rs".to_owned(),
2888 lines: (1, 50),
2889 symbol: "<top_level>".to_owned(),
2890 fqn: None,
2891 language: "Rust".to_owned(),
2892 parent: None,
2893 visibility: Visibility::Public,
2894 is_test: false,
2895 module_path: None,
2896 parent_chunk_id: None,
2897 };
2898 let context = ChunkContext::default();
2899
2900 let summary = generate_summary(ChunkKind::TopLevel, &source, &context);
2901 assert_eq!(summary, Some("Top-level code in src/main.rs".to_owned()));
2902 }
2903
2904 #[test]
2905 fn test_file_path_to_module() {
2906 assert_eq!(file_path_to_module("src/auth/jwt.rs"), "auth::jwt");
2907 assert_eq!(file_path_to_module("lib/models/user.py"), "models::user");
2908 assert_eq!(file_path_to_module("main/app.ts"), "app");
2909 assert_eq!(file_path_to_module("other/deep/path.go"), "other::deep::path");
2910 }
2911
2912 #[test]
2913 fn test_strip_doc_markers() {
2914 assert_eq!(strip_doc_markers("/// Hello world"), "Hello world");
2915 assert_eq!(strip_doc_markers("//! Module doc"), "Module doc");
2916 assert_eq!(strip_doc_markers("/** Java doc */"), "Java doc");
2917 assert_eq!(strip_doc_markers("# Python doc"), "Python doc");
2918 assert_eq!(strip_doc_markers("\"\"\"Triple quoted\"\"\""), "Triple quoted");
2919 assert_eq!(strip_doc_markers(" * Javadoc line"), "Javadoc line");
2920 assert_eq!(strip_doc_markers("Plain text"), "Plain text");
2921 }
2922
2923 #[test]
2924 fn test_summary_with_python_docstring() {
2925 let source = ChunkSource {
2926 repo: RepoIdentifier::default(),
2927 file: "src/utils.py".to_owned(),
2928 lines: (1, 10),
2929 symbol: "parse_config".to_owned(),
2930 fqn: None,
2931 language: "Python".to_owned(),
2932 parent: None,
2933 visibility: Visibility::Public,
2934 is_test: false,
2935 module_path: None,
2936 parent_chunk_id: None,
2937 };
2938 let context = ChunkContext {
2939 docstring: Some("\"\"\"Parse configuration from a YAML file.\"\"\"".to_owned()),
2940 ..Default::default()
2941 };
2942
2943 let summary = generate_summary(ChunkKind::Function, &source, &context);
2944 assert_eq!(summary, Some("Parse configuration from a YAML file.".to_owned()));
2945 }
2946}