1use std::collections::VecDeque;
61use std::path::{Path, PathBuf};
62use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
63use std::sync::Arc;
64
65use crate::parser::{parse_file_symbols, Language};
66use crate::security::SecurityScanner;
67use crate::tokenizer::{TokenModel, Tokenizer};
68
69use super::chunker::{generate_summary, generate_tags_for_symbol};
70use super::complexity::compute_complexity;
71use super::error::EmbedError;
72use super::hasher::hash_content;
73use super::identifiers::extract_identifiers;
74use super::limits::ResourceLimits;
75use super::type_extraction::extract_types;
76use super::types::{ChunkContext, ChunkSource, EmbedChunk, EmbedSettings, RepoIdentifier};
77
/// Tuning knobs controlling how a `ChunkStream` batches and buffers work.
#[derive(Debug, Clone)]
pub struct StreamConfig {
    /// Number of files pulled off the pending queue per refill pass.
    pub file_batch_size: usize,

    /// Soft target for how many chunks sit buffered awaiting the consumer.
    pub chunk_buffer_size: usize,

    /// When true, recoverable per-file failures skip the file instead of aborting.
    pub skip_on_error: bool,

    /// Hard ceiling on accumulated errors before the stream cancels itself.
    pub max_errors: usize,

    /// Whether file batches may be processed in parallel.
    pub parallel_batches: bool,
}

impl Default for StreamConfig {
    /// Defaults sized for a typical medium repository.
    fn default() -> Self {
        StreamConfig {
            file_batch_size: 50,
            chunk_buffer_size: 200,
            skip_on_error: true,
            max_errors: 100,
            parallel_batches: true,
        }
    }
}
108
/// Running counters describing how far a `ChunkStream` has progressed.
#[derive(Debug, Clone, Default)]
pub struct StreamStats {
    /// Total files discovered during the initial walk.
    pub total_files: usize,

    /// Files successfully parsed into chunks.
    pub files_processed: usize,

    /// Files skipped because of recoverable errors.
    pub files_skipped: usize,

    /// Chunks emitted so far.
    pub chunks_generated: usize,

    /// Bytes of file content read so far.
    pub bytes_processed: u64,

    /// Errors encountered so far, skippable or not.
    pub error_count: usize,
}

impl StreamStats {
    /// Percentage of discovered files already processed; an empty
    /// repository reports itself as 100% complete.
    pub fn progress_percent(&self) -> f64 {
        match self.total_files {
            0 => 100.0,
            total => (self.files_processed as f64 / total as f64) * 100.0,
        }
    }

    /// Projects how many chunks the unprocessed files will yield, using the
    /// average chunks-per-file seen so far. Zero until the first file has
    /// been processed (no rate to extrapolate from).
    pub fn estimated_chunks_remaining(&self) -> usize {
        if self.files_processed == 0 {
            return 0;
        }
        let per_file = self.chunks_generated as f64 / self.files_processed as f64;
        let left = self.total_files.saturating_sub(self.files_processed);
        (left as f64 * per_file) as usize
    }
}
150
/// Lazily walks a repository and yields `EmbedChunk`s via `Iterator`,
/// processing files in configurable batches to bound memory use.
pub struct ChunkStream {
    /// Files discovered but not yet processed.
    pending_files: VecDeque<PathBuf>,

    /// Chunks (or per-file errors) ready to be yielded by `next`.
    chunk_buffer: VecDeque<Result<EmbedChunk, EmbedError>>,

    /// Canonicalized repository root; all file paths are validated against it.
    repo_root: PathBuf,

    /// Embedding settings (patterns, secret policy, context window, etc.).
    settings: EmbedSettings,

    /// Resource ceilings (file count, file size, line length).
    limits: ResourceLimits,

    /// Streaming behavior knobs (batch size, error policy).
    config: StreamConfig,

    /// Token counter reused across every chunk.
    tokenizer: Tokenizer,

    /// Present only when `settings.scan_secrets` is enabled.
    security_scanner: Option<SecurityScanner>,

    /// Identifier baked into each chunk's source and FQN.
    repo_id: RepoIdentifier,

    /// Progress counters updated as files are processed.
    stats: StreamStats,

    /// Shared cancellation flag, also reachable via `CancellationHandle`.
    cancelled: Arc<AtomicBool>,

    /// Running error tally compared against `config.max_errors`.
    error_count: AtomicUsize,
}
192
193impl ChunkStream {
194 pub fn new(
196 repo_path: impl AsRef<Path>,
197 settings: EmbedSettings,
198 limits: ResourceLimits,
199 ) -> Result<Self, EmbedError> {
200 Self::with_config(repo_path, settings, limits, StreamConfig::default())
201 }
202
203 pub fn with_config(
205 repo_path: impl AsRef<Path>,
206 settings: EmbedSettings,
207 limits: ResourceLimits,
208 config: StreamConfig,
209 ) -> Result<Self, EmbedError> {
210 let repo_root = repo_path
211 .as_ref()
212 .canonicalize()
213 .map_err(|e| EmbedError::IoError {
214 path: repo_path.as_ref().to_path_buf(),
215 source: e,
216 })?;
217
218 if !repo_root.is_dir() {
219 return Err(EmbedError::NotADirectory { path: repo_root });
220 }
221
222 let security_scanner = if settings.scan_secrets {
224 Some(SecurityScanner::new())
225 } else {
226 None
227 };
228
229 let mut stream = Self {
230 pending_files: VecDeque::new(),
231 chunk_buffer: VecDeque::new(),
232 repo_root,
233 settings,
234 limits,
235 config,
236 tokenizer: Tokenizer::new(),
237 security_scanner,
238 repo_id: RepoIdentifier::default(),
239 stats: StreamStats::default(),
240 cancelled: Arc::new(AtomicBool::new(false)),
241 error_count: AtomicUsize::new(0),
242 };
243
244 stream.discover_files()?;
246
247 Ok(stream)
248 }
249
250 pub fn with_repo_id(mut self, repo_id: RepoIdentifier) -> Self {
252 self.repo_id = repo_id;
253 self
254 }
255
    /// Returns a snapshot of the current progress counters.
    pub fn stats(&self) -> &StreamStats {
        &self.stats
    }

    /// Returns a clonable handle that can cancel this stream from another
    /// thread; cancellation is communicated through the shared atomic flag.
    pub fn cancellation_handle(&self) -> CancellationHandle {
        CancellationHandle { cancelled: Arc::clone(&self.cancelled) }
    }

    /// True once cancellation has been requested, either via a handle or
    /// internally after too many errors.
    pub fn is_cancelled(&self) -> bool {
        self.cancelled.load(Ordering::Relaxed)
    }
270
    /// Walks the repository once, collecting every candidate file into
    /// `pending_files` (sorted for deterministic processing order) and
    /// recording the total in `stats.total_files`.
    ///
    /// Filters applied, in order: include globs, exclude globs, recognized
    /// language extension, and (unless `include_tests`) test-path heuristic.
    ///
    /// # Errors
    /// Returns `EmbedError::TooManyFiles` when the discovered count exceeds
    /// the configured limit.
    fn discover_files(&mut self) -> Result<(), EmbedError> {
        use glob::Pattern;
        use ignore::WalkBuilder;

        // NOTE(review): invalid glob patterns are silently dropped by
        // `.ok()` here — a typo in a user pattern disables it with no
        // diagnostic. Consider surfacing a warning/error upstream.
        let include_patterns: Vec<Pattern> = self
            .settings
            .include_patterns
            .iter()
            .filter_map(|p| Pattern::new(p).ok())
            .collect();

        let exclude_patterns: Vec<Pattern> = self
            .settings
            .exclude_patterns
            .iter()
            .filter_map(|p| Pattern::new(p).ok())
            .collect();

        // Honors .gitignore (local, global, and .git/info/exclude); visits
        // hidden files and does not follow symlinks.
        let walker = WalkBuilder::new(&self.repo_root)
            .hidden(false)
            .git_ignore(true)
            .git_global(true)
            .git_exclude(true)
            .follow_links(false)
            .build();

        let mut files = Vec::new();

        for entry in walker.flatten() {
            let path = entry.path();

            if !path.is_file() {
                continue;
            }

            // Globs are matched against the repo-relative path.
            let relative = path
                .strip_prefix(&self.repo_root)
                .unwrap_or(path)
                .to_string_lossy();

            // An empty include list means "include everything".
            if !include_patterns.is_empty()
                && !include_patterns.iter().any(|p| p.matches(&relative))
            {
                continue;
            }

            if exclude_patterns.iter().any(|p| p.matches(&relative)) {
                continue;
            }

            // Only files with an extension mapped to a supported language.
            let ext = match path.extension().and_then(|e| e.to_str()) {
                Some(e) => e,
                None => continue,
            };

            if Language::from_extension(ext).is_none() {
                continue;
            }

            if !self.settings.include_tests && self.is_test_file(path) {
                continue;
            }

            files.push(path.to_path_buf());
        }

        // Sort for a deterministic processing order across runs.
        files.sort();

        self.stats.total_files = files.len();
        self.pending_files = files.into();

        if !self.limits.check_file_count(self.stats.total_files) {
            return Err(EmbedError::TooManyFiles {
                count: self.stats.total_files,
                max: self.limits.max_files,
            });
        }

        Ok(())
    }
360
361 fn is_test_file(&self, path: &Path) -> bool {
363 let path_str = path.to_string_lossy().to_lowercase();
364
365 path_str.contains("/tests/")
366 || path_str.contains("\\tests\\")
367 || path_str.contains("/test/")
368 || path_str.contains("\\test\\")
369 || path_str.contains("/__tests__/")
370 || path_str.contains("\\__tests__\\")
371 }
372
    /// Processes one batch of pending files into `chunk_buffer`.
    ///
    /// Returns `true` while there is more to yield (buffered chunks or
    /// pending files), `false` when exhausted or cancelled.
    fn fill_buffer(&mut self) -> bool {
        if self.is_cancelled() {
            return false;
        }

        let batch_size = self.config.file_batch_size.min(self.pending_files.len());
        if batch_size == 0 {
            return false;
        }

        let batch: Vec<_> = (0..batch_size)
            .filter_map(|_| self.pending_files.pop_front())
            .collect();

        for file_path in batch {
            // Cancellation is observed between files, not mid-file.
            if self.is_cancelled() {
                break;
            }

            match self.process_file(&file_path) {
                Ok(chunks) => {
                    self.stats.files_processed += 1;
                    self.stats.chunks_generated += chunks.len();

                    for chunk in chunks {
                        self.chunk_buffer.push_back(Ok(chunk));
                    }
                },
                Err(e) => {
                    self.stats.error_count += 1;
                    // fetch_add returns the previous value; +1 gives the new total.
                    let current_errors = self.error_count.fetch_add(1, Ordering::Relaxed) + 1;

                    // Skippable error under skip_on_error: record the skip and
                    // surface the error to the consumer unless it is critical.
                    // Note the max_errors check is bypassed on this path.
                    if e.is_skippable() && self.config.skip_on_error {
                        self.stats.files_skipped += 1;
                        if !e.is_critical() {
                            self.chunk_buffer.push_back(Err(e));
                        }
                    } else if current_errors >= self.config.max_errors {
                        // Error budget exhausted: emit a terminal error and
                        // cancel the whole stream.
                        self.chunk_buffer.push_back(Err(EmbedError::TooManyErrors {
                            count: current_errors,
                            max: self.config.max_errors,
                        }));
                        self.cancelled.store(true, Ordering::Relaxed);
                        break;
                    } else if e.is_critical() {
                        self.chunk_buffer.push_back(Err(e));
                        break;
                    }
                    // NOTE(review): a non-skippable, non-critical error below
                    // max_errors falls through all three branches and is
                    // dropped without being yielded — it is only reflected in
                    // stats.error_count. Confirm this is intentional.
                },
            }
        }

        !self.chunk_buffer.is_empty() || !self.pending_files.is_empty()
    }
432
433 fn process_file(&mut self, path: &Path) -> Result<Vec<EmbedChunk>, EmbedError> {
435 let metadata = std::fs::metadata(path)
437 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
438
439 if !self.limits.check_file_size(metadata.len()) {
440 return Err(EmbedError::FileTooLarge {
441 path: path.to_path_buf(),
442 size: metadata.len(),
443 max: self.limits.max_file_size,
444 });
445 }
446
447 let mut content = std::fs::read_to_string(path)
449 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
450
451 self.stats.bytes_processed += content.len() as u64;
452
453 if let Some(max_line_len) = content.lines().map(|l| l.len()).max() {
455 if !self.limits.check_line_length(max_line_len) {
456 return Err(EmbedError::LineTooLong {
457 path: path.to_path_buf(),
458 length: max_line_len,
459 max: self.limits.max_line_length,
460 });
461 }
462 }
463
464 let relative_path = self.safe_relative_path(path)?;
466
467 if let Some(ref scanner) = self.security_scanner {
468 let findings = scanner.scan(&content, &relative_path);
469 if !findings.is_empty() {
470 if self.settings.fail_on_secrets {
471 let files = findings
472 .iter()
473 .map(|f| format!(" {}:{} - {}", f.file, f.line, f.kind.name()))
474 .collect::<Vec<_>>()
475 .join("\n");
476 return Err(EmbedError::SecretsDetected { count: findings.len(), files });
477 }
478
479 if self.settings.redact_secrets {
480 content = scanner.redact_content(&content, &relative_path);
481 }
482 }
483 }
484
485 let language = self.detect_language(path);
487 let mut symbols = parse_file_symbols(&content, path);
488 symbols.sort_by(|a, b| {
489 a.start_line
490 .cmp(&b.start_line)
491 .then_with(|| a.end_line.cmp(&b.end_line))
492 .then_with(|| a.name.cmp(&b.name))
493 });
494
495 let lines: Vec<&str> = content.lines().collect();
496 let mut chunks = Vec::with_capacity(symbols.len());
497
498 for symbol in &symbols {
499 if !self.settings.include_imports
501 && matches!(symbol.kind, crate::types::SymbolKind::Import)
502 {
503 continue;
504 }
505
506 let start_line = symbol.start_line.saturating_sub(1) as usize;
508 let end_line = (symbol.end_line as usize).min(lines.len());
509 let context_start = start_line.saturating_sub(self.settings.context_lines as usize);
510 let context_end = (end_line + self.settings.context_lines as usize).min(lines.len());
511
512 let chunk_content = lines[context_start..context_end].join("\n");
513
514 let token_model = TokenModel::from_model_name(&self.settings.token_model)
516 .unwrap_or(TokenModel::Claude);
517 let tokens = self.tokenizer.count(&chunk_content, token_model);
518
519 let hash = hash_content(&chunk_content);
521
522 let fqn = self.compute_fqn(&relative_path, symbol);
524
525 let keywords = super::chunker::extract_keywords(&chunk_content);
527 let context_prefix = super::chunker::generate_context_prefix(
528 &relative_path,
529 symbol.parent.as_deref(),
530 &symbol.kind,
531 );
532
533 let lang_enum =
535 Language::from_extension(path.extension().and_then(|e| e.to_str()).unwrap_or(""));
536 let identifiers = extract_identifiers(&chunk_content, lang_enum);
537 let (type_signature, parameter_types, return_type, error_types) =
538 if let Some(lang) = lang_enum {
539 match extract_types(&chunk_content, lang) {
540 Some(ti) => {
541 (ti.type_signature, ti.parameter_types, ti.return_type, ti.error_types)
542 },
543 None => (None, Vec::new(), None, Vec::new()),
544 }
545 } else {
546 (None, Vec::new(), None, Vec::new())
547 };
548 let complexity_score = lang_enum.and_then(|l| compute_complexity(&chunk_content, l));
549 let tags = generate_tags_for_symbol(&symbol.name, symbol.signature.as_deref());
550
551 let chunk_kind = symbol.kind.into();
552 let source = ChunkSource {
553 repo: self.repo_id.clone(),
554 file: relative_path.clone(),
555 lines: ((context_start + 1) as u32, context_end as u32),
556 symbol: symbol.name.clone(),
557 fqn: Some(fqn),
558 language: language.clone(),
559 parent: symbol.parent.clone(),
560 visibility: symbol.visibility.into(),
561 is_test: self.is_test_code(path, symbol),
562 module_path: Some(super::chunker::derive_module_path(&relative_path, &language)),
563 parent_chunk_id: None,
564 };
565
566 let mut context = ChunkContext {
567 docstring: symbol.docstring.clone(),
568 comments: Vec::new(),
569 signature: symbol.signature.clone(),
570 calls: symbol.calls.clone(),
571 called_by: Vec::new(),
572 imports: Vec::new(),
573 tags,
574 keywords,
575 context_prefix: Some(context_prefix),
576 summary: None,
577 qualified_calls: Vec::new(),
578 unresolved_calls: Vec::new(),
579 identifiers,
580 type_signature,
581 parameter_types,
582 return_type,
583 error_types,
584 lines_of_code: chunk_content.lines().count() as u32,
585 max_nesting_depth: 0,
586 git: None,
587 complexity_score,
588 dependents_count: None,
589 };
590
591 context.summary = generate_summary(chunk_kind, &source, &context);
593
594 chunks.push(EmbedChunk {
595 id: hash.short_id,
596 full_hash: hash.full_hash,
597 content: chunk_content,
598 tokens,
599 kind: chunk_kind,
600 source,
601 children_ids: Vec::new(),
602 context,
603 repr: "code".to_owned(),
604 code_chunk_id: None,
605 part: None,
606 });
607 }
608
609 Ok(chunks)
610 }
611
612 fn safe_relative_path(&self, path: &Path) -> Result<String, EmbedError> {
614 let canonical = path
615 .canonicalize()
616 .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
617
618 if !canonical.starts_with(&self.repo_root) {
619 return Err(EmbedError::PathTraversal {
620 path: canonical,
621 repo_root: self.repo_root.clone(),
622 });
623 }
624
625 Ok(canonical
626 .strip_prefix(&self.repo_root)
627 .unwrap_or(&canonical)
628 .to_string_lossy()
629 .replace('\\', "/"))
630 }
631
632 fn detect_language(&self, path: &Path) -> String {
634 path.extension()
635 .and_then(|e| e.to_str())
636 .and_then(Language::from_extension)
637 .map_or_else(|| "unknown".to_owned(), |l| l.display_name().to_owned())
638 }
639
640 fn compute_fqn(&self, file: &str, symbol: &crate::types::Symbol) -> String {
642 let module_path = file
643 .strip_suffix(".rs")
644 .or_else(|| file.strip_suffix(".py"))
645 .or_else(|| file.strip_suffix(".ts"))
646 .or_else(|| file.strip_suffix(".tsx"))
647 .or_else(|| file.strip_suffix(".js"))
648 .or_else(|| file.strip_suffix(".jsx"))
649 .or_else(|| file.strip_suffix(".go"))
650 .unwrap_or(file)
651 .replace(['\\', '/'], "::"); let symbol_part = if let Some(ref parent) = symbol.parent {
655 format!("{}::{}::{}", module_path, parent, symbol.name)
656 } else {
657 format!("{}::{}", module_path, symbol.name)
658 };
659
660 let repo_prefix = self.repo_id.qualified_name();
662 if repo_prefix.is_empty() {
663 symbol_part
664 } else {
665 format!("{}::{}", repo_prefix, symbol_part)
666 }
667 }
668
669 fn is_test_code(&self, path: &Path, symbol: &crate::types::Symbol) -> bool {
671 let path_str = path.to_string_lossy().to_lowercase();
672 let name = symbol.name.to_lowercase();
673
674 path_str.contains("test")
675 || path_str.contains("spec")
676 || name.starts_with("test_")
677 || name.ends_with("_test")
678 }
679
680 pub fn collect_all(self) -> Result<Vec<EmbedChunk>, EmbedError> {
685 let mut chunks = Vec::new();
686 let mut last_error = None;
687
688 for result in self {
689 match result {
690 Ok(chunk) => chunks.push(chunk),
691 Err(e) if e.is_skippable() => {
692 },
694 Err(e) => {
695 last_error = Some(e);
696 },
697 }
698 }
699
700 if let Some(e) = last_error {
701 if chunks.is_empty() {
702 return Err(e);
703 }
704 }
705
706 chunks.sort_by(|a, b| {
708 a.source
709 .file
710 .cmp(&b.source.file)
711 .then_with(|| a.source.lines.0.cmp(&b.source.lines.0))
712 .then_with(|| a.source.lines.1.cmp(&b.source.lines.1))
713 .then_with(|| a.source.symbol.cmp(&b.source.symbol))
714 .then_with(|| a.id.cmp(&b.id))
715 });
716
717 Ok(chunks)
718 }
719}
720
721impl Iterator for ChunkStream {
722 type Item = Result<EmbedChunk, EmbedError>;
723
724 fn next(&mut self) -> Option<Self::Item> {
725 if let Some(chunk) = self.chunk_buffer.pop_front() {
727 return Some(chunk);
728 }
729
730 if self.fill_buffer() {
732 self.chunk_buffer.pop_front()
733 } else {
734 None
735 }
736 }
737
738 fn size_hint(&self) -> (usize, Option<usize>) {
739 let remaining = self.stats.estimated_chunks_remaining();
740 let buffered = self.chunk_buffer.len();
741 (buffered, Some(buffered + remaining))
742 }
743}
744
/// Thread-safe handle for requesting cancellation of a `ChunkStream`.
///
/// Cheap to clone; every clone shares one atomic flag with the stream.
#[derive(Clone)]
pub struct CancellationHandle {
    cancelled: Arc<AtomicBool>,
}

impl CancellationHandle {
    /// Requests cancellation. Idempotent; the stream checks the flag at
    /// batch and file boundaries, so it stops soon after rather than
    /// instantly.
    pub fn cancel(&self) {
        self.cancelled.store(true, Ordering::Relaxed);
    }

    /// True once `cancel` has been called on any clone of this handle.
    pub fn is_cancelled(&self) -> bool {
        self.cancelled.load(Ordering::Relaxed)
    }
}
762
/// Extension trait adding `.batches(n)` to any iterator, grouping its
/// items into `Vec`s of at most `n` elements.
pub trait BatchIterator: Iterator {
    /// Wraps the iterator so it yields `Vec`s of up to `batch_size` items;
    /// only the final batch may be shorter.
    fn batches(self, batch_size: usize) -> Batches<Self>
    where
        Self: Sized,
    {
        Batches { iter: self, batch_size }
    }
}

// Blanket impl: every iterator gets `.batches(n)` for free.
impl<I: Iterator> BatchIterator for I {}

/// Iterator adaptor produced by [`BatchIterator::batches`].
pub struct Batches<I> {
    iter: I,
    batch_size: usize,
}

impl<I: Iterator> Iterator for Batches<I> {
    type Item = Vec<I::Item>;

    fn next(&mut self) -> Option<Self::Item> {
        // `by_ref` lets `take` borrow the source without consuming it, so
        // subsequent calls continue where this batch stopped.
        let group: Vec<I::Item> = self.iter.by_ref().take(self.batch_size).collect();
        if group.is_empty() {
            None
        } else {
            Some(group)
        }
    }
}
802
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Writes `content` to `dir/name`, creating parent directories as needed.
    fn create_test_file(dir: &Path, name: &str, content: &str) {
        let path = dir.join(name);
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        std::fs::write(path, content).unwrap();
    }

    // End-to-end smoke test: a file with two functions yields chunks.
    #[test]
    fn test_chunk_stream_basic() {
        let temp_dir = TempDir::new().unwrap();
        let rust_code = r#"
/// A test function
fn hello() {
    println!("Hello, world!");
}

fn goodbye() {
    println!("Goodbye!");
}
"#;
        create_test_file(temp_dir.path(), "test.rs", rust_code);

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();

        let stream = ChunkStream::new(temp_dir.path(), settings, limits).unwrap();
        let chunks: Vec<_> = stream.filter_map(|r| r.ok()).collect();

        assert!(!chunks.is_empty());
    }

    // Discovery populates total_files eagerly, before any iteration.
    #[test]
    fn test_stream_stats() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(temp_dir.path(), "a.rs", "fn foo() {}");
        create_test_file(temp_dir.path(), "b.rs", "fn bar() {}");
        create_test_file(temp_dir.path(), "c.rs", "fn baz() {}");

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();

        let stream = ChunkStream::new(temp_dir.path(), settings, limits).unwrap();

        assert_eq!(stream.stats().total_files, 3);

        let _chunks: Vec<_> = stream.collect();
    }

    // A handle obtained before iteration can cancel the stream mid-flight.
    #[test]
    fn test_cancellation() {
        let temp_dir = TempDir::new().unwrap();
        for i in 0..10 {
            create_test_file(
                temp_dir.path(),
                &format!("file{}.rs", i),
                &format!("fn func{}() {{}}", i),
            );
        }

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();

        let mut stream = ChunkStream::new(temp_dir.path(), settings, limits).unwrap();
        let handle = stream.cancellation_handle();

        let _ = stream.next();
        let _ = stream.next();

        handle.cancel();

        assert!(stream.is_cancelled());
    }

    // `batches` groups items, leaving a short final batch for the remainder.
    #[test]
    fn test_batch_iterator() {
        let items: Vec<i32> = (0..10).collect();
        let batches: Vec<Vec<i32>> = items.into_iter().batches(3).collect();

        assert_eq!(batches.len(), 4);
        assert_eq!(batches[0], vec![0, 1, 2]);
        assert_eq!(batches[1], vec![3, 4, 5]);
        assert_eq!(batches[2], vec![6, 7, 8]);
        assert_eq!(batches[3], vec![9]);
    }

    // `collect_all` sorts by file path, so output order does not depend on
    // discovery order.
    #[test]
    fn test_collect_all_sorts_deterministically() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(temp_dir.path(), "z.rs", "fn z_func() {}");
        create_test_file(temp_dir.path(), "a.rs", "fn a_func() {}");
        create_test_file(temp_dir.path(), "m.rs", "fn m_func() {}");

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();

        let stream = ChunkStream::new(temp_dir.path(), settings, limits).unwrap();
        let chunks = stream.collect_all().unwrap();

        assert!(chunks[0].source.file < chunks[1].source.file);
        assert!(chunks[1].source.file < chunks[2].source.file);
    }

    // A custom StreamConfig is accepted and still produces chunks.
    #[test]
    fn test_stream_config() {
        let config = StreamConfig {
            file_batch_size: 10,
            chunk_buffer_size: 50,
            skip_on_error: true,
            max_errors: 5,
            parallel_batches: false,
        };

        let temp_dir = TempDir::new().unwrap();
        create_test_file(temp_dir.path(), "test.rs", "fn test() {}");

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();

        let stream = ChunkStream::with_config(temp_dir.path(), settings, limits, config).unwrap();
        let chunks: Vec<_> = stream.filter_map(|r| r.ok()).collect();

        assert!(!chunks.is_empty());
    }

    // `with_repo_id` propagates into every chunk's source metadata.
    #[test]
    fn test_stream_with_repo_id() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(temp_dir.path(), "test.rs", "fn test() {}");

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();
        let repo_id = RepoIdentifier::new("github.com/test", "my-repo");

        let stream = ChunkStream::new(temp_dir.path(), settings, limits)
            .unwrap()
            .with_repo_id(repo_id);

        let chunks: Vec<_> = stream.filter_map(|r| r.ok()).collect();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].source.repo.namespace.as_deref(), Some("github.com/test"));
        assert_eq!(chunks[0].source.repo.name, "my-repo");
    }

    // Keyword extraction should surface identifiers, not language syntax.
    #[test]
    fn test_extract_keywords_returns_domain_terms_not_language_keywords() {
        let rust_code = r#"
fn calculate_checksum(buffer: &[u8]) -> u64 {
    let mut digest = 0u64;
    for byte in buffer {
        digest = digest.wrapping_mul(31).wrapping_add(*byte as u64);
    }
    digest
}
"#;
        let keywords = super::super::chunker::extract_keywords(rust_code);

        assert!(
            keywords.contains(&"calculate".to_string()),
            "Expected 'calculate' in keywords, got: {:?}",
            keywords
        );
        assert!(
            keywords.contains(&"checksum".to_string()),
            "Expected 'checksum' in keywords, got: {:?}",
            keywords
        );
        assert!(
            keywords.contains(&"buffer".to_string()),
            "Expected 'buffer' in keywords, got: {:?}",
            keywords
        );
        assert!(
            keywords.contains(&"digest".to_string()),
            "Expected 'digest' in keywords, got: {:?}",
            keywords
        );

        assert!(!keywords.contains(&"fn".to_string()), "'fn' should be filtered as a stopword");
        assert!(!keywords.contains(&"let".to_string()), "'let' should be filtered as a stopword");
        assert!(!keywords.contains(&"for".to_string()), "'for' should be filtered as a stopword");
        assert!(!keywords.contains(&"mut".to_string()), "'mut' should be filtered as a stopword");
    }

    // Both snake_case and camelCase identifiers should be split into words.
    #[test]
    fn test_extract_keywords_handles_camel_case_and_snake_case() {
        let code = r#"
fn parse_http_response(rawBytes: &[u8]) -> HttpResponse {
    let contentLength = extract_content_length(rawBytes);
    HttpResponse::new(contentLength)
}
"#;
        let keywords = super::super::chunker::extract_keywords(code);

        assert!(
            keywords.contains(&"parse".to_string()),
            "Expected 'parse' from snake_case split, got: {:?}",
            keywords
        );
        assert!(
            keywords.contains(&"http".to_string()),
            "Expected 'http' from snake_case split, got: {:?}",
            keywords
        );
        assert!(
            keywords.contains(&"response".to_string()),
            "Expected 'response' from identifier split, got: {:?}",
            keywords
        );

        assert!(
            keywords.contains(&"content".to_string()),
            "Expected 'content' from camelCase split, got: {:?}",
            keywords
        );
        assert!(
            keywords.contains(&"length".to_string()),
            "Expected 'length' from camelCase split, got: {:?}",
            keywords
        );
    }

    #[test]
    fn test_extract_keywords_nonempty_for_nontrivial_code() {
        let code = r#"
fn validate_user_credentials(username: &str, password: &str) -> bool {
    let stored_hash = fetch_password_hash(username);
    verify_hash(password, &stored_hash)
}
"#;
        let keywords = super::super::chunker::extract_keywords(code);
        assert!(!keywords.is_empty(), "Non-trivial code should produce at least some keywords");
        assert!(
            keywords.len() >= 3,
            "Expected at least 3 keywords for code with rich identifiers, got {}: {:?}",
            keywords.len(),
            keywords
        );
    }

    #[test]
    fn test_generate_context_prefix_format_without_parent() {
        use crate::types::SymbolKind;

        let prefix = super::super::chunker::generate_context_prefix(
            "src/auth.rs",
            None,
            &SymbolKind::Function,
        );

        assert_eq!(prefix, "From src/auth.rs, function");
    }

    #[test]
    fn test_generate_context_prefix_format_with_parent() {
        use crate::types::SymbolKind;

        let prefix = super::super::chunker::generate_context_prefix(
            "src/models/user.rs",
            Some("UserService"),
            &SymbolKind::Method,
        );

        assert_eq!(prefix, "From src/models/user.rs, in UserService, method");
    }

    // Each symbol kind maps to its lowercase display word in the prefix.
    #[test]
    fn test_generate_context_prefix_various_kinds() {
        use crate::types::SymbolKind;

        let cases = vec![
            (SymbolKind::Class, "class"),
            (SymbolKind::Struct, "struct"),
            (SymbolKind::Enum, "enum"),
            (SymbolKind::Trait, "trait"),
            (SymbolKind::Interface, "interface"),
            (SymbolKind::Constant, "constant"),
            (SymbolKind::Import, "import"),
            (SymbolKind::Module, "module"),
            (SymbolKind::Macro, "macro"),
        ];

        for (kind, expected_name) in cases {
            let prefix = super::super::chunker::generate_context_prefix("src/lib.rs", None, &kind);
            assert_eq!(
                prefix,
                format!("From src/lib.rs, {expected_name}"),
                "Wrong prefix for kind {:?}",
                kind
            );
        }
    }

    // The full pipeline should fill both keywords and context_prefix on
    // every emitted chunk.
    #[test]
    fn test_chunk_stream_populates_keywords_and_context_prefix() {
        let temp_dir = TempDir::new().unwrap();
        let rust_code = r#"
/// Validates and normalizes an email address
fn validate_email_address(input: &str) -> Option<String> {
    let trimmed = input.trim().to_lowercase();
    if trimmed.contains('@') && trimmed.contains('.') {
        Some(trimmed)
    } else {
        None
    }
}
"#;
        create_test_file(temp_dir.path(), "src/validator.rs", rust_code);

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();

        let stream = ChunkStream::new(temp_dir.path(), settings, limits).unwrap();
        let chunks: Vec<_> = stream.filter_map(|r| r.ok()).collect();

        assert!(!chunks.is_empty(), "Should produce at least one chunk");

        for chunk in &chunks {
            assert!(
                !chunk.context.keywords.is_empty(),
                "Chunk '{}' has empty keywords; expected domain terms from the code",
                chunk.source.symbol
            );

            assert!(
                chunk.context.context_prefix.is_some(),
                "Chunk '{}' has None context_prefix; expected 'From <path>, <kind>'",
                chunk.source.symbol
            );

            let prefix = chunk.context.context_prefix.as_ref().unwrap();

            assert!(
                prefix.starts_with("From "),
                "Context prefix should start with 'From ', got: {}",
                prefix
            );
            assert!(
                prefix.contains("validator.rs"),
                "Context prefix should reference the source file, got: {}",
                prefix
            );
        }

        let email_chunk = chunks
            .iter()
            .find(|c| c.source.symbol == "validate_email_address");
        assert!(email_chunk.is_some(), "Should have a chunk for validate_email_address");

        let email_chunk = email_chunk.unwrap();

        let kw = &email_chunk.context.keywords;
        assert!(
            kw.contains(&"validate".to_string()) || kw.contains(&"email".to_string()),
            "Keywords for validate_email_address should include 'validate' or 'email', got: {:?}",
            kw
        );

        let prefix = email_chunk.context.context_prefix.as_ref().unwrap();
        assert!(
            prefix.contains("function"),
            "Context prefix for a function should contain 'function', got: {}",
            prefix
        );
    }

    // Conditional assertions: only checked when the parser attributes the
    // method to its enclosing class (parser support may vary by language).
    #[test]
    fn test_chunk_stream_context_prefix_includes_parent_for_methods() {
        let temp_dir = TempDir::new().unwrap();
        let python_code = r#"
class DatabaseConnection:
    def execute_query(self, sql_statement):
        cursor = self.connection.cursor()
        cursor.execute(sql_statement)
        return cursor.fetchall()
"#;
        create_test_file(temp_dir.path(), "src/database.py", python_code);

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();

        let stream = ChunkStream::new(temp_dir.path(), settings, limits).unwrap();
        let chunks: Vec<_> = stream.filter_map(|r| r.ok()).collect();

        let method_chunk = chunks.iter().find(|c| c.source.symbol == "execute_query");

        if let Some(chunk) = method_chunk {
            let prefix = chunk.context.context_prefix.as_ref().unwrap();
            if chunk.source.parent.is_some() {
                assert!(
                    prefix.contains("in "),
                    "Method with parent should have 'in <parent>' in prefix, got: {}",
                    prefix
                );
                assert!(
                    prefix.contains("DatabaseConnection"),
                    "Parent should be 'DatabaseConnection', got: {}",
                    prefix
                );
            }
            assert!(!chunk.context.keywords.is_empty(), "Method chunk should have keywords");
        }
    }
}