Skip to main content

infiniloom_engine/embedding/
chunker.rs

1//! Core chunking logic for embedding generation
2//!
3//! This module generates deterministic, content-addressable code chunks from
4//! a repository. It uses thread-local parsers for parallel processing and
5//! enforces resource limits for DoS protection.
6//!
7//! # Thread Safety
8//!
9//! The chunker uses thread-local parsers to avoid mutex contention during
10//! parallel file processing. Each Rayon worker thread gets its own parser
11//! instance.
12//!
13//! # Determinism Guarantees
14//!
15//! 1. Files are processed in sorted lexicographic order
16//! 2. Symbols within files are sorted by (line, name)
17//! 3. Output chunks are sorted by (file, line, id)
18//! 4. All hash computations use integer-only math (no floats)
19
20use std::io::Write;
21use std::path::{Path, PathBuf};
22use std::sync::{
23    atomic::{AtomicUsize, Ordering},
24    Mutex,
25};
26
27use rayon::prelude::*;
28
29use crate::parser::{parse_file_symbols, Language};
30use crate::security::SecurityScanner;
31use crate::tokenizer::{TokenModel, Tokenizer};
32use crate::types::Symbol;
33
34use super::error::EmbedError;
35use super::git_enrichment::GitMetadataCollector;
36use super::hasher::hash_content;
37use super::hierarchy::{HierarchyBuilder, HierarchyConfig};
38use super::identifiers::extract_identifiers;
39use super::limits::ResourceLimits;
40use super::progress::ProgressReporter;
41use super::type_extraction;
42use super::types::{
43    default_repr, ChunkContext, ChunkKind, ChunkPart, ChunkSource, EmbedChunk, EmbedSettings,
44    RepoIdentifier, Visibility,
45};
46
/// Statistics returned from streaming chunk generation
///
/// Produced by `chunk_repository_streaming`; all counters are aggregated
/// across every batch processed during the run.
#[derive(Debug, Clone, Default)]
pub struct StreamingStats {
    /// Total files discovered in the repository
    pub total_files: usize,
    /// Total files successfully processed
    pub files_processed: usize,
    /// Total files skipped due to non-critical errors
    pub files_skipped: usize,
    /// Total chunks written to output
    pub total_chunks: usize,
    /// Number of batches processed
    pub batches_processed: usize,
    /// Number of chunks with a parent name that could not be linked to a parent container
    pub orphaned_chunks: u32,
}
63
/// Core chunker for generating embedding chunks
///
/// Holds the configuration and shared resources used by the chunking
/// pipeline; parsing itself uses thread-local parsers (see module docs).
pub struct EmbedChunker {
    /// User-supplied chunking configuration
    settings: EmbedSettings,
    /// DoS-protection limits (max files, max file size, max total chunks)
    limits: ResourceLimits,
    /// Token counter used to size chunk content for embedding models
    tokenizer: Tokenizer,
    /// Present only when `settings.scan_secrets` is enabled
    security_scanner: Option<SecurityScanner>,
    /// Repository identifier for multi-tenant RAG
    repo_id: RepoIdentifier,
}
73
74impl EmbedChunker {
75    /// Create a new chunker with the given settings and limits
76    pub fn new(settings: EmbedSettings, limits: ResourceLimits) -> Self {
77        // Initialize security scanner if secret scanning is enabled
78        let security_scanner = if settings.scan_secrets {
79            Some(SecurityScanner::new())
80        } else {
81            None
82        };
83
84        Self {
85            settings,
86            limits,
87            tokenizer: Tokenizer::new(),
88            security_scanner,
89            repo_id: RepoIdentifier::default(),
90        }
91    }
92
93    /// Create a new chunker with default limits
94    pub fn with_defaults(settings: EmbedSettings) -> Self {
95        Self::new(settings, ResourceLimits::default())
96    }
97
98    /// Set the repository identifier for multi-tenant RAG
99    ///
100    /// This identifier is attached to all generated chunks, enabling:
101    /// - Multi-repository search with proper attribution
102    /// - Access control filtering by repository
103    /// - Cross-repository dependency tracking
104    ///
105    /// # Example
106    ///
107    /// ```rust,ignore
108    /// let chunker = EmbedChunker::with_defaults(settings)
109    ///     .with_repo_id(RepoIdentifier::new("github.com/myorg", "auth-service"));
110    /// ```
111    pub fn with_repo_id(mut self, repo_id: RepoIdentifier) -> Self {
112        self.repo_id = repo_id;
113        self
114    }
115
    /// Set the repository identifier (mutable borrow version)
    ///
    /// In-place alternative to `with_repo_id` for callers that already
    /// hold a `&mut EmbedChunker` rather than an owned value.
    pub fn set_repo_id(&mut self, repo_id: RepoIdentifier) {
        self.repo_id = repo_id;
    }
120
    /// Get the current repository identifier
    ///
    /// Returns a borrow; clone it if an owned `RepoIdentifier` is needed.
    pub fn repo_id(&self) -> &RepoIdentifier {
        &self.repo_id
    }
125
126    /// Generate chunks only for specific files in a repository
127    ///
128    /// This is used for git-diff-driven incremental updates where only changed
129    /// files need to be re-chunked. The `only_files` set contains relative paths
130    /// (from repo root) of files to process.
131    ///
132    /// # Guarantees
133    ///
134    /// Same as `chunk_repository`: deterministic, thread-safe, resource-limited.
135    pub fn chunk_repository_filtered(
136        &self,
137        repo_path: &Path,
138        only_files: &std::collections::HashSet<PathBuf>,
139        progress: &dyn ProgressReporter,
140    ) -> Result<Vec<EmbedChunk>, EmbedError> {
141        // Validate repo path
142        let repo_root = self.validate_repo_path(repo_path)?;
143
144        // Discover all files, then filter to only the specified ones
145        progress.set_phase("Scanning repository (filtered)...");
146        let mut files = self.discover_files(&repo_root)?;
147
148        // Filter to only files in the changed set (match by relative path)
149        files.retain(|f| {
150            if let Ok(rel) = f.strip_prefix(&repo_root) {
151                only_files.contains(rel)
152            } else {
153                false
154            }
155        });
156
157        files.sort(); // Critical for determinism
158
159        // Delegate to the shared chunking pipeline
160        self.chunk_files_impl(files, &repo_root, progress)
161    }
162
163    /// Populate repo identity from settings and git info.
164    ///
165    /// Uses `repo_namespace` and `repo_name` from settings if provided,
166    /// falling back to the directory name for `name`. Queries git for
167    /// branch and commit if the path is inside a git repository.
168    fn populate_repo_identity(&mut self, repo_path: &Path) {
169        // Only populate if the repo_id hasn't been explicitly set via with_repo_id
170        if !self.repo_id.name.is_empty() {
171            return;
172        }
173
174        let namespace = self.settings.repo_namespace.clone();
175        let name = self
176            .settings
177            .repo_name
178            .clone()
179            .or_else(|| {
180                repo_path
181                    .file_name()
182                    .and_then(|n| n.to_str())
183                    .map(String::from)
184            })
185            .unwrap_or_else(|| "unknown".to_owned());
186
187        // Try to get git branch and commit (best-effort, ignore errors)
188        let (branch, commit) = match crate::git::GitRepo::open(repo_path) {
189            Ok(git) => {
190                let branch = git.current_branch().ok();
191                let commit = git.current_commit().ok();
192                (branch, commit)
193            },
194            Err(_) => (None, None),
195        };
196
197        self.repo_id = RepoIdentifier { namespace, name, version: None, branch, commit };
198    }
199
200    /// Generate all chunks for a repository
201    ///
202    /// # Guarantees
203    ///
204    /// 1. Deterministic output (same input = same output)
205    /// 2. Thread-safe parallel processing
206    /// 3. Resource limits enforced
207    /// 4. Errors collected, not swallowed
208    pub fn chunk_repository(
209        &mut self,
210        repo_path: &Path,
211        progress: &dyn ProgressReporter,
212    ) -> Result<Vec<EmbedChunk>, EmbedError> {
213        // Validate repo path
214        let repo_root = self.validate_repo_path(repo_path)?;
215
216        // Build repo identity from settings and git info if not already set
217        self.populate_repo_identity(&repo_root);
218
219        // Phase 1: Discover files (deterministic order)
220        progress.set_phase("Scanning repository...");
221        let mut files = self.discover_files(&repo_root)?;
222        files.sort(); // Critical for determinism
223
224        self.chunk_files_impl(files, &repo_root, progress)
225    }
226
    /// Shared implementation for chunking a list of files
    ///
    /// Pipeline (after the caller has discovered and sorted `files`):
    /// parallel parse/chunk with chunk-limit enforcement → error triage
    /// (critical errors abort, skippable ones warn) → reverse call graph →
    /// parent/child linking → optional hierarchy summaries → optional
    /// signature chunks → deterministic global sort → optional git metadata.
    ///
    /// # Errors
    ///
    /// Returns `NoChunksGenerated` when `files` is empty or no chunks result,
    /// `TooManyFiles`/`TooManyChunks` when resource limits are exceeded, and
    /// aggregated critical per-file errors via `EmbedError::from_file_errors`.
    fn chunk_files_impl(
        &self,
        files: Vec<PathBuf>,
        repo_root: &Path,
        progress: &dyn ProgressReporter,
    ) -> Result<Vec<EmbedChunk>, EmbedError> {
        progress.set_total(files.len());

        if files.is_empty() {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        // Check file limit (DoS protection)
        if !self.limits.check_file_count(files.len()) {
            return Err(EmbedError::TooManyFiles {
                count: files.len(),
                max: self.limits.max_files,
            });
        }

        // Phase 2: Process files in parallel
        progress.set_phase("Parsing and chunking...");
        let chunk_count = Mutex::new(0usize);
        let processed = AtomicUsize::new(0);

        // Collect results AND errors (don't swallow errors)
        let results: Vec<Result<Vec<EmbedChunk>, (PathBuf, EmbedError)>> = files
            .par_iter()
            .map(|file| {
                let result = self.chunk_file(file, repo_root);

                // Update progress (Relaxed ordering is sufficient: the counter
                // only drives display, it is not used for synchronization)
                let done = processed.fetch_add(1, Ordering::Relaxed) + 1;
                progress.set_progress(done);

                match result {
                    Ok(chunks) => {
                        // Atomically check and update chunk count to prevent race conditions
                        // Use Mutex to ensure thread-safe limit enforcement.
                        // A poisoned lock is recovered via into_inner: the
                        // guarded value is a plain usize, so no invariant can
                        // be left broken by a panicking holder.
                        let chunks_to_add = chunks.len();
                        let mut count = chunk_count.lock().unwrap_or_else(|e| e.into_inner());
                        let new_count = *count + chunks_to_add;

                        // Check chunk limit BEFORE incrementing
                        if !self.limits.check_chunk_count(new_count) {
                            return Err((
                                file.clone(),
                                EmbedError::TooManyChunks {
                                    count: new_count,
                                    max: self.limits.max_total_chunks,
                                },
                            ));
                        }

                        *count = new_count;
                        drop(count); // Release lock before returning

                        Ok(chunks)
                    },
                    Err(e) => Err((file.clone(), e)),
                }
            })
            .collect();

        // Separate successes and failures
        let mut all_chunks = Vec::new();
        let mut errors = Vec::new();

        for result in results {
            match result {
                Ok(chunks) => all_chunks.extend(chunks),
                Err((path, err)) => errors.push((path, err)),
            }
        }

        // Report errors (fail on critical, warn on non-critical)
        if !errors.is_empty() {
            let critical: Vec<_> = errors
                .iter()
                .filter(|(_, e)| e.is_critical())
                .cloned()
                .collect();

            if !critical.is_empty() {
                return Err(EmbedError::from_file_errors(critical));
            }

            // Non-critical errors: log warning, continue
            for (path, err) in &errors {
                if err.is_skippable() {
                    progress.warn(&format!("Skipped {}: {}", path.display(), err));
                }
            }
        }

        // Check if any chunks were generated
        if all_chunks.is_empty() {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        // Phase 3: Build reverse call graph (called_by + dependents_count)
        progress.set_phase("Building call graph...");
        self.populate_called_by(&mut all_chunks);

        // Phase 3b: Link parent/children chunk IDs
        progress.set_phase("Linking parent/children chunks...");
        self.link_parent_children(&mut all_chunks, progress);

        // Phase 4: Build hierarchy summaries (if enabled)
        if self.settings.enable_hierarchy {
            progress.set_phase("Building hierarchy summaries...");
            let hierarchy_config = HierarchyConfig {
                min_children_for_summary: self.settings.hierarchy_min_children,
                ..Default::default()
            };
            let builder = HierarchyBuilder::with_config(hierarchy_config);

            // Enrich existing chunks with hierarchy metadata tags
            builder.enrich_chunks(&mut all_chunks);

            // Generate summary chunks for containers (classes, structs, etc.)
            let mut summaries = builder.build_hierarchy(&all_chunks);

            // Count tokens for summary chunks here, since they did not go
            // through the per-file chunking path
            let token_model = self.parse_token_model(&self.settings.token_model);
            for summary in &mut summaries {
                summary.tokens = self.tokenizer.count(&summary.content, token_model);
            }

            all_chunks.extend(summaries);
        }

        // Phase 5: Generate signature-only chunks (if enabled)
        if self.settings.include_signatures {
            progress.set_phase("Generating signature chunks...");
            let signature_chunks = self.generate_signature_chunks(&all_chunks);
            all_chunks.extend(signature_chunks);
        }

        // Phase 6: Sort for deterministic output
        // Note: par_sort_by is unstable, but our comparison uses multiple tiebreakers
        // to guarantee no two elements ever compare equal, making stability irrelevant.
        // Order: file → start line → end line → symbol name → chunk ID
        progress.set_phase("Sorting chunks...");
        all_chunks.par_sort_by(|a, b| {
            a.source
                .file
                .cmp(&b.source.file)
                .then_with(|| a.source.lines.0.cmp(&b.source.lines.0))
                .then_with(|| a.source.lines.1.cmp(&b.source.lines.1))
                .then_with(|| a.source.symbol.cmp(&b.source.symbol))
                .then_with(|| a.id.cmp(&b.id)) // Content-addressable ID as final tiebreaker
        });

        // Phase 7: Enrich with git metadata (if enabled) — safe to run after
        // sorting because it only writes context.git, never a sort key
        if self.settings.git_metadata {
            progress.set_phase("Collecting git metadata...");
            self.enrich_with_git_metadata(&mut all_chunks, repo_root);
        }

        progress.set_phase("Complete");
        Ok(all_chunks)
    }
397
398    /// Enrich chunks with git metadata (change frequency, authors, last modified).
399    ///
400    /// Uses a per-file cache so each file is only queried once via git commands.
401    fn enrich_with_git_metadata(&self, chunks: &mut [EmbedChunk], repo_root: &Path) {
402        let mut collector = match GitMetadataCollector::new(repo_root) {
403            Some(c) => c,
404            None => return, // Not a git repo, skip silently
405        };
406
407        for chunk in chunks.iter_mut() {
408            let metadata = collector.get_metadata(&chunk.source.file);
409            chunk.context.git = Some(metadata);
410        }
411    }
    /// Generate chunks in streaming mode, writing each batch to a writer as JSONL.
    ///
    /// Instead of collecting all chunks into memory, this method processes files in
    /// batches, writes each batch's chunks to the writer, then drops them. This
    /// reduces peak memory from O(all chunks) to O(batch_size worth of chunks).
    ///
    /// # Determinism
    ///
    /// - Files are sorted lexicographically before batching, so batches are processed
    ///   in a deterministic order.
    /// - Within each batch, chunks are sorted by (file, start_line, end_line, symbol, id).
    /// - Across batch boundaries, ordering is NOT globally sorted (batch N's last chunk
    ///   may sort after batch N+1's first chunk). For full global sorting, use the
    ///   non-streaming `chunk_repository()` method.
    ///
    /// # `called_by` trade-off
    ///
    /// The reverse call graph (`called_by` field) is populated within each batch only.
    /// Cross-batch `called_by` references are not captured because that would require
    /// keeping all chunks in memory. For full `called_by` coverage, use the
    /// non-streaming `chunk_repository()` method.
    ///
    /// # Writer protocol
    ///
    /// Each chunk is serialized as a single JSON line (JSONL) via `serde_json`. The
    /// caller is responsible for writing any header/footer lines around the chunks.
    pub fn chunk_repository_streaming<W: Write>(
        &self,
        repo_path: &Path,
        writer: &mut W,
        progress: &dyn ProgressReporter,
    ) -> Result<StreamingStats, EmbedError> {
        // Validate repo path
        let repo_root = self.validate_repo_path(repo_path)?;

        // Phase 1: Discover files (deterministic order)
        progress.set_phase("Scanning repository...");
        let mut files = self.discover_files(&repo_root)?;
        files.sort(); // Critical for determinism
        progress.set_total(files.len());

        if files.is_empty() {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        // Check file limit
        if !self.limits.check_file_count(files.len()) {
            return Err(EmbedError::TooManyFiles {
                count: files.len(),
                max: self.limits.max_files,
            });
        }

        // Guard against batch_size == 0, which would make slice::chunks panic
        let batch_size = if self.settings.batch_size == 0 {
            500
        } else {
            self.settings.batch_size
        };

        let mut stats = StreamingStats { total_files: files.len(), ..Default::default() };

        // Phase 2: Process files in batches
        progress.set_phase("Parsing and chunking (streaming)...");
        let total_chunk_count = Mutex::new(0usize);

        for batch_files in files.chunks(batch_size) {
            let processed_in_batch = AtomicUsize::new(0);

            // Process this batch in parallel (same logic as chunk_repository)
            let results: Vec<Result<Vec<EmbedChunk>, (PathBuf, EmbedError)>> = batch_files
                .par_iter()
                .map(|file| {
                    let result = self.chunk_file(file, &repo_root);

                    // stats.files_processed is only read here; it is not
                    // mutated until after this parallel section completes,
                    // so the shared borrow is race-free
                    let done = processed_in_batch.fetch_add(1, Ordering::Relaxed) + 1;
                    let global_done = stats.files_processed + done;
                    progress.set_progress(global_done);

                    match result {
                        Ok(chunks) => {
                            // Atomically check and update chunk count to prevent race conditions
                            // Use Mutex to ensure thread-safe limit enforcement
                            let chunks_to_add = chunks.len();
                            let mut count =
                                total_chunk_count.lock().unwrap_or_else(|e| e.into_inner());
                            let new_count = *count + chunks_to_add;

                            // Check chunk limit BEFORE incrementing
                            if !self.limits.check_chunk_count(new_count) {
                                return Err((
                                    file.clone(),
                                    EmbedError::TooManyChunks {
                                        count: new_count,
                                        max: self.limits.max_total_chunks,
                                    },
                                ));
                            }

                            *count = new_count;
                            drop(count); // Release lock before returning

                            Ok(chunks)
                        },
                        Err(e) => Err((file.clone(), e)),
                    }
                })
                .collect();

            // Separate successes and failures for this batch
            let mut batch_chunks = Vec::new();

            for result in results {
                match result {
                    Ok(chunks) => {
                        stats.files_processed += 1;
                        batch_chunks.extend(chunks);
                    },
                    Err((_path, err)) => {
                        // NOTE(review): the offending file's path is dropped
                        // here, unlike chunk_files_impl which reports paths via
                        // from_file_errors — consider attaching `_path`
                        if err.is_critical() {
                            return Err(err);
                        }
                        if err.is_skippable() {
                            stats.files_skipped += 1;
                            progress.warn(&format!("Skipped: {}", err));
                        }
                    },
                }
            }

            // Populate called_by within this batch only.
            // NOTE: Cross-batch called_by references are not captured in streaming
            // mode. This is an intentional trade-off to avoid keeping all chunks in
            // memory. For full called_by coverage, use chunk_repository() instead.
            self.populate_called_by(&mut batch_chunks);

            // Link parent/children chunk IDs within this batch
            self.link_parent_children(&mut batch_chunks, progress);

            // Sort chunks within this batch for deterministic intra-batch ordering
            // (same key order as the non-streaming path)
            batch_chunks.sort_by(|a, b| {
                a.source
                    .file
                    .cmp(&b.source.file)
                    .then_with(|| a.source.lines.0.cmp(&b.source.lines.0))
                    .then_with(|| a.source.lines.1.cmp(&b.source.lines.1))
                    .then_with(|| a.source.symbol.cmp(&b.source.symbol))
                    .then_with(|| a.id.cmp(&b.id))
            });

            // Write each chunk as a JSONL line, then drop the batch
            for chunk in &batch_chunks {
                let chunk_json = serde_json::json!({
                    "type": "chunk",
                    "data": chunk,
                });
                let line = serde_json::to_string(&chunk_json).map_err(|e| EmbedError::IoError {
                    path: repo_path.to_path_buf(),
                    source: std::io::Error::other(e),
                })?;
                writeln!(writer, "{}", line).map_err(|e| EmbedError::IoError {
                    path: repo_path.to_path_buf(),
                    source: e,
                })?;
            }

            stats.total_chunks += batch_chunks.len();
            stats.batches_processed += 1;

            // Flush writer after each batch to prevent unbounded memory buildup
            // and ensure data is written even if processing fails later
            writer
                .flush()
                .map_err(|e| EmbedError::IoError { path: repo_path.to_path_buf(), source: e })?;

            // batch_chunks is dropped here, freeing memory
        }

        if stats.total_chunks == 0 {
            return Err(EmbedError::NoChunksGenerated {
                include_patterns: "default".to_owned(),
                exclude_patterns: "default".to_owned(),
            });
        }

        progress.set_phase("Complete");
        Ok(stats)
    }
602    /// Populate the called_by field for all chunks by building a reverse call graph.
603    ///
604    /// This method first runs import-aware resolution to populate `qualified_calls`
605    /// and `unresolved_calls`, then builds the reverse map for `called_by` using
606    /// both qualified and unqualified call names.
607    fn populate_called_by(&self, chunks: &mut [EmbedChunk]) {
608        use super::import_resolver::ImportResolver;
609        use std::collections::{BTreeMap, BTreeSet};
610
611        // Phase A: Resolve calls via imports (populates qualified_calls / unresolved_calls)
612        let resolver = ImportResolver::from_chunks(chunks);
613        resolver.resolve_all_calls(chunks);
614
615        // Phase B: Build reverse call maps
616        // 1. Qualified reverse map (import-resolved, more accurate)
617        let qualified_reverse = resolver.build_qualified_reverse_map(chunks);
618
619        // 2. Unqualified reverse map (fallback for unresolved calls, backward-compatible)
620        let mut reverse_calls: BTreeMap<String, BTreeSet<String>> = BTreeMap::new();
621        for chunk in chunks.iter() {
622            let caller_fqn = chunk.source.fqn.as_deref().unwrap_or(&chunk.source.symbol);
623            for callee in &chunk.context.calls {
624                reverse_calls
625                    .entry(callee.clone())
626                    .or_default()
627                    .insert(caller_fqn.to_owned());
628            }
629        }
630
631        // Phase C: Populate called_by using both maps
632        for chunk in chunks.iter_mut() {
633            let fqn = chunk.source.fqn.as_deref().unwrap_or("");
634            let symbol = &chunk.source.symbol;
635            let file = &chunk.source.file;
636
637            let mut called_by_set: BTreeSet<String> = BTreeSet::new();
638
639            // Check qualified reverse map: look for "file::symbol" as a callee
640            let qualified_key = format!("{}::{}", file, symbol);
641            if let Some(callers) = qualified_reverse.get(&qualified_key) {
642                called_by_set.extend(callers.iter().cloned());
643            }
644
645            // Also check by FQN in the qualified map
646            if !fqn.is_empty() {
647                if let Some(callers) = qualified_reverse.get(fqn) {
648                    called_by_set.extend(callers.iter().cloned());
649                }
650            }
651
652            // Fallback: unqualified reverse map (for unresolved calls and backward compat)
653            if let Some(callers) = reverse_calls.get(fqn) {
654                called_by_set.extend(callers.iter().cloned());
655            }
656            if let Some(callers) = reverse_calls.get(symbol) {
657                called_by_set.extend(callers.iter().cloned());
658            }
659
660            chunk.context.called_by = called_by_set.into_iter().collect();
661
662            // Set dependents_count from called_by length
663            let count = chunk.context.called_by.len() as u32;
664            if count > 0 {
665                chunk.context.dependents_count = Some(count);
666            }
667        }
668    }
669
    /// Link parent and children chunks by setting parent_chunk_id and children_ids
    ///
    /// For each chunk with `source.parent` set, find the corresponding container chunk
    /// (Class/Struct/Enum/Trait/Interface) and set bidirectional links:
    /// - child's `source.parent_chunk_id` = parent's chunk ID
    /// - parent's `children_ids` includes child's chunk ID
    ///
    /// Emits an aggregate warning via the progress reporter if any chunks reference
    /// a parent container that was not found (orphaned chunks).
    fn link_parent_children(&self, chunks: &mut [EmbedChunk], progress: &dyn ProgressReporter) {
        use std::collections::{BTreeMap, BTreeSet};

        // Build map: (file, symbol_name) -> chunk index for container types.
        // BTreeMap keeps iteration deterministic (matches the module's
        // determinism guarantees).
        let mut container_map: BTreeMap<(String, String), usize> = BTreeMap::new();
        for (i, chunk) in chunks.iter().enumerate() {
            if matches!(
                chunk.kind,
                ChunkKind::Class
                    | ChunkKind::Struct
                    | ChunkKind::Enum
                    | ChunkKind::Trait
                    | ChunkKind::Interface
            ) {
                // NOTE(review): if two containers share (file, symbol), the
                // later index silently wins — confirm upstream parsing cannot
                // produce duplicates
                container_map.insert((chunk.source.file.clone(), chunk.source.symbol.clone()), i);
            }
        }

        // First pass: set parent_chunk_id on children, collect children per parent.
        // Index-based loop because we read one element (the parent) while
        // mutating another (the child); iterator borrows would not allow that.
        let mut parent_children: BTreeMap<usize, Vec<String>> = BTreeMap::new();
        let mut orphaned_count: u32 = 0;
        let mut orphaned_files: BTreeSet<String> = BTreeSet::new();

        for i in 0..chunks.len() {
            if let Some(ref parent_name) = chunks[i].source.parent {
                let key = (chunks[i].source.file.clone(), parent_name.clone());
                if let Some(&parent_idx) = container_map.get(&key) {
                    let parent_id = chunks[parent_idx].id.clone();
                    chunks[i].source.parent_chunk_id = Some(parent_id);

                    parent_children
                        .entry(parent_idx)
                        .or_default()
                        .push(chunks[i].id.clone());
                } else {
                    // Parent name present but no matching container chunk in
                    // the same file: count as orphaned
                    orphaned_count += 1;
                    orphaned_files.insert(chunks[i].source.file.clone());
                }
            }
        }

        // Emit aggregate warning for orphaned chunks
        if orphaned_count > 0 {
            progress.warn(&format!(
                "{} chunks have missing parent containers across {} files",
                orphaned_count,
                orphaned_files.len()
            ));
        }

        // Second pass: set children_ids on parents (sorted for determinism)
        for (parent_idx, mut child_ids) in parent_children {
            child_ids.sort();
            chunks[parent_idx].children_ids = child_ids;
        }
    }
735
736    /// Generate signature-only chunks for code chunks that have signatures
737    ///
738    /// For each code chunk with a `signature` in its context, creates a compact
739    /// signature-only chunk with:
740    /// - `repr` = "signature"
741    /// - `code_chunk_id` = the original code chunk's ID
742    /// - Content = just the signature string
743    /// - Minimal context (signature, docstring only)
744    ///
745    /// This enables tiered retrieval: search signatures broadly (cheap), then
746    /// fetch full code for top matches (expensive).
747    fn generate_signature_chunks(&self, chunks: &[EmbedChunk]) -> Vec<EmbedChunk> {
748        let token_model = self.parse_token_model(&self.settings.token_model);
749
750        chunks
751            .iter()
752            .filter(|chunk| {
753                // Only generate signature chunks for code chunks that have signatures
754                chunk.repr == "code"
755                    && chunk.code_chunk_id.is_none()
756                    && chunk.part.is_none() // Skip split parts (parent already has signature)
757                    && chunk.context.signature.is_some()
758                    && !matches!(chunk.kind, ChunkKind::Imports | ChunkKind::TopLevel)
759            })
760            .filter_map(|chunk| {
761                let signature = chunk.context.signature.as_ref()?;
762                let hash = hash_content(signature);
763                let tokens = self.tokenizer.count(signature, token_model);
764
765                Some(EmbedChunk {
766                    id: hash.short_id,
767                    full_hash: hash.full_hash,
768                    content: signature.clone(),
769                    tokens,
770                    kind: chunk.kind,
771                    source: chunk.source.clone(),
772                    context: ChunkContext {
773                        signature: chunk.context.signature.clone(),
774                        docstring: chunk.context.docstring.clone(),
775                        context_prefix: chunk.context.context_prefix.clone(),
776                        ..Default::default()
777                    },
778                    children_ids: Vec::new(),
779                    repr: "signature".to_owned(),
780                    code_chunk_id: Some(chunk.id.clone()),
781                    part: None,
782                })
783            })
784            .collect()
785    }
786
787    /// Chunk a single file using thread-local resources
788    fn chunk_file(&self, path: &Path, repo_root: &Path) -> Result<Vec<EmbedChunk>, EmbedError> {
789        // Validate file size
790        let metadata = std::fs::metadata(path)
791            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
792
793        if !self.limits.check_file_size(metadata.len()) {
794            return Err(EmbedError::FileTooLarge {
795                path: path.to_path_buf(),
796                size: metadata.len(),
797                max: self.limits.max_file_size,
798            });
799        }
800
801        // Read file
802        let mut content = std::fs::read_to_string(path)
803            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
804
805        // Check for excessively long lines (e.g., minified files)
806        // This prevents memory issues from processing single-line 10MB files
807        if let Some(max_line_len) = content.lines().map(|l| l.len()).max() {
808            if !self.limits.check_line_length(max_line_len) {
809                return Err(EmbedError::LineTooLong {
810                    path: path.to_path_buf(),
811                    length: max_line_len,
812                    max: self.limits.max_line_length,
813                });
814            }
815        }
816
817        // Get relative path (safe, validated)
818        let relative_path = self.safe_relative_path(path, repo_root)?;
819
820        // Security scanning (if enabled)
821        if let Some(ref scanner) = self.security_scanner {
822            let findings = scanner.scan(&content, &relative_path);
823            if !findings.is_empty() {
824                // Check if we should fail on secrets
825                if self.settings.fail_on_secrets {
826                    let files = findings
827                        .iter()
828                        .map(|f| format!("  {}:{} - {}", f.file, f.line, f.kind.name()))
829                        .collect::<Vec<_>>()
830                        .join("\n");
831                    return Err(EmbedError::SecretsDetected { count: findings.len(), files });
832                }
833
834                // Redact secrets if configured
835                if self.settings.redact_secrets {
836                    content = scanner.redact_content(&content, &relative_path);
837                }
838            }
839        }
840        let language = self.detect_language(path);
841        let lang_enum = self.detect_language_enum(path);
842
843        // Use thread-local parser (from parser module)
844        let mut symbols = parse_file_symbols(&content, path);
845
846        // Sort symbols deterministically (stable sort preserves parser order for equal elements)
847        symbols.sort_by(|a, b| {
848            a.start_line
849                .cmp(&b.start_line)
850                .then_with(|| a.end_line.cmp(&b.end_line))
851                .then_with(|| a.name.cmp(&b.name))
852        });
853
854        let lines: Vec<&str> = content.lines().collect();
855        let mut chunks = Vec::with_capacity(symbols.len() + 2);
856
857        for symbol in &symbols {
858            // Skip imports if configured
859            if !self.settings.include_imports
860                && matches!(symbol.kind, crate::types::SymbolKind::Import)
861            {
862                continue;
863            }
864
865            // Extract content with context
866            let (chunk_content, start_line, end_line) =
867                self.extract_symbol_content(&lines, symbol, self.settings.context_lines);
868
869            // Count tokens
870            let token_model = self.parse_token_model(&self.settings.token_model);
871            let tokens = self.tokenizer.count(&chunk_content, token_model);
872
873            // Handle large symbols (with depth-limited splitting)
874            if self.settings.max_tokens > 0 && tokens > self.settings.max_tokens {
875                let split_chunks = self.split_large_symbol(
876                    &chunk_content,
877                    symbol,
878                    &relative_path,
879                    &language,
880                    start_line,
881                    0, // Initial depth
882                    lang_enum,
883                )?;
884                chunks.extend(split_chunks);
885            } else {
886                // Generate hash (single pass)
887                let hash = hash_content(&chunk_content);
888
889                // Extract context (with complexity metrics)
890                let mut context =
891                    self.extract_context(symbol, &chunk_content, &relative_path, path);
892
893                // Compute fully qualified name for symbol disambiguation
894                let fqn = self.compute_fqn(&relative_path, symbol);
895
896                let chunk_kind: ChunkKind = symbol.kind.into();
897                let source = ChunkSource {
898                    repo: self.repo_id.clone(),
899                    file: relative_path.clone(),
900                    lines: (start_line, end_line),
901                    symbol: symbol.name.clone(),
902                    fqn: Some(fqn),
903                    language: language.clone(),
904                    parent: symbol.parent.clone(),
905                    visibility: symbol.visibility.into(),
906                    is_test: self.is_test_code(path, symbol),
907                    module_path: Some(derive_module_path(&relative_path, &language)),
908                    parent_chunk_id: None,
909                };
910
911                // Generate natural language summary
912                context.summary = generate_summary(chunk_kind, &source, &context);
913
914                chunks.push(EmbedChunk {
915                    id: hash.short_id,
916                    full_hash: hash.full_hash,
917                    content: chunk_content,
918                    tokens,
919                    kind: chunk_kind,
920                    source,
921                    context,
922                    children_ids: Vec::new(),
923                    repr: default_repr(),
924                    code_chunk_id: None,
925                    part: None,
926                });
927            }
928        }
929
930        // Handle top-level code if configured
931        if self.settings.include_top_level && !symbols.is_empty() {
932            if let Some(top_level) =
933                self.extract_top_level(&lines, &symbols, &relative_path, &language, lang_enum)
934            {
935                chunks.push(top_level);
936            }
937        }
938
939        Ok(chunks)
940    }
941
942    /// Extract symbol content with context lines
943    fn extract_symbol_content(
944        &self,
945        lines: &[&str],
946        symbol: &Symbol,
947        context_lines: u32,
948    ) -> (String, u32, u32) {
949        // Convert to 0-indexed, clamped to bounds
950        let start_line = symbol.start_line.saturating_sub(1) as usize;
951        let end_line = (symbol.end_line as usize).min(lines.len());
952
953        // Add context lines (clamped)
954        let context_start = start_line.saturating_sub(context_lines as usize);
955        let context_end = (end_line + context_lines as usize).min(lines.len());
956
957        // Extract content
958        let content = lines[context_start..context_end].join("\n");
959
960        // Return 1-indexed line numbers
961        (content, (context_start + 1) as u32, context_end as u32)
962    }
963
    /// Split a large symbol into multiple chunks at line boundaries
    ///
    /// This implements overlap between consecutive chunks for context continuity.
    /// Each chunk (except the first) includes `overlap_tokens` worth of lines from
    /// the end of the previous chunk. This helps RAG systems understand context
    /// when retrieving individual chunks.
    ///
    /// Line budgets are derived with integer-only math (no floats) so the
    /// output is deterministic across platforms. Parts below `min_tokens` are
    /// dropped; surviving parts are numbered consecutively and linked to a
    /// shared parent ID (the hash of the whole symbol) via `part` metadata.
    ///
    /// # Errors
    ///
    /// Returns `EmbedError::RecursionLimitExceeded` when `depth` exceeds the
    /// configured recursion limit.
    fn split_large_symbol(
        &self,
        content: &str,
        symbol: &Symbol,
        file: &str,
        language: &str,
        base_line: u32,
        depth: u32,
        lang_enum: Option<Language>,
    ) -> Result<Vec<EmbedChunk>, EmbedError> {
        // Depth limit to prevent stack overflow
        if !self.limits.check_recursion_depth(depth) {
            return Err(EmbedError::RecursionLimitExceeded {
                depth,
                max: self.limits.max_recursion_depth,
                context: format!("splitting symbol {}", symbol.name),
            });
        }

        let lines: Vec<&str> = content.lines().collect();
        let total_lines = lines.len();

        // Calculate target lines per chunk using INTEGER math only
        let token_model = self.parse_token_model(&self.settings.token_model);
        let total_tokens = self.tokenizer.count(content, token_model) as usize;
        let target_tokens = self.settings.max_tokens as usize;

        // Nothing to split (empty content) or splitting disabled (max_tokens 0).
        if total_tokens == 0 || target_tokens == 0 {
            return Ok(Vec::new());
        }

        // INTEGER division: (total_lines * target_tokens) / total_tokens
        let target_lines = ((total_lines * target_tokens) / total_tokens).max(1);

        // Calculate overlap lines from overlap_tokens setting
        // Estimate: overlap_lines = (total_lines * overlap_tokens) / total_tokens
        // Capped at half a chunk so overlap never dominates a part.
        // (total_tokens > 0 is already guaranteed by the early return above.)
        let overlap_tokens = self.settings.overlap_tokens as usize;
        let overlap_lines = if overlap_tokens > 0 && total_tokens > 0 {
            ((total_lines * overlap_tokens) / total_tokens)
                .max(1)
                .min(target_lines / 2)
        } else {
            0
        };

        let mut chunks = Vec::new();
        let mut current_start = 0usize;
        let mut part_num = 1u32;

        // Parent ID for linking parts
        let parent_hash = hash_content(content);

        while current_start < total_lines {
            // Calculate content boundaries
            // For parts after the first, include overlap from the previous chunk
            let content_start = if part_num > 1 && overlap_lines > 0 {
                current_start.saturating_sub(overlap_lines)
            } else {
                current_start
            };
            let content_end = (current_start + target_lines).min(total_lines);

            let part_content = lines[content_start..content_end].join("\n");

            let tokens = self.tokenizer.count(&part_content, token_model);

            // Only create chunk if above minimum
            // (undersized tails are silently dropped; part numbering stays
            // contiguous because part_num only advances when a chunk is kept)
            if tokens >= self.settings.min_tokens {
                let hash = hash_content(&part_content);
                let part_keywords = extract_keywords(&part_content);
                let part_identifiers = extract_identifiers(&part_content, lang_enum);
                let part_prefix =
                    Some(generate_context_prefix(file, Some(&symbol.name), &symbol.kind));

                // Track actual overlap lines included (for metadata)
                let actual_overlap = if part_num > 1 {
                    current_start.saturating_sub(content_start) as u32
                } else {
                    0
                };

                let part_source = ChunkSource {
                    repo: self.repo_id.clone(),
                    file: file.to_owned(),
                    lines: (base_line + content_start as u32, base_line + content_end as u32 - 1),
                    symbol: format!("{}_part{}", symbol.name, part_num),
                    fqn: None,
                    language: language.to_owned(),
                    parent: Some(symbol.name.clone()),
                    visibility: symbol.visibility.into(),
                    is_test: false,
                    module_path: Some(derive_module_path(file, language)),
                    parent_chunk_id: None,
                };
                let mut part_context = ChunkContext {
                    signature: symbol.signature.clone(), // Include in every part for context
                    // Propagate docstring to ALL parts for better RAG retrieval
                    // Each part should be self-contained for semantic search
                    docstring: symbol.docstring.clone(),
                    keywords: part_keywords,
                    identifiers: part_identifiers,
                    context_prefix: part_prefix,
                    ..Default::default()
                };
                part_context.summary =
                    generate_summary(ChunkKind::FunctionPart, &part_source, &part_context);

                chunks.push(EmbedChunk {
                    id: hash.short_id,
                    full_hash: hash.full_hash,
                    content: part_content,
                    tokens,
                    kind: ChunkKind::FunctionPart, // or ClassPart based on symbol.kind
                    source: part_source,
                    context: part_context,
                    children_ids: Vec::new(),
                    repr: default_repr(),
                    code_chunk_id: None,
                    part: Some(ChunkPart {
                        part: part_num,
                        of: 0, // Updated after all parts
                        parent_id: parent_hash.short_id.clone(),
                        parent_signature: symbol.signature.clone().unwrap_or_default(),
                        overlap_lines: actual_overlap,
                    }),
                });

                part_num += 1;
            }

            // Advance by the non-overlapped window, not the padded one.
            current_start = content_end;
        }

        // Update total part count
        let total_parts = chunks.len() as u32;
        for chunk in &mut chunks {
            if let Some(ref mut part) = chunk.part {
                part.of = total_parts;
            }
        }

        Ok(chunks)
    }
1113
    /// Extract top-level code (code outside symbols)
    ///
    /// Collects every line not covered by any parsed symbol into a single
    /// `ChunkKind::TopLevel` chunk. Returns `None` when the file has no
    /// lines, no symbols, no uncovered content, or the uncovered content
    /// falls below the configured `min_tokens` threshold.
    fn extract_top_level(
        &self,
        lines: &[&str],
        symbols: &[Symbol],
        file: &str,
        language: &str,
        lang_enum: Option<Language>,
    ) -> Option<EmbedChunk> {
        if lines.is_empty() || symbols.is_empty() {
            return None;
        }

        // Find lines not covered by any symbol
        let mut covered = vec![false; lines.len()];
        for symbol in symbols {
            // Symbol lines are 1-indexed; convert and clamp to the file bounds.
            let start = symbol.start_line.saturating_sub(1) as usize;
            let end = (symbol.end_line as usize).min(lines.len());
            for i in start..end {
                covered[i] = true;
            }
        }

        // Collect uncovered lines
        let top_level_lines: Vec<&str> = lines
            .iter()
            .enumerate()
            .filter(|(i, _)| !covered[*i])
            .map(|(_, line)| *line)
            .collect();

        if top_level_lines.is_empty() {
            return None;
        }

        let content = top_level_lines.join("\n").trim().to_owned();
        if content.is_empty() {
            return None;
        }

        let token_model = self.parse_token_model(&self.settings.token_model);
        let tokens = self.tokenizer.count(&content, token_model);

        // Tiny fragments (stray braces, separators) are not worth a chunk.
        if tokens < self.settings.min_tokens {
            return None;
        }

        let hash = hash_content(&content);
        let keywords = extract_keywords(&content);
        let top_identifiers = extract_identifiers(&content, lang_enum);
        let context_prefix =
            Some(generate_context_prefix(file, None, &crate::types::SymbolKind::Module));

        // NOTE(review): the reported line span covers the whole file even
        // though the content is only the uncovered lines — confirm consumers
        // expect this approximation.
        let top_source = ChunkSource {
            repo: self.repo_id.clone(),
            file: file.to_owned(),
            lines: (1, lines.len() as u32),
            symbol: "<top_level>".to_owned(),
            fqn: None,
            language: language.to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: Some(derive_module_path(file, language)),
            parent_chunk_id: None,
        };
        let mut top_context = ChunkContext {
            keywords,
            identifiers: top_identifiers,
            context_prefix,
            ..Default::default()
        };
        top_context.summary = generate_summary(ChunkKind::TopLevel, &top_source, &top_context);

        Some(EmbedChunk {
            id: hash.short_id,
            full_hash: hash.full_hash,
            content,
            tokens,
            kind: ChunkKind::TopLevel,
            source: top_source,
            context: top_context,
            children_ids: Vec::new(),
            repr: default_repr(),
            code_chunk_id: None,
            part: None,
        })
    }
1202
1203    /// Extract semantic context for retrieval
1204    fn extract_context(
1205        &self,
1206        symbol: &Symbol,
1207        content: &str,
1208        file_path: &str,
1209        source_path: &Path,
1210    ) -> ChunkContext {
1211        // Detect language for type extraction and complexity scoring
1212        let lang = source_path
1213            .extension()
1214            .and_then(|e| e.to_str())
1215            .and_then(Language::from_extension);
1216
1217        // Extract type information via Tree-sitter if this is a function/method
1218        let (type_signature, parameter_types, return_type, error_types) = if matches!(
1219            symbol.kind,
1220            crate::types::SymbolKind::Function | crate::types::SymbolKind::Method
1221        ) {
1222            if let Some(lang) = lang {
1223                if let Some(type_info) = type_extraction::extract_types(content, lang) {
1224                    (
1225                        type_info.type_signature,
1226                        type_info.parameter_types,
1227                        type_info.return_type,
1228                        type_info.error_types,
1229                    )
1230                } else {
1231                    (None, Vec::new(), None, Vec::new())
1232                }
1233            } else {
1234                (None, Vec::new(), None, Vec::new())
1235            }
1236        } else {
1237            (None, Vec::new(), None, Vec::new())
1238        };
1239
1240        ChunkContext {
1241            docstring: symbol.docstring.clone(),
1242            comments: Vec::new(), // TODO: Extract inline comments
1243            signature: symbol.signature.clone(),
1244            calls: symbol.calls.clone(),
1245            called_by: Vec::new(), // Populated in populate_called_by pass
1246            imports: Vec::new(),   // Populated from file-level
1247            tags: self.generate_tags(symbol),
1248            keywords: extract_keywords(content),
1249            context_prefix: Some(generate_context_prefix(
1250                file_path,
1251                symbol.parent.as_deref(),
1252                &symbol.kind,
1253            )),
1254            summary: None,                // Populated after source is built
1255            qualified_calls: Vec::new(),  // Populated by ImportResolver
1256            unresolved_calls: Vec::new(), // Populated by ImportResolver
1257            identifiers: extract_identifiers(content, lang),
1258            type_signature,
1259            parameter_types,
1260            return_type,
1261            error_types,
1262            lines_of_code: self.count_lines_of_code(content),
1263            max_nesting_depth: self.calculate_nesting_depth(content),
1264            git: None, // Populated later by enrich_with_git_metadata if enabled
1265            complexity_score: lang.and_then(|l| super::complexity::compute_complexity(content, l)),
1266            dependents_count: None,
1267        }
1268    }
1269
1270    /// Count lines of code (excluding blank lines and simple comments)
1271    fn count_lines_of_code(&self, content: &str) -> u32 {
1272        content
1273            .lines()
1274            .filter(|line| {
1275                let trimmed = line.trim();
1276                // Skip blank lines and pure comment lines
1277                !trimmed.is_empty()
1278                    && !trimmed.starts_with("//")
1279                    && !trimmed.starts_with('#')
1280                    && !trimmed.starts_with("/*")
1281                    && !trimmed.starts_with('*')
1282            })
1283            .count() as u32
1284    }
1285
1286    /// Calculate maximum nesting depth based on brace/indent patterns
1287    ///
1288    /// For brace-based languages (Rust, JS, Go, etc.): counts {}, (), [] nesting
1289    /// For indentation-based languages (Python, Haskell): counts indent levels
1290    fn calculate_nesting_depth(&self, content: &str) -> u32 {
1291        // First try brace-based nesting
1292        let brace_depth = self.calculate_brace_depth(content);
1293
1294        // If no braces found (or very few), calculate indentation-based depth
1295        // This handles Python, Haskell, and other whitespace-sensitive languages
1296        if brace_depth <= 1 {
1297            let indent_depth = self.calculate_indent_depth(content);
1298            // Use the larger of the two (some Python code also uses brackets)
1299            brace_depth.max(indent_depth)
1300        } else {
1301            brace_depth
1302        }
1303    }
1304
1305    /// Calculate nesting depth based on brace pairs
1306    fn calculate_brace_depth(&self, content: &str) -> u32 {
1307        let mut max_depth = 0u32;
1308        let mut current_depth = 0i32;
1309
1310        for ch in content.chars() {
1311            match ch {
1312                '{' | '(' | '[' => {
1313                    current_depth += 1;
1314                    max_depth = max_depth.max(current_depth as u32);
1315                },
1316                '}' | ')' | ']' => {
1317                    current_depth = (current_depth - 1).max(0);
1318                },
1319                _ => {},
1320            }
1321        }
1322
1323        max_depth
1324    }
1325
1326    /// Calculate nesting depth based on indentation levels
1327    /// Used for Python, Haskell, and other whitespace-sensitive languages
1328    fn calculate_indent_depth(&self, content: &str) -> u32 {
1329        let mut max_depth = 0u32;
1330        let mut base_indent: Option<usize> = None;
1331
1332        for line in content.lines() {
1333            // Skip empty lines and comment-only lines
1334            let trimmed = line.trim();
1335            if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with("--") {
1336                continue;
1337            }
1338
1339            // Count leading whitespace (spaces or tabs)
1340            let leading_spaces = line.len() - line.trim_start().len();
1341
1342            // Set base indent from first non-empty line
1343            if base_indent.is_none() {
1344                base_indent = Some(leading_spaces);
1345            }
1346
1347            // Calculate relative depth (assuming 4-space or 1-tab = 1 level)
1348            let base = base_indent.unwrap_or(0);
1349            if leading_spaces >= base {
1350                let relative_indent = leading_spaces - base;
1351                // Normalize: assume 4 spaces or 1 tab per level
1352                let depth = (relative_indent / 4).max(relative_indent / 2) as u32;
1353                max_depth = max_depth.max(depth + 1); // +1 because base level is 1
1354            }
1355        }
1356
1357        max_depth
1358    }
1359
    /// Auto-generate semantic tags for improved RAG retrieval
    ///
    /// Tags are generated based on symbol names, signatures, and common patterns.
    /// These help with semantic search and filtering in vector databases.
    ///
    /// Thin wrapper over the free function `generate_tags_for_symbol`, which
    /// holds the actual pattern rules.
    fn generate_tags(&self, symbol: &Symbol) -> Vec<String> {
        generate_tags_for_symbol(&symbol.name, symbol.signature.as_deref())
    }
1367
1368    /// Compute fully qualified name for a symbol
1369    ///
1370    /// Format: `file_path::parent::symbol_name`
1371    /// - file_path: Relative path with extension stripped and slashes replaced with ::
1372    /// - parent: Parent symbol name if any (e.g., class for a method)
1373    /// - symbol_name: The symbol's own name
1374    fn compute_fqn(&self, file: &str, symbol: &Symbol) -> String {
1375        // Convert file path to module-like format: src/auth/login.rs -> src::auth::login
1376        let module_path = file
1377            .strip_suffix(".rs")
1378            .or_else(|| file.strip_suffix(".py"))
1379            .or_else(|| file.strip_suffix(".ts"))
1380            .or_else(|| file.strip_suffix(".tsx"))
1381            .or_else(|| file.strip_suffix(".js"))
1382            .or_else(|| file.strip_suffix(".jsx"))
1383            .or_else(|| file.strip_suffix(".go"))
1384            .or_else(|| file.strip_suffix(".java"))
1385            .or_else(|| file.strip_suffix(".c"))
1386            .or_else(|| file.strip_suffix(".cpp"))
1387            .or_else(|| file.strip_suffix(".h"))
1388            .or_else(|| file.strip_suffix(".hpp"))
1389            .or_else(|| file.strip_suffix(".rb"))
1390            .or_else(|| file.strip_suffix(".php"))
1391            .or_else(|| file.strip_suffix(".cs"))
1392            .or_else(|| file.strip_suffix(".swift"))
1393            .or_else(|| file.strip_suffix(".kt"))
1394            .or_else(|| file.strip_suffix(".scala"))
1395            .unwrap_or(file)
1396            .replace(['\\', '/'], "::"); // Normalize path separators
1397
1398        // Build the symbol portion
1399        let symbol_part = if let Some(ref parent) = symbol.parent {
1400            format!("{}::{}::{}", module_path, parent, symbol.name)
1401        } else {
1402            format!("{}::{}", module_path, symbol.name)
1403        };
1404
1405        // Prepend repo identity: "{namespace}/{name}::{symbol_part}" or "{name}::{symbol_part}"
1406        let repo_prefix = self.repo_id.qualified_name();
1407        if repo_prefix.is_empty() {
1408            symbol_part
1409        } else {
1410            format!("{}::{}", repo_prefix, symbol_part)
1411        }
1412    }
1413
1414    /// Detect if code is test code
1415    fn is_test_code(&self, path: &Path, symbol: &Symbol) -> bool {
1416        let path_str = path.to_string_lossy().to_lowercase();
1417
1418        // Path-based detection
1419        if path_str.contains("test") || path_str.contains("spec") || path_str.contains("__tests__")
1420        {
1421            return true;
1422        }
1423
1424        // Symbol-based detection
1425        let name = symbol.name.to_lowercase();
1426        if name.starts_with("test_") || name.ends_with("_test") || name.contains("_test_") {
1427            return true;
1428        }
1429
1430        false
1431    }
1432
1433    /// Validate repository path
1434    fn validate_repo_path(&self, path: &Path) -> Result<PathBuf, EmbedError> {
1435        let canonical = path
1436            .canonicalize()
1437            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
1438
1439        // Ensure it's a directory
1440        if !canonical.is_dir() {
1441            return Err(EmbedError::NotADirectory { path: path.to_path_buf() });
1442        }
1443
1444        Ok(canonical)
1445    }
1446
1447    /// Get safe relative path, validate no traversal
1448    fn safe_relative_path(&self, path: &Path, repo_root: &Path) -> Result<String, EmbedError> {
1449        let canonical = path
1450            .canonicalize()
1451            .map_err(|e| EmbedError::IoError { path: path.to_path_buf(), source: e })?;
1452
1453        // Ensure path is within repo root
1454        if !canonical.starts_with(repo_root) {
1455            return Err(EmbedError::PathTraversal {
1456                path: canonical,
1457                repo_root: repo_root.to_path_buf(),
1458            });
1459        }
1460
1461        // Return relative path with forward slashes (cross-platform)
1462        Ok(canonical
1463            .strip_prefix(repo_root)
1464            .unwrap_or(&canonical)
1465            .to_string_lossy()
1466            .replace('\\', "/"))
1467    }
1468
1469    /// Discover all files in repository
1470    fn discover_files(&self, repo_root: &Path) -> Result<Vec<PathBuf>, EmbedError> {
1471        use glob::Pattern;
1472        use ignore::WalkBuilder;
1473
1474        let mut files = Vec::new();
1475
1476        // Compile and validate include patterns (fail fast on invalid patterns)
1477        let mut include_patterns = Vec::new();
1478        for pattern_str in &self.settings.include_patterns {
1479            match Pattern::new(pattern_str) {
1480                Ok(pattern) => include_patterns.push(pattern),
1481                Err(e) => {
1482                    return Err(EmbedError::InvalidPattern {
1483                        pattern: pattern_str.clone(),
1484                        reason: e.to_string(),
1485                    });
1486                },
1487            }
1488        }
1489
1490        // Compile and validate exclude patterns (fail fast on invalid patterns)
1491        let mut exclude_patterns = Vec::new();
1492        for pattern_str in &self.settings.exclude_patterns {
1493            match Pattern::new(pattern_str) {
1494                Ok(pattern) => exclude_patterns.push(pattern),
1495                Err(e) => {
1496                    return Err(EmbedError::InvalidPattern {
1497                        pattern: pattern_str.clone(),
1498                        reason: e.to_string(),
1499                    });
1500                },
1501            }
1502        }
1503
1504        let walker = WalkBuilder::new(repo_root)
1505            .hidden(false) // Include hidden files
1506            .git_ignore(true) // Respect .gitignore
1507            .git_global(true)
1508            .git_exclude(true)
1509            .follow_links(false) // Security: Don't follow symlinks to prevent escaping repo
1510            .build();
1511
1512        for entry in walker {
1513            let entry = entry.map_err(|e| EmbedError::IoError {
1514                path: repo_root.to_path_buf(),
1515                source: std::io::Error::other(e.to_string()),
1516            })?;
1517
1518            let path = entry.path();
1519
1520            // Only process files
1521            if !path.is_file() {
1522                continue;
1523            }
1524
1525            // Get relative path for pattern matching
1526            let relative_path = path
1527                .strip_prefix(repo_root)
1528                .unwrap_or(path)
1529                .to_string_lossy();
1530
1531            // Check include patterns (if any, file must match at least one)
1532            if !include_patterns.is_empty()
1533                && !include_patterns.iter().any(|p| p.matches(&relative_path))
1534            {
1535                continue;
1536            }
1537
1538            // Check exclude patterns (if any match, skip file)
1539            if exclude_patterns.iter().any(|p| p.matches(&relative_path)) {
1540                continue;
1541            }
1542
1543            // Skip test files unless include_tests is true
1544            if !self.settings.include_tests && self.is_test_file(path) {
1545                continue;
1546            }
1547
1548            // Only process supported languages
1549            let ext = match path.extension().and_then(|e| e.to_str()) {
1550                Some(e) => e,
1551                None => continue,
1552            };
1553            if Language::from_extension(ext).is_none() {
1554                continue;
1555            }
1556
1557            files.push(path.to_path_buf());
1558        }
1559
1560        Ok(files)
1561    }
1562
1563    /// Check if a file is a test file based on path patterns
1564    fn is_test_file(&self, path: &Path) -> bool {
1565        let path_str = path.to_string_lossy().to_lowercase();
1566
1567        // Common test directory patterns (handle both Unix and Windows separators)
1568        if path_str.contains("/tests/")
1569            || path_str.contains("\\tests\\")
1570            || path_str.contains("/test/")
1571            || path_str.contains("\\test\\")
1572            || path_str.contains("/__tests__/")
1573            || path_str.contains("\\__tests__\\")
1574            || path_str.contains("/spec/")
1575            || path_str.contains("\\spec\\")
1576        {
1577            return true;
1578        }
1579
1580        // Common test file patterns
1581        let filename = path
1582            .file_name()
1583            .and_then(|n| n.to_str())
1584            .unwrap_or("")
1585            .to_lowercase();
1586
1587        filename.starts_with("test_")
1588            || filename.ends_with("_test.rs")
1589            || filename.ends_with("_test.py")
1590            || filename.ends_with("_test.go")
1591            || filename.ends_with(".test.ts")
1592            || filename.ends_with(".test.js")
1593            || filename.ends_with(".test.tsx")
1594            || filename.ends_with(".test.jsx")
1595            || filename.ends_with(".spec.ts")
1596            || filename.ends_with(".spec.js")
1597            || filename.ends_with("_spec.rb")
1598    }
1599
1600    /// Detect language from file path
1601    fn detect_language(&self, path: &Path) -> String {
1602        path.extension()
1603            .and_then(|e| e.to_str())
1604            .and_then(Language::from_extension)
1605            .map_or_else(|| "unknown".to_owned(), |l| l.display_name().to_owned())
1606    }
1607
1608    /// Detect the Language enum for a file path (returns None for unsupported extensions)
1609    fn detect_language_enum(&self, path: &Path) -> Option<Language> {
1610        path.extension()
1611            .and_then(|e| e.to_str())
1612            .and_then(Language::from_extension)
1613    }
1614
1615    /// Parse token model string
1616    fn parse_token_model(&self, model: &str) -> TokenModel {
1617        TokenModel::from_model_name(model).unwrap_or(TokenModel::Claude)
1618    }
1619}
1620
1621/// Extract top keywords from chunk content for BM25/sparse retrieval.
1622///
1623/// Splits content on non-alphanumeric boundaries, splits identifiers by
1624/// camelCase/snake_case, filters stopwords and short tokens, then returns
1625/// the top 10 by frequency.
1626pub(crate) fn extract_keywords(content: &str) -> Vec<String> {
1627    use std::collections::BTreeMap;
1628
1629    const STOPWORDS: &[&str] = &[
1630        "the", "a", "an", "and", "or", "not", "is", "are", "was", "were", "be", "been", "being",
1631        "have", "has", "had", "do", "does", "did", "will", "would", "could", "should", "may",
1632        "might", "shall", "can", "need", "must", "let", "var", "const", "mut", "pub", "fn", "def",
1633        "class", "struct", "enum", "impl", "trait", "use", "import", "from", "return", "if",
1634        "else", "for", "while", "loop", "match", "true", "false", "none", "null", "self", "this",
1635        "super", "new", "type", "static", "async", "await", "try", "catch", "throw", "throws",
1636        "void", "int", "str", "string", "bool", "float", "double", "char", "byte",
1637    ];
1638
1639    let mut freq: BTreeMap<String, usize> = BTreeMap::new();
1640
1641    for token in content.split(|c: char| !c.is_alphanumeric() && c != '_') {
1642        let sub_tokens = split_identifier(token);
1643        for sub in &sub_tokens {
1644            let lower = sub.to_lowercase();
1645            if lower.len() >= 3 && !STOPWORDS.contains(&lower.as_str()) {
1646                *freq.entry(lower).or_insert(0) += 1;
1647            }
1648        }
1649    }
1650
1651    let mut entries: Vec<(String, usize)> = freq.into_iter().collect();
1652    entries.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
1653    entries.into_iter().take(10).map(|(word, _)| word).collect()
1654}
1655
1656/// Generate a context prefix describing where the chunk fits in the codebase.
1657///
1658/// Examples:
1659/// - "From src/auth.rs, function"
1660/// - "From src/models/user.rs, in UserService, method"
1661pub(crate) fn generate_context_prefix(
1662    file_path: &str,
1663    parent: Option<&str>,
1664    kind: &crate::types::SymbolKind,
1665) -> String {
1666    let kind_name = match kind {
1667        crate::types::SymbolKind::Function => "function",
1668        crate::types::SymbolKind::Method => "method",
1669        crate::types::SymbolKind::Class => "class",
1670        crate::types::SymbolKind::Struct => "struct",
1671        crate::types::SymbolKind::Enum => "enum",
1672        crate::types::SymbolKind::Interface => "interface",
1673        crate::types::SymbolKind::Trait => "trait",
1674        crate::types::SymbolKind::Import => "import",
1675        crate::types::SymbolKind::Constant => "constant",
1676        crate::types::SymbolKind::Variable => "variable",
1677        crate::types::SymbolKind::TypeAlias => "type",
1678        crate::types::SymbolKind::Export => "export",
1679        crate::types::SymbolKind::Module => "module",
1680        crate::types::SymbolKind::Macro => "macro",
1681    };
1682
1683    match parent {
1684        Some(p) => format!("From {file_path}, in {p}, {kind_name}"),
1685        None => format!("From {file_path}, {kind_name}"),
1686    }
1687}
1688
/// Generate semantic tags from symbol name and signature.
///
/// Standalone function so it can be used from both `EmbedChunker` and `ChunkStream`.
/// Tags are appended in a fixed order, so the output is deterministic.
pub(crate) fn generate_tags_for_symbol(name: &str, sig: Option<&str>) -> Vec<String> {
    let signature = sig.unwrap_or("");
    let name_lower = name.to_lowercase();

    // Case-insensitive substring test against the symbol name.
    let name_has = |needles: &[&str]| needles.iter().any(|n| name_lower.contains(n));
    // Case-sensitive substring test against the signature.
    let sig_has = |needles: &[&str]| needles.iter().any(|n| signature.contains(n));

    let mut tags = Vec::new();
    let mut tag = |cond: bool, label: &str| {
        if cond {
            tags.push(label.to_owned());
        }
    };

    tag(sig_has(&["async", "await", "suspend"]), "async");
    tag(
        name_has(&["thread", "mutex", "lock", "spawn", "parallel", "goroutine", "channel"])
            || sig_has(&[
                "Mutex", "RwLock", "Arc", "chan ", "<-chan", "chan<-", "sync.", "WaitGroup",
            ]),
        "concurrency",
    );
    tag(
        name_has(&["password", "token", "secret", "auth", "crypt", "hash", "permission"])
            || sig_has(&["password", "token", "secret"]),
        "security",
    );
    tag(
        sig_has(&["Error", "Result"]) || name_has(&["error", "exception", "panic", "unwrap"]),
        "error-handling",
    );
    tag(
        name_has(&["query", "sql", "database", "db_", "repository", "transaction"])
            || name_lower.starts_with("db"),
        "database",
    );
    tag(
        name_has(&["http", "request", "response", "endpoint", "route", "handler", "middleware"]),
        "http",
    );
    tag(name_has(&["command", "cli", "arg", "flag", "option", "subcommand"]), "cli");
    tag(name_has(&["config", "setting", "preference", "option", "env"]), "config");
    tag(name_has(&["log", "trace", "debug", "warn", "info", "metric"]), "logging");
    tag(name_has(&["cache", "memoize", "invalidate"]), "cache");
    tag(name_has(&["valid", "check", "verify", "assert", "sanitize"]), "validation");
    tag(
        name_has(&[
            "serial", "deserial", "json", "xml", "yaml", "toml", "encode", "decode", "parse",
            "format",
        ]),
        "serialization",
    );
    tag(name_has(&["file", "read", "write", "path", "dir", "fs", "io"]), "io");
    tag(
        name_has(&["socket", "connect", "network", "tcp", "udp", "client", "server"]),
        "network",
    );
    tag(
        matches!(name_lower.as_str(), "new" | "init" | "setup" | "create")
            || ["new_", "init_", "create_"].iter().any(|p| name_lower.starts_with(p))
            || name_lower.ends_with("_new"),
        "init",
    );
    tag(
        name_has(&["cleanup", "teardown", "close", "dispose", "shutdown"]) || name_lower == "drop",
        "cleanup",
    );
    tag(
        // The "Test" check is deliberately case-sensitive (matches the
        // original logic): it targets PascalCase test helper names.
        name.starts_with("test_")
            || name.ends_with("_test")
            || name.contains("Test")
            || name_has(&["mock", "stub", "fixture"]),
        "test",
    );
    tag(sig_has(&["deprecated", "Deprecated"]), "deprecated");
    tag(
        ["pub fn", "pub async fn", "export"].iter().any(|p| signature.starts_with(p)),
        "public-api",
    );
    tag(
        name_has(&[
            "model", "train", "predict", "inference", "neural", "embedding", "classifier",
            "regressor", "optimizer", "loss", "gradient", "backprop", "forward", "layer",
            "activation", "weight", "bias", "epoch", "batch",
        ]) || sig_has(&["torch", "tensorflow", "keras", "sklearn", "nn.", "nn::"]),
        "ml",
    );
    tag(
        name_has(&[
            "dataframe", "dataset", "tensor", "numpy", "pandas", "array", "matrix", "vector",
            "feature", "preprocess", "normalize", "transform", "pipeline", "etl", "aggregate",
            "groupby", "pivot",
        ]) || sig_has(&["pd.", "np.", "DataFrame", "ndarray"]),
        "data-science",
    );

    tags
}
1926
1927/// Generate a natural language summary for a chunk.
1928///
1929/// Priority:
1930/// 1. First line of docstring (if available and under ~400 chars)
1931/// 2. Heuristic template based on kind, visibility, symbol name, file path, and signature
1932/// 3. `None` for import chunks
1933///
1934/// The summary is designed for semantic search — it includes key information
1935/// about what the symbol is and where it lives.
1936pub(crate) fn generate_summary(
1937    kind: ChunkKind,
1938    source: &ChunkSource,
1939    context: &ChunkContext,
1940) -> Option<String> {
1941    // Imports: no summary
1942    if kind == ChunkKind::Imports {
1943        return None;
1944    }
1945
1946    // Priority 1: Use first line of docstring if available
1947    if let Some(ref docstring) = context.docstring {
1948        let cleaned = strip_doc_markers(docstring);
1949        if !cleaned.is_empty() && cleaned.len() <= 400 {
1950            return Some(cleaned);
1951        }
1952        // If docstring is too long, extract just the first sentence/line
1953        if !cleaned.is_empty() {
1954            let first_line = extract_first_sentence(&cleaned);
1955            if !first_line.is_empty() {
1956                return Some(first_line);
1957            }
1958        }
1959    }
1960
1961    // Priority 2: Heuristic template
1962    let file_module = file_path_to_module(&source.file);
1963
1964    match kind {
1965        ChunkKind::TopLevel => {
1966            return Some(format!("Top-level code in {}", source.file));
1967        },
1968        ChunkKind::Imports => return None,
1969        _ => {},
1970    }
1971
1972    let visibility_prefix = format_visibility(source.visibility);
1973    let kind_label = kind.name();
1974    let symbol = &source.symbol;
1975
1976    match kind {
1977        ChunkKind::Function | ChunkKind::Method | ChunkKind::FunctionPart => {
1978            let sig_part = context
1979                .signature
1980                .as_deref()
1981                .map(|s| format!(" -- {}", truncate_signature(s, 200)))
1982                .unwrap_or_default();
1983            Some(format!(
1984                "{}{} '{}' in {}{}",
1985                visibility_prefix, kind_label, symbol, file_module, sig_part
1986            ))
1987        },
1988        ChunkKind::Class | ChunkKind::Struct | ChunkKind::ClassPart => {
1989            Some(format!("{}{} '{}' in {}", visibility_prefix, kind_label, symbol, file_module))
1990        },
1991        ChunkKind::Enum => {
1992            Some(format!("{}enum '{}' in {}", visibility_prefix, symbol, file_module))
1993        },
1994        ChunkKind::Interface | ChunkKind::Trait => {
1995            Some(format!("{}{} '{}' in {}", visibility_prefix, kind_label, symbol, file_module))
1996        },
1997        ChunkKind::Constant | ChunkKind::Variable => {
1998            Some(format!("{}{} '{}' in {}", visibility_prefix, kind_label, symbol, file_module))
1999        },
2000        ChunkKind::Module => {
2001            Some(format!("{}module '{}' in {}", visibility_prefix, symbol, file_module))
2002        },
2003        _ => None,
2004    }
2005}
2006
/// Strip common doc-comment markers from a docstring.
///
/// Handles: `///`, `//!`, `/**`, `*/`, `*`, `#`, `"""`, `'''`, leading whitespace.
/// Only the first line of the input is considered.
fn strip_doc_markers(docstring: &str) -> String {
    let line = docstring.lines().next().unwrap_or("").trim();

    // Leading markers, tried in order (longer variants first so that
    // e.g. "/**" wins over "/*", and "# " over "#").
    const PREFIXES: [&str; 11] = [
        "///", "//!", "/**", "/*", "*/", "* ", "*", "\"\"\"", "'''", "# ", "#",
    ];
    let mut body = line;
    for marker in PREFIXES {
        if let Some(rest) = line.strip_prefix(marker) {
            body = rest;
            break;
        }
    }

    // Trailing markers.
    let body = body.trim();
    const SUFFIXES: [&str; 3] = ["\"\"\"", "'''", "*/"];
    let mut out = body;
    for marker in SUFFIXES {
        if let Some(rest) = body.strip_suffix(marker) {
            out = rest;
            break;
        }
    }

    out.trim().to_owned()
}
2039
/// Extract the first sentence from text.
///
/// A sentence ends at the first `.`, `!`, or `?` followed by whitespace or
/// end-of-string, or at the first newline. Output is capped at 400 bytes,
/// truncated on a UTF-8 character boundary with a trailing "...".
fn extract_first_sentence(text: &str) -> String {
    // Take first line
    let first_line = text.lines().next().unwrap_or(text);

    // Find sentence boundary
    let mut end = first_line.len();
    for (i, ch) in first_line.char_indices() {
        if matches!(ch, '.' | '!' | '?') {
            let next_idx = i + ch.len_utf8();
            // Terminator must be followed by whitespace or end-of-string,
            // so e.g. "3.14" does not end the sentence.
            if next_idx >= first_line.len()
                || first_line[next_idx..].starts_with(char::is_whitespace)
            {
                end = next_idx;
                break;
            }
        }
    }

    let result = first_line[..end].trim();
    if result.len() > 400 {
        // Back the cut off to a char boundary: a raw byte slice at 397
        // could split a multi-byte UTF-8 character and panic.
        let mut cut = 397;
        while !result.is_char_boundary(cut) {
            cut -= 1;
        }
        format!("{}...", &result[..cut])
    } else {
        result.to_owned()
    }
}
2070
/// Convert a file path to a module-like notation.
///
/// Strips common prefixes (`src/`, `lib/`, `main/`), drops the file
/// extension (only a dot in the final path segment counts — a dot in a
/// directory name is left alone), and replaces `/` with `::`.
///
/// Example: `src/auth/jwt.rs` -> `auth::jwt`
fn file_path_to_module(file_path: &str) -> String {
    let path = file_path.replace('\\', "/");

    // Strip common prefixes
    let stripped = path
        .strip_prefix("src/")
        .or_else(|| path.strip_prefix("lib/"))
        .or_else(|| path.strip_prefix("main/"))
        .unwrap_or(&path);

    // Drop the file extension. Only split on a dot inside the last segment;
    // rsplit_once on the whole path would mangle directories containing dots
    // (e.g. "v1.0/util" must not become "v1").
    let without_ext = match stripped.rfind('.') {
        Some(i) if !stripped[i..].contains('/') => &stripped[..i],
        _ => stripped,
    };

    // Replace / with ::
    without_ext.replace('/', "::")
}
2093
/// Format visibility for summary output.
///
/// Returns a capitalized prefix with a trailing space ("Public ",
/// "Private ", "Protected ", "Internal ") so callers can prepend it
/// directly to a kind label, e.g. "Public function 'foo'".
fn format_visibility(vis: Visibility) -> &'static str {
    match vis {
        Visibility::Public => "Public ",
        Visibility::Private => "Private ",
        Visibility::Protected => "Protected ",
        Visibility::Internal => "Internal ",
    }
}
2106
/// Truncate a signature to at most `max_len` bytes.
///
/// Collapses the signature to a single line first. If truncated, appends
/// "..." to indicate continuation; the cut lands on a UTF-8 character
/// boundary so multi-byte content cannot cause a panic.
fn truncate_signature(sig: &str, max_len: usize) -> String {
    // Collapse to single line
    let oneliner: String = sig
        .lines()
        .map(|l| l.trim())
        .filter(|l| !l.is_empty())
        .collect::<Vec<_>>()
        .join(" ");

    if oneliner.len() <= max_len {
        return oneliner;
    }

    // Back the cut off to a char boundary: a raw byte slice could split a
    // multi-byte UTF-8 character and panic.
    let mut cut = max_len.saturating_sub(3);
    while !oneliner.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &oneliner[..cut])
}
2126
/// Split an identifier into sub-tokens by camelCase, PascalCase, and snake_case boundaries.
///
/// Examples:
/// - "getUserName" -> ["get", "User", "Name"]
/// - "get_user_name" -> ["get", "user", "name"]
/// - "HTTPClient" -> ["HTTP", "Client"]
pub(crate) fn split_identifier(ident: &str) -> Vec<String> {
    let mut parts = Vec::new();
    let mut buf = String::new();

    for c in ident.chars() {
        match c {
            // Underscores separate tokens and are dropped.
            '_' => {
                if !buf.is_empty() {
                    parts.push(std::mem::take(&mut buf));
                }
            },
            // An uppercase letter after a non-uppercase run starts a new
            // token (camelCase boundary); inside an uppercase run it just
            // extends the current acronym.
            upper if upper.is_uppercase() && !buf.is_empty() => {
                if !buf.chars().last().is_some_and(|p| p.is_uppercase()) {
                    parts.push(std::mem::take(&mut buf));
                }
                buf.push(upper);
            },
            other => {
                // Acronym-to-word boundary: when a lowercase letter follows
                // an all-uppercase run longer than one char, the run's final
                // letter belongs to the new word ("HTTPClient" -> "HTTP" + "Client").
                if other.is_lowercase()
                    && buf.len() > 1
                    && buf.chars().all(|p| p.is_uppercase())
                {
                    let carried = buf.pop().unwrap();
                    if !buf.is_empty() {
                        parts.push(std::mem::take(&mut buf));
                    }
                    buf.push(carried);
                }
                buf.push(other);
            },
        }
    }

    if !buf.is_empty() {
        parts.push(buf);
    }

    parts
}
2168
2169/// Derive a module path from a file path and language.
2170///
2171/// Converts file paths to language-idiomatic module paths:
2172/// - **Rust**: `src/auth/jwt.rs` -> `auth::jwt`, `src/lib.rs` -> root, `src/auth/mod.rs` -> `auth`
2173/// - **Python**: `src/auth/jwt.py` -> `auth.jwt`, `__init__.py` -> parent module
2174/// - **TypeScript/JavaScript**: `src/auth/jwt.ts` -> `auth/jwt`, `index.ts` -> parent
2175/// - **Java**: `src/main/java/com/foo/Bar.java` -> `com.foo`
2176/// - **Go**: `internal/auth/jwt.go` -> `auth/jwt`
2177/// - **Default**: strip `src/`/`lib/`, replace `/` with `::`, drop extension
2178pub(crate) fn derive_module_path(file_path: &str, language: &str) -> String {
2179    let path = file_path.replace('\\', "/");
2180    let lang_lower = language.to_lowercase();
2181
2182    match lang_lower.as_str() {
2183        "rust" => derive_module_path_rust(&path),
2184        "python" => derive_module_path_python(&path),
2185        "typescript" | "tsx" | "javascript" | "jsx" => derive_module_path_js(&path),
2186        "java" => derive_module_path_java(&path),
2187        "go" => derive_module_path_go(&path),
2188        _ => derive_module_path_default(&path),
2189    }
2190}
2191
/// Rust module path: strip `src/`, treat `lib.rs`/`main.rs` as the crate
/// root, collapse `mod.rs` to its directory, and join segments with `::`.
fn derive_module_path_rust(path: &str) -> String {
    let trimmed = path.strip_prefix("src/").unwrap_or(path);

    // Crate-root files map to the empty (root) module path.
    if trimmed == "lib.rs" || trimmed == "main.rs" {
        return String::new();
    }

    // `foo/mod.rs` is the module `foo`; otherwise just drop the extension.
    let module = trimmed
        .strip_suffix("/mod.rs")
        .or_else(|| trimmed.strip_suffix(".rs"))
        .unwrap_or(trimmed);

    module.replace('/', "::")
}
2214
/// Python module path: strip `src/`/`lib/`, map `__init__.py` to its parent
/// package, drop `.py`, and join segments with `.`.
fn derive_module_path_python(path: &str) -> String {
    let rel = ["src/", "lib/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    // `__init__.py` denotes the package itself.
    if rel == "__init__.py" {
        return String::new();
    }
    if let Some(parent) = rel.strip_suffix("/__init__.py") {
        return parent.replace('/', ".");
    }

    let stem = rel.strip_suffix(".py").unwrap_or(rel);
    stem.replace('/', ".")
}
2241
/// JS/TS module path: strip `src/`/`lib/`, map `index.*` files to their
/// parent directory, and drop the extension (separators stay `/`).
fn derive_module_path_js(path: &str) -> String {
    let rel = ["src/", "lib/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    // Index files denote the directory module itself.
    for suffix in ["/index.ts", "/index.tsx", "/index.js", "/index.jsx"] {
        if let Some(parent) = rel.strip_suffix(suffix) {
            return parent.to_owned();
        }
    }
    if rel.starts_with("index.") {
        return String::new();
    }

    for ext in [".ts", ".tsx", ".js", ".jsx"] {
        if let Some(stem) = rel.strip_suffix(ext) {
            return stem.to_owned();
        }
    }

    rel.to_owned()
}
2273
/// Java module path: strip the standard Maven/Gradle source roots, drop the
/// `.java` class file name, and join the remaining package path with `.`.
fn derive_module_path_java(path: &str) -> String {
    let rel = ["src/main/java/", "src/test/java/", "src/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    let no_ext = rel.strip_suffix(".java").unwrap_or(rel);

    // The package is the directory part; a bare class name means the
    // default (unnamed) package.
    match no_ext.rfind('/') {
        Some(idx) => no_ext[..idx].replace('/', "."),
        None => String::new(),
    }
}
2299
/// Go module path: strip the conventional `internal/`/`pkg/`/`cmd/` roots
/// and return the containing directory (a Go package is a directory).
fn derive_module_path_go(path: &str) -> String {
    let rel = ["internal/", "pkg/", "cmd/"]
        .iter()
        .find_map(|prefix| path.strip_prefix(prefix))
        .unwrap_or(path);

    let no_ext = rel.strip_suffix(".go").unwrap_or(rel);

    // Package = directory; a root-level file keeps its own stem.
    match no_ext.rfind('/') {
        Some(idx) => no_ext[..idx].to_owned(),
        None => no_ext.to_owned(),
    }
}
2324
/// Fallback module path: strip `src/`/`lib/`, drop the extension, and join
/// segments with `::`.
fn derive_module_path_default(path: &str) -> String {
    // Strip common prefixes
    let rel = path
        .strip_prefix("src/")
        .or_else(|| path.strip_prefix("lib/"))
        .unwrap_or(path);

    // Drop the extension. Only a dot inside the final path segment counts;
    // a bare rfind('.') over the whole path would truncate paths whose
    // directory names contain dots (e.g. "v1.2/mod" must not become "v1").
    let without_ext = match rel.rfind('.') {
        Some(idx) if !rel[idx..].contains('/') => &rel[..idx],
        _ => rel,
    };

    without_ext.replace('/', "::")
}
2343
#[cfg(test)]
mod tests {
    use super::*;
    use crate::embedding::progress::QuietProgress;
    use tempfile::TempDir;

    // Test helper: write `content` to `dir/name`, creating any intermediate
    // directories first. Panics on I/O failure, which is acceptable in tests.
    fn create_test_file(dir: &Path, name: &str, content: &str) {
        let path = dir.join(name);
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent).unwrap();
        }
        std::fs::write(path, content).unwrap();
    }

    // Constructing a chunker with explicit limits keeps settings accessible
    // and sane (non-zero token budget).
    #[test]
    fn test_chunker_creation() {
        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default();
        let chunker = EmbedChunker::new(settings, limits);
        assert!(chunker.settings.max_tokens > 0);
    }

    // A single Rust file with two functions should yield non-empty output,
    // and chunks must come back ordered by file path.
    #[test]
    fn test_chunk_single_file() {
        let temp_dir = TempDir::new().unwrap();
        let rust_code = r#"
/// A test function
fn hello() {
    println!("Hello, world!");
}

fn goodbye() {
    println!("Goodbye!");
}
"#;
        create_test_file(temp_dir.path(), "test.rs", rust_code);

        let settings = EmbedSettings::default();
        let mut chunker = EmbedChunker::with_defaults(settings);
        let progress = QuietProgress;

        let chunks = chunker
            .chunk_repository(temp_dir.path(), &progress)
            .unwrap();

        // Should have at least 2 chunks (hello and goodbye functions)
        assert!(!chunks.is_empty());

        // Check that chunks are sorted
        for i in 1..chunks.len() {
            assert!(chunks[i - 1].source.file <= chunks[i].source.file);
        }
    }

    // Three independent runs over identical inputs must produce identical
    // chunk counts and IDs — the module's core determinism guarantee.
    #[test]
    fn test_determinism() {
        let temp_dir = TempDir::new().unwrap();
        create_test_file(temp_dir.path(), "a.rs", "fn foo() {}");
        create_test_file(temp_dir.path(), "b.rs", "fn bar() {}");

        let settings = EmbedSettings::default();
        let progress = QuietProgress;

        let results: Vec<Vec<EmbedChunk>> = (0..3)
            .map(|_| {
                let mut chunker = EmbedChunker::with_defaults(settings.clone());
                chunker
                    .chunk_repository(temp_dir.path(), &progress)
                    .unwrap()
            })
            .collect();

        // All runs should produce identical results
        for i in 1..results.len() {
            assert_eq!(results[0].len(), results[i].len());
            for j in 0..results[0].len() {
                assert_eq!(results[0][j].id, results[i][j].id);
            }
        }
    }

    // A file exceeding max_file_size is skipped (non-fatal warning); since it
    // is the only file, the overall run must fail with "no chunks generated".
    #[test]
    fn test_file_too_large() {
        let temp_dir = TempDir::new().unwrap();
        // Create a file larger than 100 bytes
        let large_content = "x".repeat(200);
        create_test_file(temp_dir.path(), "large.rs", &large_content);

        let settings = EmbedSettings::default();
        let limits = ResourceLimits::default().with_max_file_size(100);
        let mut chunker = EmbedChunker::new(settings, limits);
        let progress = QuietProgress;

        // Should skip the file (warning) and return empty
        let result = chunker.chunk_repository(temp_dir.path(), &progress);

        // The chunker should produce an error about no chunks generated
        // because the only file was skipped
        assert!(result.is_err());
    }

    // An empty repository surfaces the specific NoChunksGenerated error
    // variant rather than a generic failure.
    #[test]
    fn test_empty_directory() {
        let temp_dir = TempDir::new().unwrap();

        let settings = EmbedSettings::default();
        let mut chunker = EmbedChunker::with_defaults(settings);
        let progress = QuietProgress;

        let result = chunker.chunk_repository(temp_dir.path(), &progress);

        assert!(matches!(result, Err(EmbedError::NoChunksGenerated { .. })));
    }

    // Extension-based language detection; unrecognized extensions fall back
    // to the literal string "unknown".
    #[test]
    fn test_language_detection() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        assert_eq!(chunker.detect_language(Path::new("test.rs")), "Rust");
        assert_eq!(chunker.detect_language(Path::new("test.py")), "Python");
        assert_eq!(chunker.detect_language(Path::new("test.unknown")), "unknown");
    }

    // Test-code detection triggers on either a test-like symbol name
    // (test_foo) or a test-like path (tests/...), independently.
    #[test]
    fn test_is_test_code() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        let test_symbol = Symbol::new("test_foo", crate::types::SymbolKind::Function);
        assert!(chunker.is_test_code(Path::new("foo.rs"), &test_symbol));

        let normal_symbol = Symbol::new("foo", crate::types::SymbolKind::Function);
        assert!(!chunker.is_test_code(Path::new("src/lib.rs"), &normal_symbol));

        // Test path-based detection
        assert!(chunker.is_test_code(Path::new("tests/test_foo.rs"), &normal_symbol));
    }

    // Tag generation reads both the symbol name ("authenticate" -> security)
    // and the signature ("async fn" -> async).
    #[test]
    fn test_generate_tags() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        let mut symbol = Symbol::new("authenticate_user", crate::types::SymbolKind::Function);
        symbol.signature = Some("async fn authenticate_user(password: &str)".to_owned());

        let tags = chunker.generate_tags(&symbol);
        assert!(tags.contains(&"async".to_owned()));
        assert!(tags.contains(&"security".to_owned()));
    }

    // Kotlin's `suspend fun` is the language's async marker and must map to
    // the same "async" tag used for Rust/JS.
    #[test]
    fn test_generate_tags_kotlin_suspend() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        let mut symbol = Symbol::new("fetchData", crate::types::SymbolKind::Function);
        symbol.signature = Some("suspend fun fetchData(): Result<Data>".to_owned());

        let tags = chunker.generate_tags(&symbol);
        assert!(tags.contains(&"async".to_owned()), "Kotlin suspend should be tagged as async");
    }

    // A Go `chan` parameter in the signature should produce the
    // "concurrency" tag.
    #[test]
    fn test_generate_tags_go_concurrency() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        let mut symbol = Symbol::new("processMessages", crate::types::SymbolKind::Function);
        symbol.signature = Some("func processMessages(ch chan string)".to_owned());

        let tags = chunker.generate_tags(&symbol);
        assert!(
            tags.contains(&"concurrency".to_owned()),
            "Go channels should be tagged as concurrency"
        );
    }

    // ML tagging fires on training-style names, torch types in signatures,
    // and nn.Module base classes — three separate detection paths.
    #[test]
    fn test_generate_tags_ml() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // Test ML training function
        let mut symbol = Symbol::new("train_model", crate::types::SymbolKind::Function);
        symbol.signature = Some("def train_model(epochs: int, batch_size: int)".to_owned());
        let tags = chunker.generate_tags(&symbol);
        assert!(tags.contains(&"ml".to_owned()), "train_model should be tagged as ml");

        // Test neural network layer
        let mut symbol2 = Symbol::new("forward_pass", crate::types::SymbolKind::Function);
        symbol2.signature = Some("def forward_pass(self, x: torch.Tensor)".to_owned());
        let tags2 = chunker.generate_tags(&symbol2);
        assert!(
            tags2.contains(&"ml".to_owned()),
            "torch.Tensor in signature should be tagged as ml"
        );

        // Test classifier
        let mut symbol3 = Symbol::new("ImageClassifier", crate::types::SymbolKind::Class);
        symbol3.signature = Some("class ImageClassifier(nn.Module)".to_owned());
        let tags3 = chunker.generate_tags(&symbol3);
        assert!(tags3.contains(&"ml".to_owned()), "nn.Module should be tagged as ml");
    }

    // Data-science tagging fires on pandas DataFrame / numpy ndarray types
    // and on ETL-style names (no signature required for the latter).
    #[test]
    fn test_generate_tags_data_science() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // Test DataFrame operation
        let mut symbol = Symbol::new("preprocess_dataframe", crate::types::SymbolKind::Function);
        symbol.signature = Some("def preprocess_dataframe(df: pd.DataFrame)".to_owned());
        let tags = chunker.generate_tags(&symbol);
        assert!(
            tags.contains(&"data-science".to_owned()),
            "DataFrame should be tagged as data-science"
        );

        // Test numpy array
        let mut symbol2 = Symbol::new("normalize_array", crate::types::SymbolKind::Function);
        symbol2.signature = Some("def normalize_array(arr: np.ndarray)".to_owned());
        let tags2 = chunker.generate_tags(&symbol2);
        assert!(
            tags2.contains(&"data-science".to_owned()),
            "np.ndarray should be tagged as data-science"
        );

        // Test ETL pipeline
        let symbol3 = Symbol::new("run_etl_pipeline", crate::types::SymbolKind::Function);
        let tags3 = chunker.generate_tags(&symbol3);
        assert!(tags3.contains(&"data-science".to_owned()), "etl should be tagged as data-science");
    }

    // Brace-based nesting depth: counts maximum simultaneous open brackets,
    // including () and [] (see the vec![HashMap::new()] case).
    #[test]
    fn test_brace_nesting_depth() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // Test simple nesting
        let code = "fn foo() { if x { if y { } } }";
        assert_eq!(chunker.calculate_brace_depth(code), 3);

        // Test no nesting
        let flat = "let x = 1;";
        assert_eq!(chunker.calculate_brace_depth(flat), 0);

        // Test deep nesting with all bracket types
        let deep = "fn f() { let a = vec![HashMap::new()]; }";
        assert!(chunker.calculate_brace_depth(deep) >= 2);
    }

    // Indentation-based nesting depth for brace-less languages like Python
    // (the fixture uses 4 spaces per level).
    #[test]
    fn test_indent_nesting_depth() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // Test Python-style indentation (4 spaces per level)
        let python_code = r#"
def foo():
    if x:
        if y:
            do_something()
        else:
            other()
"#;
        let depth = chunker.calculate_indent_depth(python_code);
        assert!(depth >= 3, "Should detect indentation nesting, got {}", depth);

        // Test flat code
        let flat = "x = 1\ny = 2\n";
        assert!(chunker.calculate_indent_depth(flat) <= 1);
    }

    // Combined metric: presumably takes the stronger of brace/indent signals
    // so both Rust-like and Python-like code get a sensible depth.
    #[test]
    fn test_combined_nesting_depth() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        // Brace-based should win for languages like Rust
        let rust_code = "fn foo() { if x { match y { A => {}, B => {} } } }";
        let depth = chunker.calculate_nesting_depth(rust_code);
        assert!(depth >= 3, "Should use brace depth for Rust-like code");

        // Indent-based should win for Python-like code (few braces)
        let python_code = "def foo():\n    if x:\n        y()\n";
        let depth = chunker.calculate_nesting_depth(python_code);
        assert!(depth >= 1, "Should use indent depth for Python-like code");
    }

    // LOC counting must skip blank lines and comment-only lines; the
    // tolerant 4..=5 range allows either counting of the closing brace.
    #[test]
    fn test_lines_of_code() {
        let chunker = EmbedChunker::with_defaults(EmbedSettings::default());

        let code = r#"
// This is a comment
fn foo() {
    let x = 1;

    // Another comment
    let y = 2;
}
"#;
        let loc = chunker.count_lines_of_code(code);
        // Should count: fn foo() {, let x = 1;, let y = 2;, }
        // Should skip: empty lines and comments
        assert!((4..=5).contains(&loc), "LOC should be ~4, got {}", loc);
    }

    // DoS protection: a 50k-char line (minified-code stand-in) must be
    // rejected when max_line_length is 10k; with no other files the run errors.
    #[test]
    fn test_line_too_long_error() {
        let temp_dir = TempDir::new().unwrap();

        // Create a file with a very long line (simulating minified code)
        let long_line = "x".repeat(50_000);
        let content = format!("fn foo() {{ {} }}", long_line);
        create_test_file(temp_dir.path(), "minified.rs", &content);

        let settings = EmbedSettings::default();
        // Use strict line length limit
        let limits = ResourceLimits::default().with_max_line_length(10_000);
        let mut chunker = EmbedChunker::new(settings, limits);
        let progress = QuietProgress;

        let result = chunker.chunk_repository(temp_dir.path(), &progress);

        // Should fail due to line too long
        assert!(result.is_err(), "Should reject files with very long lines");
    }

    // End-to-end hierarchy check: enabling hierarchy may add container
    // summary chunks but must never lose chunks, and must stay deterministic.
    #[test]
    fn test_hierarchical_chunking_integration() {
        let temp_dir = TempDir::new().unwrap();

        // Create a Rust file with a struct that has multiple methods
        let rust_code = r#"
/// A user account
pub struct User {
    pub name: String,
    pub email: String,
}

impl User {
    /// Create a new user
    pub fn new(name: String, email: String) -> Self {
        Self { name, email }
    }

    /// Get the user's display name
    pub fn display_name(&self) -> &str {
        &self.name
    }

    /// Validate the user's email
    pub fn validate_email(&self) -> bool {
        self.email.contains('@')
    }
}
"#;
        create_test_file(temp_dir.path(), "user.rs", rust_code);

        // Test WITHOUT hierarchy
        let settings_no_hierarchy = EmbedSettings { enable_hierarchy: false, ..Default::default() };
        let mut chunker_no_hierarchy = EmbedChunker::with_defaults(settings_no_hierarchy);
        let progress = QuietProgress;
        let chunks_no_hierarchy = chunker_no_hierarchy
            .chunk_repository(temp_dir.path(), &progress)
            .unwrap();

        // Test WITH hierarchy
        let settings_with_hierarchy = EmbedSettings {
            enable_hierarchy: true,
            hierarchy_min_children: 2,
            ..Default::default()
        };
        let mut chunker_with_hierarchy = EmbedChunker::with_defaults(settings_with_hierarchy);
        let chunks_with_hierarchy = chunker_with_hierarchy
            .chunk_repository(temp_dir.path(), &progress)
            .unwrap();

        // Hierarchy should produce more chunks (original + summaries)
        assert!(
            chunks_with_hierarchy.len() >= chunks_no_hierarchy.len(),
            "Hierarchy should produce at least as many chunks: {} vs {}",
            chunks_with_hierarchy.len(),
            chunks_no_hierarchy.len()
        );

        // Check for ContainerSummary chunks when hierarchy is enabled
        let summary_chunks: Vec<_> = chunks_with_hierarchy
            .iter()
            .filter(|c| matches!(c.kind, ChunkKind::Module)) // Summary chunks use Module kind
            .collect();

        // If we have container types with enough children, we should have summaries
        // Note: This depends on the parser correctly identifying struct + impl methods
        if !summary_chunks.is_empty() {
            // Summary chunks should have content referencing children
            for summary in &summary_chunks {
                assert!(!summary.content.is_empty(), "Summary chunk should have content");
            }
        }

        // Verify determinism with hierarchy enabled
        let chunks_with_hierarchy_2 = chunker_with_hierarchy
            .chunk_repository(temp_dir.path(), &progress)
            .unwrap();
        assert_eq!(
            chunks_with_hierarchy.len(),
            chunks_with_hierarchy_2.len(),
            "Hierarchical chunking should be deterministic"
        );
        for (c1, c2) in chunks_with_hierarchy
            .iter()
            .zip(chunks_with_hierarchy_2.iter())
        {
            assert_eq!(c1.id, c2.id, "Chunk IDs should be identical across runs");
        }
    }

    // When a docstring exists, the summary is the docstring with its
    // comment markers (///) stripped — no heuristic text.
    #[test]
    fn test_summary_from_docstring() {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "src/auth/jwt.rs".to_owned(),
            lines: (10, 20),
            symbol: "verify_token".to_owned(),
            fqn: None,
            language: "Rust".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };
        let context = ChunkContext {
            docstring: Some("/// Verify a JWT token and return the claims.".to_owned()),
            signature: Some("pub fn verify_token(token: &str) -> Result<Claims>".to_owned()),
            ..Default::default()
        };

        let summary = generate_summary(ChunkKind::Function, &source, &context);
        assert_eq!(summary, Some("Verify a JWT token and return the claims.".to_owned()));
    }

    // Without a docstring, functions get a heuristic summary built from
    // visibility + name + module path (file path minus src/ and extension)
    // + signature.
    #[test]
    fn test_summary_heuristic_for_function() {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "src/auth/jwt.rs".to_owned(),
            lines: (10, 20),
            symbol: "verify_token".to_owned(),
            fqn: None,
            language: "Rust".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };
        let context = ChunkContext {
            signature: Some("pub fn verify_token(token: &str) -> Result<Claims>".to_owned()),
            ..Default::default()
        };

        let summary = generate_summary(ChunkKind::Function, &source, &context);
        assert_eq!(
            summary,
            Some(
                "Public function 'verify_token' in auth::jwt -- pub fn verify_token(token: &str) -> Result<Claims>"
                    .to_owned()
            )
        );
    }

    // Heuristic summary for a class with neither docstring nor signature:
    // just visibility + kind + name + module path.
    #[test]
    fn test_summary_heuristic_for_struct() {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "lib/models/user.py".to_owned(),
            lines: (1, 30),
            symbol: "User".to_owned(),
            fqn: None,
            language: "Python".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };
        let context = ChunkContext::default();

        let summary = generate_summary(ChunkKind::Class, &source, &context);
        assert_eq!(summary, Some("Public class 'User' in models::user".to_owned()));
    }

    // Import chunks carry no semantic content worth summarizing; the
    // generator must return None for them.
    #[test]
    fn test_summary_none_for_imports() {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "src/lib.rs".to_owned(),
            lines: (1, 5),
            symbol: "<imports>".to_owned(),
            fqn: None,
            language: "Rust".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };
        let context = ChunkContext::default();

        let summary = generate_summary(ChunkKind::Imports, &source, &context);
        assert!(summary.is_none(), "Import chunks should not have a summary");
    }

    // A pathological 50-argument signature must be truncated with "..." so
    // the summary stays bounded (~200-char signature part, <350 total).
    #[test]
    fn test_summary_long_signature_truncated() {
        let long_sig = format!(
            "pub fn process({})",
            (0..50)
                .map(|i| format!("arg{}: SomeVeryLongTypeName", i))
                .collect::<Vec<_>>()
                .join(", ")
        );
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "src/processor.rs".to_owned(),
            lines: (1, 100),
            symbol: "process".to_owned(),
            fqn: None,
            language: "Rust".to_owned(),
            parent: None,
            visibility: Visibility::Private,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };
        let context = ChunkContext { signature: Some(long_sig), ..Default::default() };

        let summary = generate_summary(ChunkKind::Function, &source, &context).unwrap();
        // The signature part should be truncated to ~200 chars
        assert!(summary.contains("..."), "Long signature should be truncated with ellipsis");
        // The total summary should still be reasonable length
        assert!(summary.len() < 350, "Summary should be concise, got len={}", summary.len());
    }

    // Top-level chunks get a fixed file-based summary (full path, not the
    // module-path form used for named symbols).
    #[test]
    fn test_summary_top_level() {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "src/main.rs".to_owned(),
            lines: (1, 50),
            symbol: "<top_level>".to_owned(),
            fqn: None,
            language: "Rust".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };
        let context = ChunkContext::default();

        let summary = generate_summary(ChunkKind::TopLevel, &source, &context);
        assert_eq!(summary, Some("Top-level code in src/main.rs".to_owned()));
    }

    // Path-to-module mapping strips src/ and lib/ roots (but not arbitrary
    // ones like main/ or other/), drops the extension, and joins with ::.
    #[test]
    fn test_file_path_to_module() {
        assert_eq!(file_path_to_module("src/auth/jwt.rs"), "auth::jwt");
        assert_eq!(file_path_to_module("lib/models/user.py"), "models::user");
        assert_eq!(file_path_to_module("main/app.ts"), "app");
        assert_eq!(file_path_to_module("other/deep/path.go"), "other::deep::path");
    }

    // Doc-marker stripping must handle Rust (///, //!), C-style (/** */),
    // Python (#, triple quotes), Javadoc continuation (*), and plain text.
    #[test]
    fn test_strip_doc_markers() {
        assert_eq!(strip_doc_markers("/// Hello world"), "Hello world");
        assert_eq!(strip_doc_markers("//! Module doc"), "Module doc");
        assert_eq!(strip_doc_markers("/** Java doc */"), "Java doc");
        assert_eq!(strip_doc_markers("# Python doc"), "Python doc");
        assert_eq!(strip_doc_markers("\"\"\"Triple quoted\"\"\""), "Triple quoted");
        assert_eq!(strip_doc_markers("  * Javadoc line"), "Javadoc line");
        assert_eq!(strip_doc_markers("Plain text"), "Plain text");
    }

    // Python triple-quoted docstrings also flow through marker stripping
    // when used as the summary source.
    #[test]
    fn test_summary_with_python_docstring() {
        let source = ChunkSource {
            repo: RepoIdentifier::default(),
            file: "src/utils.py".to_owned(),
            lines: (1, 10),
            symbol: "parse_config".to_owned(),
            fqn: None,
            language: "Python".to_owned(),
            parent: None,
            visibility: Visibility::Public,
            is_test: false,
            module_path: None,
            parent_chunk_id: None,
        };
        let context = ChunkContext {
            docstring: Some("\"\"\"Parse configuration from a YAML file.\"\"\"".to_owned()),
            ..Default::default()
        };

        let summary = generate_summary(ChunkKind::Function, &source, &context);
        assert_eq!(summary, Some("Parse configuration from a YAML file.".to_owned()));
    }
}