Skip to main content

codetether_agent/indexer/
mod.rs

1//! Codebase indexer
2//!
3//! Builds a lightweight, persistent JSON index of source files for fast
4//! workspace introspection and downstream search/ranking workflows.
5
6use anyhow::{Context, Result};
7use chrono::{DateTime, Utc};
8use serde::{Deserialize, Serialize};
9use sha2::{Digest, Sha256};
10use std::collections::{HashMap, HashSet};
11use std::path::{Path, PathBuf};
12use std::sync::Arc;
13use std::time::Duration;
14use walkdir::{DirEntry, WalkDir};
15
// ---- Schema versions --------------------------------------------------------

/// Schema version stamped into `CodebaseIndex::version`.
const INDEX_VERSION: u32 = 3;
/// Schema version stamped into `KnowledgeGraph::version`.
const KNOWLEDGE_GRAPH_VERSION: u32 = 1;

// ---- Embedding provider / model labels --------------------------------------

// NOTE(review): not referenced in the visible part of this file; presumably the
// label reported by the local hash-embedding backend — confirm against its user.
const LOCAL_EMBEDDING_PROVIDER: &str = "local/hash-embedding";
/// Provider name used by `BuildOptions::default()` and when the CLI value is blank.
const DEFAULT_EMBEDDING_PROVIDER: &str = "local";
/// Model name used by `BuildOptions::default()` and when the CLI value is blank.
const DEFAULT_EMBEDDING_MODEL: &str = "hash-v1";
/// Provider label recorded in the index when embeddings are not computed.
const DISABLED_EMBEDDING_PROVIDER: &str = "disabled";
/// Model label recorded in the index when embeddings are not computed.
const DISABLED_EMBEDDING_MODEL: &str = "disabled";

// ---- Defaults for `BuildOptions` --------------------------------------------

/// Default value of `BuildOptions::embedding_dimensions`.
const DEFAULT_EMBEDDING_DIMENSIONS: usize = 384;
/// Default value of `BuildOptions::embedding_batch_size`.
const DEFAULT_EMBEDDING_BATCH_SIZE: usize = 32;
/// Default value of `BuildOptions::embedding_input_chars`.
const DEFAULT_EMBEDDING_INPUT_CHARS: usize = 8_000;
/// Default value of `BuildOptions::embedding_max_retries`.
const DEFAULT_EMBEDDING_MAX_RETRIES: u32 = 3;
/// Default value of `BuildOptions::embedding_retry_initial_ms`.
const DEFAULT_EMBEDDING_RETRY_INITIAL_MS: u64 = 250;
/// Default value of `BuildOptions::embedding_retry_max_ms`.
const DEFAULT_EMBEDDING_RETRY_MAX_MS: u64 = 2_000;
/// File-size cap (512 KiB) used by `refresh_workspace_knowledge_snapshot`.
const DEFAULT_RUN_KNOWLEDGE_MAX_FILE_SIZE_BYTES: u64 = 512 * 1024;
30
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct CodebaseIndex {
33    pub version: u32,
34    pub root: String,
35    pub generated_at: DateTime<Utc>,
36    pub embedding_provider: String,
37    pub embedding_model: String,
38    pub stats: IndexStats,
39    pub files: Vec<IndexedFile>,
40    pub knowledge_graph: KnowledgeGraph,
41}
42
43#[derive(Debug, Clone, Default, Serialize, Deserialize)]
44pub struct IndexStats {
45    pub total_seen_files: u64,
46    pub indexed_files: u64,
47    pub skipped_hidden: u64,
48    pub skipped_non_text: u64,
49    pub skipped_large: u64,
50    pub skipped_io_errors: u64,
51    pub total_bytes: u64,
52    pub total_lines: u64,
53    pub embedded_files: u64,
54    pub embedding_dimensions: u32,
55    pub embedding_prompt_tokens: u64,
56    pub embedding_total_tokens: u64,
57    pub language_counts: HashMap<String, u64>,
58}
59
60#[derive(Debug, Clone, Serialize, Deserialize)]
61pub struct IndexedFile {
62    pub path: String,
63    pub language: String,
64    pub bytes: u64,
65    pub lines: u32,
66    pub symbol_hints: u32,
67    pub modified_unix_ms: Option<i64>,
68    pub embedding: Vec<f32>,
69}
70
71#[derive(Debug, Clone, Default, Serialize, Deserialize)]
72pub struct KnowledgeGraphStats {
73    pub file_nodes: u64,
74    pub symbol_nodes: u64,
75    pub module_nodes: u64,
76    pub symbol_reference_nodes: u64,
77    pub edges: u64,
78}
79
80#[derive(Debug, Clone, Serialize, Deserialize)]
81pub struct KnowledgeGraph {
82    pub version: u32,
83    pub nodes: Vec<KnowledgeNode>,
84    pub edges: Vec<KnowledgeEdge>,
85    pub stats: KnowledgeGraphStats,
86}
87
88#[derive(Debug, Clone, Serialize, Deserialize)]
89pub struct KnowledgeNode {
90    pub id: String,
91    pub kind: String,
92    pub label: String,
93    pub file_path: Option<String>,
94    pub language: Option<String>,
95    pub symbol_kind: Option<String>,
96    pub line: Option<u32>,
97    pub external: bool,
98}
99
100#[derive(Debug, Clone, Serialize, Deserialize)]
101pub struct KnowledgeEdge {
102    pub source: String,
103    pub target: String,
104    pub kind: String,
105}
106
/// Settings that control how `build_index` walks the workspace and embeds files.
#[derive(Clone, Debug)]
pub struct BuildOptions {
    /// Index hidden files and descend into hidden directories.
    pub include_hidden: bool,
    /// Compute an embedding vector for every indexed file.
    pub include_embeddings: bool,
    /// Files larger than this many bytes are skipped.
    pub max_file_size_bytes: u64,
    /// Requested embedding provider name.
    pub embedding_provider: String,
    /// Requested embedding model name.
    pub embedding_model: String,
    /// Expected embedding vector length (raised to at least 64 when building).
    pub embedding_dimensions: usize,
    /// Number of inputs sent per embedding batch (raised to at least 1 when
    /// building).
    pub embedding_batch_size: usize,
    /// Character budget handed to `build_embedding_input` for each file.
    pub embedding_input_chars: usize,
    // NOTE(review): the three retry settings are not read in the visible part of
    // this file; presumably consumed by the remote embedding backend — confirm.
    /// Maximum number of embedding retries.
    pub embedding_max_retries: u32,
    /// Initial retry delay in milliseconds.
    pub embedding_retry_initial_ms: u64,
    /// Upper bound for the retry delay in milliseconds.
    pub embedding_retry_max_ms: u64,
}
121
122impl Default for BuildOptions {
123    fn default() -> Self {
124        Self {
125            include_hidden: false,
126            include_embeddings: true,
127            max_file_size_bytes: 1024 * 1024,
128            embedding_provider: DEFAULT_EMBEDDING_PROVIDER.to_string(),
129            embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
130            embedding_dimensions: DEFAULT_EMBEDDING_DIMENSIONS,
131            embedding_batch_size: DEFAULT_EMBEDDING_BATCH_SIZE,
132            embedding_input_chars: DEFAULT_EMBEDDING_INPUT_CHARS,
133            embedding_max_retries: DEFAULT_EMBEDDING_MAX_RETRIES,
134            embedding_retry_initial_ms: DEFAULT_EMBEDDING_RETRY_INITIAL_MS,
135            embedding_retry_max_ms: DEFAULT_EMBEDDING_RETRY_MAX_MS,
136        }
137    }
138}
139
140pub async fn run(args: crate::cli::IndexArgs) -> Result<()> {
141    let root = args
142        .path
143        .clone()
144        .unwrap_or_else(|| std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")));
145    let root = root.canonicalize().unwrap_or_else(|_| root.clone());
146
147    let options = BuildOptions {
148        include_hidden: args.include_hidden,
149        include_embeddings: true,
150        max_file_size_bytes: args.max_file_size_kib.saturating_mul(1024),
151        embedding_provider: if args.embedding_provider.trim().is_empty() {
152            DEFAULT_EMBEDDING_PROVIDER.to_string()
153        } else {
154            args.embedding_provider.clone()
155        },
156        embedding_model: if args.embedding_model.trim().is_empty() {
157            DEFAULT_EMBEDDING_MODEL.to_string()
158        } else {
159            args.embedding_model.clone()
160        },
161        embedding_dimensions: args.embedding_dimensions.max(64),
162        embedding_batch_size: args.embedding_batch_size.max(1),
163        embedding_input_chars: args.embedding_input_chars.max(256),
164        embedding_max_retries: args.embedding_max_retries,
165        embedding_retry_initial_ms: args.embedding_retry_initial_ms.max(1),
166        embedding_retry_max_ms: args
167            .embedding_retry_max_ms
168            .max(args.embedding_retry_initial_ms.max(1)),
169    };
170
171    let index = build_index(&root, &options).await?;
172    let output_path = args.output.unwrap_or_else(|| default_index_path(&root));
173
174    if let Some(parent) = output_path.parent() {
175        tokio::fs::create_dir_all(parent).await?;
176    }
177
178    let encoded = serde_json::to_string_pretty(&index)?;
179    tokio::fs::write(&output_path, encoded).await?;
180
181    if args.json {
182        let payload = serde_json::json!({
183            "index_path": output_path,
184            "root": index.root,
185            "generated_at": index.generated_at,
186            "embedding_provider": index.embedding_provider,
187            "embedding_model": index.embedding_model,
188            "stats": index.stats,
189            "knowledge_graph": index.knowledge_graph.stats,
190        });
191        println!("{}", serde_json::to_string_pretty(&payload)?);
192    } else {
193        println!("# Codebase Index Built\n");
194        println!("- Root: {}", index.root);
195        println!("- Output: {}", output_path.display());
196        println!(
197            "- Embeddings: {}/{}",
198            index.embedding_provider, index.embedding_model
199        );
200        println!("- Indexed files: {}", index.stats.indexed_files);
201        println!("- Embedded files: {}", index.stats.embedded_files);
202        println!(
203            "- Embedding dimensions: {}",
204            index.stats.embedding_dimensions
205        );
206        println!("- Total lines: {}", index.stats.total_lines);
207        println!("- Total bytes: {}", index.stats.total_bytes);
208        println!(
209            "- Knowledge graph: {} nodes / {} edges",
210            index.knowledge_graph.nodes.len(),
211            index.knowledge_graph.edges.len()
212        );
213        if !index.stats.language_counts.is_empty() {
214            let mut langs: Vec<_> = index.stats.language_counts.iter().collect();
215            langs.sort_by(|a, b| b.1.cmp(a.1).then_with(|| a.0.cmp(b.0)));
216            println!("\nTop languages:");
217            for (lang, count) in langs.into_iter().take(10) {
218                println!("- {}: {} files", lang, count);
219            }
220        }
221    }
222
223    Ok(())
224}
225
226pub async fn refresh_workspace_knowledge_snapshot(root: &Path) -> Result<PathBuf> {
227    let root = root.canonicalize().unwrap_or_else(|_| root.to_path_buf());
228    let options = BuildOptions {
229        include_hidden: false,
230        include_embeddings: false,
231        max_file_size_bytes: DEFAULT_RUN_KNOWLEDGE_MAX_FILE_SIZE_BYTES,
232        ..BuildOptions::default()
233    };
234    let index = build_index(&root, &options).await?;
235    let output_path = default_knowledge_graph_path(&root);
236
237    if let Some(parent) = output_path.parent() {
238        tokio::fs::create_dir_all(parent).await?;
239    }
240
241    let encoded = serde_json::to_string_pretty(&index)?;
242    tokio::fs::write(&output_path, encoded).await?;
243    Ok(output_path)
244}
245
246#[derive(Debug, Clone)]
247struct AnalyzedFileKnowledge {
248    file_node: KnowledgeNode,
249    symbol_nodes: Vec<KnowledgeNode>,
250    imported_modules: Vec<String>,
251    imported_symbols: Vec<String>,
252}
253
254pub async fn build_index(root: &Path, options: &BuildOptions) -> Result<CodebaseIndex> {
255    let mut stats = IndexStats::default();
256    let mut files = Vec::new();
257    let mut embedding_inputs = Vec::new();
258    let mut knowledge_inputs = Vec::new();
259
260    let walker = WalkDir::new(root)
261        .follow_links(false)
262        .into_iter()
263        .filter_entry(|entry| should_descend(entry, root, options.include_hidden));
264
265    for entry in walker.filter_map(std::result::Result::ok) {
266        let path = entry.path();
267        if !path.is_file() {
268            continue;
269        }
270
271        stats.total_seen_files += 1;
272
273        let rel_path = path.strip_prefix(root).unwrap_or(path);
274
275        if !options.include_hidden && is_hidden_path(rel_path) {
276            stats.skipped_hidden += 1;
277            continue;
278        }
279
280        let metadata = match std::fs::metadata(path) {
281            Ok(meta) => meta,
282            Err(_) => {
283                stats.skipped_io_errors += 1;
284                continue;
285            }
286        };
287
288        if metadata.len() > options.max_file_size_bytes {
289            stats.skipped_large += 1;
290            continue;
291        }
292
293        if !is_probably_text_file(path) {
294            stats.skipped_non_text += 1;
295            continue;
296        }
297
298        let content = match std::fs::read_to_string(path) {
299            Ok(text) => text,
300            Err(_) => {
301                stats.skipped_non_text += 1;
302                continue;
303            }
304        };
305
306        let language = detect_language(path);
307        let lines = if content.is_empty() {
308            0
309        } else {
310            (content.as_bytes().iter().filter(|b| **b == b'\n').count() + 1) as u32
311        };
312        let symbol_hints = estimate_symbol_hints(path, &content);
313
314        let rel_path = rel_path.to_string_lossy().to_string();
315
316        let modified_unix_ms = metadata
317            .modified()
318            .ok()
319            .and_then(|ts| ts.duration_since(std::time::UNIX_EPOCH).ok())
320            .map(|dur| dur.as_millis() as i64);
321
322        files.push(IndexedFile {
323            path: rel_path.clone(),
324            language: language.clone(),
325            bytes: metadata.len(),
326            lines,
327            symbol_hints,
328            modified_unix_ms,
329            embedding: Vec::new(),
330        });
331        knowledge_inputs.push(analyze_file_knowledge(
332            &rel_path,
333            &language,
334            lines,
335            metadata.len(),
336            modified_unix_ms,
337            &content,
338        ));
339        embedding_inputs.push(build_embedding_input(
340            &rel_path,
341            &language,
342            &content,
343            options.embedding_input_chars,
344        ));
345
346        stats.indexed_files += 1;
347        stats.total_bytes += metadata.len();
348        stats.total_lines += u64::from(lines);
349        *stats.language_counts.entry(language).or_insert(0) += 1;
350    }
351
352    let (embedding_provider, embedding_model) = if options.include_embeddings {
353        let backend = resolve_embedding_backend(options).await?;
354        let batch_size = options.embedding_batch_size.max(1);
355        stats.embedding_dimensions = options.embedding_dimensions.max(64) as u32;
356
357        for start in (0..embedding_inputs.len()).step_by(batch_size) {
358            let end = (start + batch_size).min(embedding_inputs.len());
359            let embedding_slice = &embedding_inputs[start..end];
360            let (vectors, usage) = match &backend {
361                EmbeddingBackend::Local { engine, .. } => {
362                    let vectors = engine.embed_batch(embedding_slice);
363                    let mut local_prompt_tokens = 0u64;
364                    let mut local_total_tokens = 0u64;
365                    for input in embedding_slice {
366                        let approx_tokens = approximate_token_count(input);
367                        local_prompt_tokens += approx_tokens;
368                        local_total_tokens += approx_tokens;
369                    }
370                    (vectors, (local_prompt_tokens, local_total_tokens))
371                }
372                EmbeddingBackend::Remote(engine) => {
373                    let response =
374                        engine.embed_batch(embedding_slice).await.with_context(|| {
375                            format!(
376                                "failed embedding batch {}-{} via provider {}/{}",
377                                start, end, engine.provider_name, engine.model
378                            )
379                        })?;
380
381                    let vectors = response.embeddings;
382                    let prompt_tokens = response.usage.prompt_tokens as u64;
383                    let total_tokens = response.usage.total_tokens as u64;
384                    (vectors, (prompt_tokens, total_tokens))
385                }
386            };
387
388            stats.embedding_prompt_tokens += usage.0;
389            stats.embedding_total_tokens += usage.1;
390
391            for (offset, vector) in vectors.into_iter().enumerate() {
392                let dim = vector.len() as u32;
393                if dim != stats.embedding_dimensions {
394                    anyhow::bail!(
395                        "embedding dimension mismatch: expected {}, got {} (provider: {}, model: {})",
396                        stats.embedding_dimensions,
397                        dim,
398                        backend.provider_name(),
399                        backend.model_name(),
400                    );
401                }
402
403                files[start + offset].embedding = vector;
404                stats.embedded_files += 1;
405            }
406        }
407
408        (
409            backend.provider_name().to_string(),
410            backend.model_name().to_string(),
411        )
412    } else {
413        (
414            DISABLED_EMBEDDING_PROVIDER.to_string(),
415            DISABLED_EMBEDDING_MODEL.to_string(),
416        )
417    };
418
419    files.sort_by(|a, b| a.path.cmp(&b.path));
420    let knowledge_graph = build_knowledge_graph(knowledge_inputs);
421
422    Ok(CodebaseIndex {
423        version: INDEX_VERSION,
424        root: root.display().to_string(),
425        generated_at: Utc::now(),
426        embedding_provider,
427        embedding_model,
428        stats,
429        files,
430        knowledge_graph,
431    })
432}
433
434fn build_knowledge_graph(files: Vec<AnalyzedFileKnowledge>) -> KnowledgeGraph {
435    let mut nodes = Vec::new();
436    let mut edges = Vec::new();
437    let mut seen_edges: HashSet<(String, String, String)> = HashSet::new();
438    let mut symbol_index: HashMap<String, Vec<String>> = HashMap::new();
439    let mut module_nodes: HashMap<String, String> = HashMap::new();
440    let mut symbol_ref_nodes: HashMap<String, String> = HashMap::new();
441    let mut stats = KnowledgeGraphStats::default();
442
443    for file in &files {
444        nodes.push(file.file_node.clone());
445        stats.file_nodes += 1;
446
447        for symbol in &file.symbol_nodes {
448            nodes.push(symbol.clone());
449            stats.symbol_nodes += 1;
450            symbol_index
451                .entry(symbol.label.clone())
452                .or_default()
453                .push(symbol.id.clone());
454            push_knowledge_edge(
455                &mut edges,
456                &mut seen_edges,
457                &file.file_node.id,
458                &symbol.id,
459                "defines",
460            );
461        }
462    }
463
464    for file in files {
465        for module in file.imported_modules {
466            let module_id = module_nodes
467                .entry(module.clone())
468                .or_insert_with(|| {
469                    stats.module_nodes += 1;
470                    let id = module_node_id(&module);
471                    nodes.push(KnowledgeNode {
472                        id: id.clone(),
473                        kind: "module".to_string(),
474                        label: module.clone(),
475                        file_path: None,
476                        language: None,
477                        symbol_kind: None,
478                        line: None,
479                        external: true,
480                    });
481                    id
482                })
483                .clone();
484
485            push_knowledge_edge(
486                &mut edges,
487                &mut seen_edges,
488                &file.file_node.id,
489                &module_id,
490                "imports_module",
491            );
492        }
493
494        for imported_symbol in file.imported_symbols {
495            let target_ids = symbol_index
496                .get(&imported_symbol)
497                .filter(|targets| !targets.is_empty() && targets.len() <= 8)
498                .cloned();
499
500            if let Some(target_ids) = target_ids {
501                for target_id in target_ids {
502                    push_knowledge_edge(
503                        &mut edges,
504                        &mut seen_edges,
505                        &file.file_node.id,
506                        &target_id,
507                        "imports_symbol",
508                    );
509                }
510                continue;
511            }
512
513            let symbol_ref_id = symbol_ref_nodes
514                .entry(imported_symbol.clone())
515                .or_insert_with(|| {
516                    stats.symbol_reference_nodes += 1;
517                    let id = external_symbol_node_id(&imported_symbol);
518                    nodes.push(KnowledgeNode {
519                        id: id.clone(),
520                        kind: "symbol_ref".to_string(),
521                        label: imported_symbol.clone(),
522                        file_path: None,
523                        language: None,
524                        symbol_kind: None,
525                        line: None,
526                        external: true,
527                    });
528                    id
529                })
530                .clone();
531
532            push_knowledge_edge(
533                &mut edges,
534                &mut seen_edges,
535                &file.file_node.id,
536                &symbol_ref_id,
537                "imports_symbol",
538            );
539        }
540    }
541
542    nodes.sort_by(|a, b| a.id.cmp(&b.id));
543    edges.sort_by(|a, b| {
544        a.source
545            .cmp(&b.source)
546            .then_with(|| a.target.cmp(&b.target))
547            .then_with(|| a.kind.cmp(&b.kind))
548    });
549    stats.edges = edges.len() as u64;
550
551    KnowledgeGraph {
552        version: KNOWLEDGE_GRAPH_VERSION,
553        nodes,
554        edges,
555        stats,
556    }
557}
558
559fn push_knowledge_edge(
560    edges: &mut Vec<KnowledgeEdge>,
561    seen_edges: &mut HashSet<(String, String, String)>,
562    source: &str,
563    target: &str,
564    kind: &str,
565) {
566    let key = (source.to_string(), target.to_string(), kind.to_string());
567    if seen_edges.insert(key.clone()) {
568        edges.push(KnowledgeEdge {
569            source: key.0,
570            target: key.1,
571            kind: key.2,
572        });
573    }
574}
575
576fn analyze_file_knowledge(
577    rel_path: &str,
578    language: &str,
579    _lines: u32,
580    _bytes: u64,
581    _modified_unix_ms: Option<i64>,
582    content: &str,
583) -> AnalyzedFileKnowledge {
584    let file_id = file_node_id(rel_path);
585    let mut symbol_nodes = Vec::new();
586    let mut imported_modules = Vec::new();
587    let mut imported_symbols = Vec::new();
588    let mut seen_symbols: HashSet<(String, u32, String)> = HashSet::new();
589    let mut go_import_block = false;
590
591    for (idx, raw_line) in content.lines().enumerate() {
592        let line_no = idx as u32 + 1;
593        let line = raw_line.trim();
594        if line.is_empty() {
595            continue;
596        }
597
598        if let Some((symbol_kind, name)) = extract_symbol_definition(language, line) {
599            let key = (name.clone(), line_no, symbol_kind.to_string());
600            if seen_symbols.insert(key) {
601                symbol_nodes.push(KnowledgeNode {
602                    id: symbol_node_id(rel_path, &name, line_no),
603                    kind: "symbol".to_string(),
604                    label: name,
605                    file_path: Some(rel_path.to_string()),
606                    language: Some(language.to_string()),
607                    symbol_kind: Some(symbol_kind.to_string()),
608                    line: Some(line_no),
609                    external: false,
610                });
611            }
612        }
613
614        extract_import_references(
615            language,
616            line,
617            &mut go_import_block,
618            &mut imported_modules,
619            &mut imported_symbols,
620        );
621    }
622
623    imported_modules.sort();
624    imported_modules.dedup();
625    imported_symbols.sort();
626    imported_symbols.dedup();
627
628    let file_node = KnowledgeNode {
629        id: file_id,
630        kind: "file".to_string(),
631        label: rel_path.to_string(),
632        file_path: Some(rel_path.to_string()),
633        language: Some(language.to_string()),
634        symbol_kind: None,
635        line: None,
636        external: false,
637    };
638
639    AnalyzedFileKnowledge {
640        file_node,
641        symbol_nodes,
642        imported_modules,
643        imported_symbols,
644    }
645}
646
647fn extract_symbol_definition(language: &str, line: &str) -> Option<(&'static str, String)> {
648    match language {
649        "rust" => extract_rust_symbol_definition(line),
650        "python" => extract_python_symbol_definition(line),
651        "typescript" | "javascript" => extract_script_symbol_definition(line),
652        "go" => extract_go_symbol_definition(line),
653        _ => None,
654    }
655}
656
657fn extract_rust_symbol_definition(line: &str) -> Option<(&'static str, String)> {
658    let normalized = strip_prefixes(
659        line,
660        &[
661            "pub(crate) ",
662            "pub(super) ",
663            "pub(self) ",
664            "pub ",
665            "async ",
666            "unsafe ",
667        ],
668    );
669
670    for (keyword, kind) in [
671        ("fn", "function"),
672        ("struct", "struct"),
673        ("enum", "enum"),
674        ("trait", "trait"),
675        ("mod", "module"),
676        ("type", "type"),
677        ("const", "const"),
678        ("static", "static"),
679    ] {
680        if let Some(name) = extract_identifier_after_keyword(normalized, keyword) {
681            return Some((kind, name));
682        }
683    }
684
685    None
686}
687
688fn extract_python_symbol_definition(line: &str) -> Option<(&'static str, String)> {
689    let normalized = strip_prefixes(line, &["async "]);
690    if let Some(name) = extract_identifier_after_keyword(normalized, "def") {
691        return Some(("function", name));
692    }
693    if let Some(name) = extract_identifier_after_keyword(normalized, "class") {
694        return Some(("class", name));
695    }
696    None
697}
698
699fn extract_script_symbol_definition(line: &str) -> Option<(&'static str, String)> {
700    let normalized = strip_prefixes(line, &["export default ", "export ", "default ", "async "]);
701
702    for (keyword, kind) in [
703        ("function", "function"),
704        ("class", "class"),
705        ("interface", "interface"),
706        ("type", "type"),
707        ("enum", "enum"),
708    ] {
709        if let Some(name) = extract_identifier_after_keyword(normalized, keyword) {
710            return Some((kind, name));
711        }
712    }
713
714    for keyword in ["const", "let", "var"] {
715        if let Some(name) = extract_identifier_after_keyword(normalized, keyword)
716            && (normalized.contains("=>") || normalized.contains("function("))
717        {
718            return Some(("variable", name));
719        }
720    }
721
722    None
723}
724
725fn extract_go_symbol_definition(line: &str) -> Option<(&'static str, String)> {
726    if let Some(name) = extract_identifier_after_keyword(line, "func") {
727        return Some(("function", name));
728    }
729    if let Some(name) = extract_identifier_after_keyword(line, "type") {
730        return Some(("type", name));
731    }
732    if let Some(name) = extract_identifier_after_keyword(line, "const") {
733        return Some(("const", name));
734    }
735    if let Some(name) = extract_identifier_after_keyword(line, "var") {
736        return Some(("variable", name));
737    }
738    None
739}
740
741fn extract_import_references(
742    language: &str,
743    line: &str,
744    go_import_block: &mut bool,
745    imported_modules: &mut Vec<String>,
746    imported_symbols: &mut Vec<String>,
747) {
748    match language {
749        "rust" => extract_rust_imports(line, imported_modules, imported_symbols),
750        "python" => extract_python_imports(line, imported_modules, imported_symbols),
751        "typescript" | "javascript" => {
752            extract_script_imports(line, imported_modules, imported_symbols);
753        }
754        "go" => extract_go_imports(line, go_import_block, imported_modules, imported_symbols),
755        _ => {}
756    }
757}
758
759fn extract_rust_imports(
760    line: &str,
761    imported_modules: &mut Vec<String>,
762    imported_symbols: &mut Vec<String>,
763) {
764    let normalized = strip_prefixes(line, &["pub "]);
765    let Some(spec) = normalized.strip_prefix("use ") else {
766        return;
767    };
768    let spec = spec.trim_end_matches(';').trim();
769    if spec.is_empty() {
770        return;
771    }
772
773    imported_modules.push(spec.to_string());
774    for segment in spec.split(&['{', '}', ','][..]) {
775        let segment = segment.trim();
776        if segment.is_empty() {
777            continue;
778        }
779
780        let alias_free = segment.split(" as ").next().unwrap_or(segment).trim();
781        let last = alias_free.rsplit("::").next().unwrap_or(alias_free).trim();
782        if last.is_empty() || matches!(last, "self" | "super" | "crate" | "*") {
783            continue;
784        }
785        if let Some(name) = sanitize_identifier(last) {
786            imported_symbols.push(name);
787        }
788    }
789}
790
791fn extract_python_imports(
792    line: &str,
793    imported_modules: &mut Vec<String>,
794    imported_symbols: &mut Vec<String>,
795) {
796    if let Some(rest) = line.strip_prefix("import ") {
797        for module in rest.split(',') {
798            let module = module.trim();
799            let module = module.split_whitespace().next().unwrap_or("");
800            if module.is_empty() {
801                continue;
802            }
803            imported_modules.push(module.to_string());
804            if let Some(name) = module.rsplit('.').next().and_then(sanitize_identifier) {
805                imported_symbols.push(name);
806            }
807        }
808        return;
809    }
810
811    let Some(rest) = line.strip_prefix("from ") else {
812        return;
813    };
814    let Some((module, names)) = rest.split_once(" import ") else {
815        return;
816    };
817    let module = module.trim();
818    if !module.is_empty() {
819        imported_modules.push(module.to_string());
820    }
821    for name in names.split(',') {
822        let name = name.trim();
823        let alias_free = name.split(" as ").next().unwrap_or(name).trim();
824        if let Some(clean) = sanitize_identifier(alias_free) {
825            imported_symbols.push(clean);
826        }
827    }
828}
829
830fn extract_script_imports(
831    line: &str,
832    imported_modules: &mut Vec<String>,
833    imported_symbols: &mut Vec<String>,
834) {
835    let trimmed = line.trim();
836    let is_module_import = trimmed.starts_with("import ")
837        || (trimmed.starts_with("export ") && trimmed.contains(" from "));
838    if !is_module_import && !trimmed.contains("require(") {
839        return;
840    }
841
842    if let Some(module) = extract_quoted_literal(trimmed) {
843        imported_modules.push(module.clone());
844        if let Some(name) = module.rsplit('/').next().and_then(sanitize_identifier) {
845            imported_symbols.push(name);
846        }
847    }
848
849    if let Some((before_from, _)) = trimmed.split_once(" from ") {
850        if let Some((default_import, _)) = before_from
851            .trim_start_matches("import ")
852            .trim_start_matches("export ")
853            .split_once(',')
854        {
855            let default_import = default_import.trim();
856            if !default_import.is_empty() && !default_import.starts_with('{') {
857                if let Some(name) = sanitize_identifier(default_import) {
858                    imported_symbols.push(name);
859                }
860            }
861        }
862    }
863
864    if let Some(braced) = extract_braced_section(trimmed) {
865        for name in braced.split(',') {
866            let name = name.trim();
867            let alias_free = name.split(" as ").next().unwrap_or(name).trim();
868            let alias_free = alias_free.trim_start_matches("type ").trim();
869            if let Some(clean) = sanitize_identifier(alias_free) {
870                imported_symbols.push(clean);
871            }
872        }
873    }
874}
875
876fn extract_go_imports(
877    line: &str,
878    go_import_block: &mut bool,
879    imported_modules: &mut Vec<String>,
880    imported_symbols: &mut Vec<String>,
881) {
882    let trimmed = line.trim();
883
884    if *go_import_block {
885        if trimmed == ")" {
886            *go_import_block = false;
887            return;
888        }
889        extract_go_import_entry(trimmed, imported_modules, imported_symbols);
890        return;
891    }
892
893    if trimmed == "import (" {
894        *go_import_block = true;
895        return;
896    }
897
898    if let Some(rest) = trimmed.strip_prefix("import ") {
899        extract_go_import_entry(rest.trim(), imported_modules, imported_symbols);
900    }
901}
902
903fn extract_go_import_entry(
904    line: &str,
905    imported_modules: &mut Vec<String>,
906    imported_symbols: &mut Vec<String>,
907) {
908    let Some(module) = extract_quoted_literal(line) else {
909        return;
910    };
911    imported_modules.push(module.clone());
912
913    let alias = line.split_whitespace().next().unwrap_or("");
914    if !alias.is_empty() && !alias.starts_with('"') && !matches!(alias, "_" | ".") {
915        if let Some(clean) = sanitize_identifier(alias) {
916            imported_symbols.push(clean);
917            return;
918        }
919    }
920
921    if let Some(name) = module.rsplit('/').next().and_then(sanitize_identifier) {
922        imported_symbols.push(name);
923    }
924}
925
926fn extract_identifier_after_keyword(line: &str, keyword: &str) -> Option<String> {
927    let prefix = format!("{keyword} ");
928    let rest = line.strip_prefix(&prefix)?;
929    sanitize_identifier(rest)
930}
931
/// Returns the longest leading run of identifier characters in `input`
/// (ASCII letters, ASCII digits, `_` and `$`), or `None` if that run is empty.
fn sanitize_identifier(input: &str) -> Option<String> {
    let ident: String = input
        .chars()
        .take_while(|ch| ch.is_ascii_alphanumeric() || matches!(*ch, '_' | '$'))
        .collect();

    (!ident.is_empty()).then_some(ident)
}
944
/// Repeatedly removes any of `prefixes` from the front of `input`.
///
/// On each pass the first prefix in `prefixes` that matches is removed and
/// the leading whitespace of the remainder is trimmed; the process stops
/// once no prefix matches.
fn strip_prefixes<'a>(mut input: &'a str, prefixes: &[&str]) -> &'a str {
    loop {
        let current = input;
        let stripped = prefixes
            .iter()
            .find_map(move |prefix| current.strip_prefix(*prefix));

        match stripped {
            Some(rest) => input = rest.trim_start(),
            None => return input,
        }
    }
}
961
/// Returns the trimmed contents of the first non-empty quoted literal in
/// `line`.
///
/// The line is scanned left to right. A literal opens at the first `"` or
/// `'` and closes at the next occurrence of the same quote character, so a
/// single-quoted path is not shadowed by a double-quoted word that appears
/// later on the line (for example in a trailing comment). Literals that are
/// empty after trimming are skipped and scanning continues after them.
/// Returns `None` when no such literal exists, including when the opening
/// quote is never closed.
fn extract_quoted_literal(line: &str) -> Option<String> {
    let mut rest = line;
    loop {
        let open = rest.find(|ch: char| ch == '"' || ch == '\'')?;
        // Both quote characters are one byte long, so `open + 1` is a valid
        // char boundary.
        let quote = if rest.as_bytes()[open] == b'"' { '"' } else { '\'' };
        let after_open = &rest[open + 1..];
        let close = after_open.find(quote)?;

        let value = after_open[..close].trim();
        if !value.is_empty() {
            return Some(value.to_string());
        }

        // Empty literal: keep looking after its closing quote.
        rest = &after_open[close + 1..];
    }
}
974
/// Returns the text between the first `{` in `line` and the first `}` that
/// follows it, or `None` if either brace is missing.
fn extract_braced_section(line: &str) -> Option<String> {
    let (_, after_open) = line.split_once('{')?;
    let (inside, _) = after_open.split_once('}')?;
    Some(inside.to_owned())
}
980
/// Builds the knowledge-graph node id for a file: `file:` followed by `path`.
fn file_node_id(path: &str) -> String {
    ["file:", path].concat()
}
984
/// Builds the node id for a symbol defined in a file, in the form
/// `symbol:<path>:<line>:<name>`.
fn symbol_node_id(path: &str, name: &str, line: u32) -> String {
    let line = line.to_string();
    ["symbol:", path, ":", line.as_str(), ":", name].concat()
}
988
/// Builds the node id for an imported module: `module:` followed by `module`.
fn module_node_id(module: &str) -> String {
    ["module:", module].concat()
}
992
/// Builds the node id for a referenced symbol: `symbol-ref:` followed by
/// `symbol`.
fn external_symbol_node_id(symbol: &str) -> String {
    ["symbol-ref:", symbol].concat()
}
996
/// Embedding backend chosen for an index build.
enum EmbeddingBackend {
    /// In-process hash-based embeddings.
    Local {
        /// Engine that produces the vectors.
        engine: LocalEmbeddingEngine,
        /// Model name reported for this backend.
        model: String,
    },
    /// Embeddings requested from a provider through `RemoteEmbeddingEngine`.
    Remote(RemoteEmbeddingEngine),
}
1004
1005impl EmbeddingBackend {
1006    fn provider_name(&self) -> &str {
1007        match self {
1008            Self::Local { .. } => LOCAL_EMBEDDING_PROVIDER,
1009            Self::Remote(engine) => &engine.provider_name,
1010        }
1011    }
1012
1013    fn model_name(&self) -> &str {
1014        match self {
1015            Self::Local { model, .. } => model,
1016            Self::Remote(engine) => &engine.model,
1017        }
1018    }
1019}
1020
/// Embedding engine that sends batches to a provider and retries failures
/// judged retryable.
#[derive(Clone)]
struct RemoteEmbeddingEngine {
    /// Provider whose `embed` method serves the requests.
    provider: Arc<dyn crate::provider::Provider>,
    /// Provider name used in error messages and log fields.
    provider_name: String,
    /// Model identifier placed in every embedding request.
    model: String,
    /// Maximum number of retries after the first attempt.
    max_retries: u32,
    /// Delay before the first retry; later retries back off from this value.
    retry_initial: Duration,
    /// Upper bound passed to `retry_delay` for the backoff.
    retry_max: Duration,
}
1030
1031impl RemoteEmbeddingEngine {
1032    async fn embed_batch(&self, inputs: &[String]) -> Result<crate::provider::EmbeddingResponse> {
1033        if inputs.is_empty() {
1034            return Ok(crate::provider::EmbeddingResponse {
1035                embeddings: Vec::new(),
1036                usage: crate::provider::Usage::default(),
1037            });
1038        }
1039
1040        let mut attempt = 0u32;
1041        loop {
1042            let request = crate::provider::EmbeddingRequest {
1043                model: self.model.clone(),
1044                inputs: inputs.to_vec(),
1045            };
1046
1047            match self.provider.embed(request).await {
1048                Ok(response) => return Ok(response),
1049                Err(err) => {
1050                    let should_retry =
1051                        attempt < self.max_retries && is_retryable_embedding_error(&err);
1052                    if !should_retry {
1053                        return Err(anyhow::anyhow!(
1054                            "remote embedding failed via {}/{} after {} attempt(s): {}",
1055                            self.provider_name,
1056                            self.model,
1057                            attempt + 1,
1058                            err
1059                        ));
1060                    }
1061
1062                    let delay = retry_delay(attempt, self.retry_initial, self.retry_max);
1063                    tracing::warn!(
1064                        provider = %self.provider_name,
1065                        model = %self.model,
1066                        attempt = attempt + 1,
1067                        retry_in_ms = delay.as_millis(),
1068                        error = %err,
1069                        "Embedding batch failed, retrying"
1070                    );
1071
1072                    tokio::time::sleep(delay).await;
1073                    attempt += 1;
1074                }
1075            }
1076        }
1077    }
1078}
1079
1080async fn resolve_embedding_backend(options: &BuildOptions) -> Result<EmbeddingBackend> {
1081    let dimensions = options.embedding_dimensions.max(64);
1082    if is_local_embedding_provider(&options.embedding_provider) {
1083        return Ok(EmbeddingBackend::Local {
1084            engine: LocalEmbeddingEngine::new(dimensions),
1085            model: options.embedding_model.clone(),
1086        });
1087    }
1088
1089    let model_selector =
1090        build_model_selector(&options.embedding_provider, &options.embedding_model)?;
1091    let registry = crate::provider::ProviderRegistry::from_vault().await?;
1092    let (provider, model) = registry
1093        .resolve_model(&model_selector)
1094        .with_context(|| format!("failed resolving embedding model '{model_selector}'"))?;
1095
1096    let retry_initial = Duration::from_millis(options.embedding_retry_initial_ms.max(1));
1097    let retry_max = Duration::from_millis(options.embedding_retry_max_ms.max(1));
1098
1099    Ok(EmbeddingBackend::Remote(RemoteEmbeddingEngine {
1100        provider_name: provider.name().to_string(),
1101        provider,
1102        model,
1103        max_retries: options.embedding_max_retries,
1104        retry_initial,
1105        retry_max,
1106    }))
1107}
1108
/// Reports whether `value` names the built-in local embedding provider.
///
/// Surrounding whitespace is ignored and the comparison is ASCII
/// case-insensitive.
fn is_local_embedding_provider(value: &str) -> bool {
    const LOCAL_ALIASES: [&str; 4] = ["local", "hash", "hash-embedding", "local/hash-embedding"];

    let candidate = value.trim();
    LOCAL_ALIASES
        .iter()
        .any(|alias| candidate.eq_ignore_ascii_case(alias))
}
1115
1116fn build_model_selector(provider: &str, model: &str) -> Result<String> {
1117    let provider = provider.trim();
1118    let model = model.trim();
1119
1120    if model.is_empty() {
1121        anyhow::bail!("embedding model cannot be empty");
1122    }
1123
1124    if model.contains('/') {
1125        return Ok(model.to_string());
1126    }
1127
1128    if provider.is_empty() {
1129        anyhow::bail!(
1130            "embedding provider cannot be empty when model does not include provider prefix"
1131        );
1132    }
1133
1134    Ok(format!("{provider}/{model}"))
1135}
1136
/// Computes the exponential backoff delay for a retry.
///
/// The delay is `initial * 2^attempt`, capped at `max`. If `max` is smaller
/// than `initial`, `initial` is used as the cap. All arithmetic saturates,
/// and a result that does not fit in `u64` milliseconds is clamped to
/// `u64::MAX` milliseconds instead of being truncated.
fn retry_delay(attempt: u32, initial: Duration, max: Duration) -> Duration {
    let initial_ms = initial.as_millis();
    let ceiling_ms = max.as_millis().max(initial_ms);
    let multiplier = 2u128.saturating_pow(attempt);
    let delay_ms = initial_ms.saturating_mul(multiplier).min(ceiling_ms);

    // `as_millis` can exceed `u64::MAX` for very long durations; clamp before
    // narrowing so the cast cannot wrap to a much shorter delay.
    let clamped_ms = delay_ms.min(u128::from(u64::MAX)) as u64;
    Duration::from_millis(clamped_ms)
}
1144
1145fn is_retryable_embedding_error(error: &anyhow::Error) -> bool {
1146    let msg = error.to_string().to_ascii_lowercase();
1147    [
1148        "timeout",
1149        "timed out",
1150        "connection reset",
1151        "connection refused",
1152        "temporary",
1153        "temporarily unavailable",
1154        "rate limit",
1155        "too many requests",
1156        " 429",
1157        " 500",
1158        " 502",
1159        " 503",
1160        " 504",
1161    ]
1162    .iter()
1163    .any(|needle| msg.contains(needle))
1164}
1165
/// Estimates the token count of `text` as its number of
/// whitespace-separated words, never returning less than 1.
fn approximate_token_count(text: &str) -> u64 {
    match text.split_whitespace().count() {
        0 => 1,
        words => words as u64,
    }
}
1170
1171fn build_embedding_input(path: &str, language: &str, content: &str, max_chars: usize) -> String {
1172    let snippet = safe_char_prefix(content, max_chars);
1173    format!("path:{path}\nlanguage:{language}\n\n{snippet}")
1174}
1175
/// Returns the first `max_chars` characters of `input` (the whole string when
/// it is shorter), cutting only on character boundaries.
fn safe_char_prefix(input: &str, max_chars: usize) -> String {
    // The byte offset of the character at position `max_chars`, if any, is
    // exactly where the prefix ends.
    match input.char_indices().nth(max_chars) {
        Some((cut, _)) => input[..cut].to_owned(),
        None => input.to_owned(),
    }
}
1179
/// In-process embedding engine that hashes tokens into a fixed-size vector.
#[derive(Debug, Clone)]
struct LocalEmbeddingEngine {
    /// Length of every vector this engine produces.
    dimensions: usize,
}
1184
1185impl LocalEmbeddingEngine {
1186    fn new(dimensions: usize) -> Self {
1187        Self { dimensions }
1188    }
1189
1190    fn embed_batch(&self, inputs: &[String]) -> Vec<Vec<f32>> {
1191        inputs
1192            .iter()
1193            .map(|input| self.embed_single(input))
1194            .collect()
1195    }
1196
1197    fn embed_single(&self, input: &str) -> Vec<f32> {
1198        let mut vector = vec![0.0f32; self.dimensions];
1199        let tokens = tokenize_for_embedding(input);
1200
1201        if tokens.is_empty() {
1202            self.accumulate_char_ngrams(&mut vector, input);
1203        } else {
1204            for (idx, token) in tokens.iter().enumerate() {
1205                let positional_weight = 1.0f32 / (1.0 + (idx as f32 / 128.0));
1206                self.accumulate_token(&mut vector, token, positional_weight);
1207
1208                if let Some(next) = tokens.get(idx + 1) {
1209                    let bigram = format!("{token} {next}");
1210                    self.accumulate_token(&mut vector, &bigram, positional_weight * 0.65);
1211                }
1212            }
1213        }
1214
1215        l2_normalize(&mut vector);
1216        vector
1217    }
1218
1219    fn accumulate_char_ngrams(&self, vector: &mut [f32], input: &str) {
1220        for ngram in input.as_bytes().windows(3).take(2048) {
1221            let key = String::from_utf8_lossy(ngram);
1222            self.accumulate_token(vector, &key, 0.5);
1223        }
1224    }
1225
1226    fn accumulate_token(&self, vector: &mut [f32], token: &str, weight: f32) {
1227        if token.is_empty() {
1228            return;
1229        }
1230
1231        let digest = Sha256::digest(token.as_bytes());
1232        let len = vector.len();
1233
1234        let idx_a = (u16::from_le_bytes([digest[0], digest[1]]) as usize) % len;
1235        let idx_b = (u16::from_le_bytes([digest[2], digest[3]]) as usize) % len;
1236        let idx_c = (u16::from_le_bytes([digest[4], digest[5]]) as usize) % len;
1237
1238        let sign_a = if digest[6] & 1 == 0 { 1.0 } else { -1.0 };
1239        let sign_b = if digest[7] & 1 == 0 { 1.0 } else { -1.0 };
1240        let sign_c = if digest[8] & 1 == 0 { 1.0 } else { -1.0 };
1241
1242        vector[idx_a] += sign_a * weight;
1243        vector[idx_b] += sign_b * (weight * 0.7);
1244        vector[idx_c] += sign_c * (weight * 0.4);
1245    }
1246}
1247
/// Splits `input` into lowercase tokens for hashing.
///
/// A token is a maximal run of ASCII letters, ASCII digits and `_`; every
/// other character acts as a separator. At most the first 4096 tokens are
/// returned.
fn tokenize_for_embedding(input: &str) -> Vec<String> {
    input
        .split(|ch: char| !(ch.is_ascii_alphanumeric() || ch == '_'))
        .filter(|piece| !piece.is_empty())
        .take(4096)
        .map(str::to_ascii_lowercase)
        .collect()
}
1269
/// Scales `values` in place to unit Euclidean length.
///
/// The slice is left untouched when its norm is not greater than zero (for
/// example when it is empty or all zeros).
fn l2_normalize(values: &mut [f32]) {
    let mut sum_of_squares = 0.0f32;
    for component in values.iter() {
        sum_of_squares += component * component;
    }

    let magnitude = sum_of_squares.sqrt();
    if magnitude > 0.0 {
        values
            .iter_mut()
            .for_each(|component| *component /= magnitude);
    }
}
1278
1279fn default_index_path(root: &Path) -> PathBuf {
1280    let mut hasher = Sha256::new();
1281    hasher.update(root.to_string_lossy().as_bytes());
1282    let digest = hasher.finalize();
1283    let short = hex::encode(digest);
1284    let short = &short[..16];
1285
1286    let base = crate::config::Config::data_dir().unwrap_or_else(|| root.join(".codetether-agent"));
1287    base.join("indexes")
1288        .join(format!("codebase-index-{short}.json"))
1289}
1290
1291fn default_knowledge_graph_path(root: &Path) -> PathBuf {
1292    let mut hasher = Sha256::new();
1293    hasher.update(root.to_string_lossy().as_bytes());
1294    let digest = hasher.finalize();
1295    let short = hex::encode(digest);
1296    let short = &short[..16];
1297
1298    let base = crate::config::Config::data_dir().unwrap_or_else(|| root.join(".codetether-agent"));
1299    base.join("indexes")
1300        .join(format!("workspace-knowledge-{short}.json"))
1301}
1302
1303fn should_descend(entry: &DirEntry, root: &Path, include_hidden: bool) -> bool {
1304    let path = entry.path();
1305    let rel_path = path.strip_prefix(root).unwrap_or(path);
1306
1307    if !include_hidden && is_hidden_path(rel_path) {
1308        return false;
1309    }
1310
1311    !crate::workspace_scan::path_has_pruned_component(rel_path)
1312}
1313
/// Reports whether `path` is hidden, as decided by
/// `crate::workspace_scan::path_is_hidden`.
fn is_hidden_path(path: &Path) -> bool {
    crate::workspace_scan::path_is_hidden(path)
}
1317
/// Guesses from the file name alone whether `path` is a text file.
///
/// A file qualifies when its extension, compared case-insensitively, is in
/// the known text-extension list, or when its file name is exactly one of
/// the well-known extension-less names (`Dockerfile`, `Makefile`,
/// `Jenkinsfile`, `README`). The extension list covers every extension that
/// `detect_language` maps to a language, including `cc` and `cxx`.
fn is_probably_text_file(path: &Path) -> bool {
    const TEXT_EXTENSIONS: &[&str] = &[
        "rs", "ts", "js", "tsx", "jsx", "py", "go", "java", "kt", "c", "cpp", "cc", "cxx", "h",
        "hpp", "md", "txt", "json", "yaml", "yml", "toml", "sh", "bash", "zsh", "html", "css",
        "scss", "sql", "proto", "xml", "ini", "env", "lock",
    ];
    const TEXT_FILE_NAMES: &[&str] = &["Dockerfile", "Makefile", "Jenkinsfile", "README"];

    // Lowercase first so `Main.RS` is treated the same way `detect_language`
    // treats it.
    let has_text_extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(|ext| ext.to_ascii_lowercase())
        .map_or(false, |ext| TEXT_EXTENSIONS.contains(&ext.as_str()));
    if has_text_extension {
        return true;
    }

    path.file_name()
        .and_then(|name| name.to_str())
        .map_or(false, |name| TEXT_FILE_NAMES.contains(&name))
}
1337
/// Maps `path` to a language label.
///
/// The lowercased extension is looked up first. When it is not recognised,
/// a few exact file names (`Dockerfile`, `Makefile`, `Jenkinsfile`) are
/// tried. Everything else is labelled `text`.
fn detect_language(path: &Path) -> String {
    const BY_EXTENSION: &[(&[&str], &str)] = &[
        (&["rs"], "rust"),
        (&["ts", "tsx"], "typescript"),
        (&["js", "jsx"], "javascript"),
        (&["py"], "python"),
        (&["go"], "go"),
        (&["java"], "java"),
        (&["kt"], "kotlin"),
        (&["c", "h"], "c"),
        (&["cpp", "hpp", "cc", "cxx"], "cpp"),
        (&["json"], "json"),
        (&["yaml", "yml"], "yaml"),
        (&["toml"], "toml"),
        (&["md"], "markdown"),
        (&["sh", "bash", "zsh"], "shell"),
        (&["proto"], "proto"),
        (&["sql"], "sql"),
        (&["html"], "html"),
        (&["css", "scss"], "css"),
    ];
    const BY_FILE_NAME: &[(&str, &str)] = &[
        ("Dockerfile", "dockerfile"),
        ("Makefile", "makefile"),
        ("Jenkinsfile", "groovy"),
    ];

    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or_default()
        .to_ascii_lowercase();

    let from_extension = BY_EXTENSION
        .iter()
        .find(|(extensions, _)| extensions.contains(&ext.as_str()))
        .map(|(_, language)| *language);

    let language = from_extension
        .or_else(|| {
            let name = path.file_name()?.to_str()?;
            BY_FILE_NAME
                .iter()
                .find(|(known, _)| *known == name)
                .map(|(_, language)| *language)
        })
        .unwrap_or("text");

    language.to_string()
}
1379
1380fn estimate_symbol_hints(path: &Path, content: &str) -> u32 {
1381    let ext = path
1382        .extension()
1383        .and_then(|e| e.to_str())
1384        .unwrap_or_default()
1385        .to_ascii_lowercase();
1386
1387    let mut count = 0u32;
1388    for line in content.lines().map(str::trim_start) {
1389        let hit = match ext.as_str() {
1390            "rs" => estimate_rust_symbol_hint(line),
1391            "py" => line.starts_with("def ") || line.starts_with("class "),
1392            "ts" | "tsx" | "js" | "jsx" => {
1393                line.starts_with("function ")
1394                    || line.contains("=>")
1395                    || line.starts_with("class ")
1396                    || line.starts_with("export function ")
1397            }
1398            "go" => line.starts_with("func ") || line.starts_with("type "),
1399            "java" | "kt" => {
1400                line.contains(" class ")
1401                    || line.starts_with("class ")
1402                    || line.starts_with("interface ")
1403                    || line.contains(" fun ")
1404            }
1405            _ => false,
1406        };
1407
1408        if hit {
1409            count = count.saturating_add(1);
1410        }
1411    }
1412
1413    count
1414}
1415
1416fn estimate_rust_symbol_hint(line: &str) -> bool {
1417    let normalized = strip_prefixes(
1418        line,
1419        &[
1420            "pub(crate) ",
1421            "pub(super) ",
1422            "pub(self) ",
1423            "pub ",
1424            "async ",
1425            "unsafe ",
1426        ],
1427    );
1428
1429    normalized.starts_with("fn ")
1430        || normalized.starts_with("struct ")
1431        || normalized.starts_with("enum ")
1432        || normalized.starts_with("trait ")
1433        || normalized.starts_with("impl ")
1434        || normalized.starts_with("mod ")
1435        || normalized.starts_with("type ")
1436        || normalized.starts_with("const ")
1437        || normalized.starts_with("static ")
1438}
1439
// Unit tests for the indexer helpers plus two end-to-end checks that build
// an index over a temporary directory.
#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::anyhow;
    use tempfile::tempdir;

    // Hidden components are detected at any depth of the path.
    #[test]
    fn detects_hidden_paths() {
        assert!(is_hidden_path(Path::new(".git/config")));
        assert!(is_hidden_path(Path::new("src/.cache/file")));
        assert!(!is_hidden_path(Path::new("src/main.rs")));
    }

    // Language labels come from the extension, or from the file name when
    // there is no extension.
    #[test]
    fn language_detection_works() {
        assert_eq!(detect_language(Path::new("src/main.rs")), "rust");
        assert_eq!(detect_language(Path::new("app.py")), "python");
        assert_eq!(detect_language(Path::new("Dockerfile")), "dockerfile");
    }

    // One hit each for the struct, the impl and the fn line.
    #[test]
    fn symbol_hint_estimation_works_for_rust() {
        let content = "pub struct A;\nimpl A {}\nfn run() {}\n";
        assert_eq!(estimate_symbol_hints(Path::new("src/lib.rs"), content), 3);
    }

    // The local engine returns one vector per input, of the configured length.
    #[test]
    fn local_embeddings_have_expected_dimensions() {
        let engine = LocalEmbeddingEngine::new(384);
        let vectors = engine.embed_batch(&["fn main() { println!(\"hi\") }".to_string()]);
        assert_eq!(vectors.len(), 1);
        assert_eq!(vectors[0].len(), 384);
    }

    // Truncating multi-byte content must not split a character.
    #[test]
    fn embedding_input_prefix_is_char_safe() {
        let input = "✓✓✓hello";
        let prefixed = build_embedding_input("src/main.rs", "rust", input, 2);
        assert!(prefixed.contains("✓✓"));
    }

    // Aliases are matched case-insensitively; other providers are not local.
    #[test]
    fn local_embedding_provider_aliases_are_supported() {
        assert!(is_local_embedding_provider("local"));
        assert!(is_local_embedding_provider("local/hash-embedding"));
        assert!(is_local_embedding_provider("HASH"));
        assert!(!is_local_embedding_provider("huggingface"));
    }

    // A model that already contains `/` is used as-is; otherwise the
    // provider is prepended.
    #[test]
    fn model_selector_uses_explicit_prefix_when_missing() {
        let selector = build_model_selector("huggingface", "BAAI/bge-small-en-v1.5")
            .expect("model selector should build");
        assert_eq!(selector, "BAAI/bge-small-en-v1.5");

        let selector = build_model_selector("huggingface", "text-embedding-3-large")
            .expect("model selector should build");
        assert_eq!(selector, "huggingface/text-embedding-3-large");
    }

    // Rate-limit and timeout messages are retryable; a configuration error
    // is not.
    #[test]
    fn retryable_embedding_error_detection_matches_transient_signals() {
        assert!(is_retryable_embedding_error(&anyhow!(
            "HTTP 429 too many requests"
        )));
        assert!(is_retryable_embedding_error(&anyhow!("gateway timeout")));
        assert!(!is_retryable_embedding_error(&anyhow!(
            "invalid embedding model"
        )));
    }

    // Builds an index over two small Rust files with embeddings disabled and
    // checks that the graph has a `Session` symbol node and an
    // `imports_symbol` edge pointing at it.
    #[tokio::test]
    async fn build_index_emits_workspace_knowledge_graph() {
        let temp = tempdir().expect("tempdir");
        std::fs::write(temp.path().join("types.rs"), "pub struct Session;\n").expect("write");
        std::fs::write(
            temp.path().join("main.rs"),
            "use crate::types::Session;\nfn run() {}\n",
        )
        .expect("write");

        let index = build_index(
            temp.path(),
            &BuildOptions {
                include_embeddings: false,
                ..BuildOptions::default()
            },
        )
        .await
        .expect("index should build");

        assert_eq!(index.embedding_provider, DISABLED_EMBEDDING_PROVIDER);
        assert!(
            index
                .knowledge_graph
                .nodes
                .iter()
                .any(|node| node.kind == "symbol" && node.label == "Session")
        );
        assert!(
            index
                .knowledge_graph
                .edges
                .iter()
                .any(|edge| edge.kind == "imports_symbol" && edge.target.contains("Session"))
        );
    }

    // Writes a knowledge snapshot for a temporary workspace and checks that
    // the output is a `.json` file containing the expected keys.
    #[tokio::test]
    async fn refresh_workspace_knowledge_snapshot_writes_json() {
        let temp = tempdir().expect("tempdir");
        let data_dir = temp.path().join("data");
        std::fs::write(temp.path().join("lib.rs"), "pub fn run() {}\n").expect("write");

        // NOTE(review): this mutates process-wide environment state, which is
        // only sound if no other thread touches the environment at the same
        // time — confirm this holds when tests run in parallel.
        // Presumably redirects the data directory into the temp dir — verify
        // against `Config::data_dir`.
        unsafe {
            std::env::set_var("CODETETHER_DATA_DIR", data_dir.display().to_string());
        }

        let output_path = refresh_workspace_knowledge_snapshot(temp.path())
            .await
            .expect("snapshot should write");
        let payload = std::fs::read_to_string(&output_path).expect("snapshot payload");

        // NOTE(review): same process-wide caveat as above; the variable is
        // also left set if either `expect` above panics.
        unsafe {
            std::env::remove_var("CODETETHER_DATA_DIR");
        }

        assert_eq!(
            output_path.extension().and_then(|ext| ext.to_str()),
            Some("json")
        );
        assert!(payload.contains("\"knowledge_graph\""));
        assert!(payload.contains("\"symbol\""));
    }
}