infiniloom_engine/embedding/types.rs
1//! Core types for embedding chunk generation
2//!
3//! This module defines the data structures used throughout the embedding system,
4//! including chunks, settings, and metadata types.
5
6use serde::{Deserialize, Serialize};
7
8use super::error::EmbedError;
9
10/// Repository identifier for multi-tenant RAG systems
11///
12/// This enables embedding multiple codebases into a single vector database
13/// while maintaining clear isolation and traceability. Essential for:
14/// - Multi-repository search with proper attribution
15/// - Access control based on repository ownership
16/// - Cross-repository dependency tracking
17/// - Audit trails for compliance (SOC2, GDPR)
18///
19/// # Example
20///
21/// ```
22/// use infiniloom_engine::embedding::RepoIdentifier;
23///
24/// let repo = RepoIdentifier {
25/// namespace: Some("github.com/myorg".to_string()),
26/// name: "auth-service".to_string(),
27/// version: Some("v2.1.0".to_string()),
28/// branch: Some("main".to_string()),
29/// commit: Some("abc123def".to_string()),
30/// };
31/// ```
32#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
33pub struct RepoIdentifier {
34 /// Namespace/organization (e.g., "github.com/myorg", "gitlab.com/team")
35 /// Used for grouping and access control
36 pub namespace: Option<String>,
37
38 /// Repository name (e.g., "auth-service", "frontend")
39 pub name: String,
40
41 /// Semantic version or tag (e.g., "v2.1.0", "release-2024.01")
42 #[serde(skip_serializing_if = "Option::is_none")]
43 pub version: Option<String>,
44
45 /// Branch name (e.g., "main", "feature/new-auth")
46 #[serde(skip_serializing_if = "Option::is_none")]
47 pub branch: Option<String>,
48
49 /// Git commit hash (short or full)
50 #[serde(skip_serializing_if = "Option::is_none")]
51 pub commit: Option<String>,
52}
53
54impl RepoIdentifier {
55 /// Create a new repository identifier
56 pub fn new(namespace: impl Into<String>, name: impl Into<String>) -> Self {
57 let ns: String = namespace.into();
58 Self {
59 namespace: if ns.is_empty() { None } else { Some(ns) },
60 name: name.into(),
61 version: None,
62 branch: None,
63 commit: None,
64 }
65 }
66
67 /// Create with full details including version and commit
68 pub fn full(
69 namespace: Option<String>,
70 name: impl Into<String>,
71 version: Option<String>,
72 branch: Option<String>,
73 commit: Option<String>,
74 ) -> Self {
75 Self { namespace, name: name.into(), version, branch, commit }
76 }
77
78 /// Get fully qualified repository name (namespace/name)
79 pub fn qualified_name(&self) -> String {
80 match &self.namespace {
81 Some(ns) if !ns.is_empty() => format!("{}/{}", ns, self.name),
82 _ => self.name.clone(),
83 }
84 }
85
86 /// Check if this identifier represents the same repository (ignores version/commit)
87 pub fn same_repo(&self, other: &Self) -> bool {
88 self.namespace == other.namespace && self.name == other.name
89 }
90}
91
92/// A single embedding chunk with stable, content-addressable ID
93///
94/// Each chunk represents a semantic unit of code (function, class, etc.) with
95/// a deterministic ID derived from its normalized content. This enables:
96/// - Cross-repository deduplication (same code = same ID)
97/// - Incremental updates (compare IDs to detect changes)
98/// - Stable references for vector databases
99#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
100pub struct EmbedChunk {
101 /// Content-addressable ID: BLAKE3 hash of normalized content
102 /// Format: "ec_" + 32 hex chars (128 bits) - collision-resistant for enterprise scale
103 pub id: String,
104
105 /// Full 256-bit hash for collision verification
106 #[serde(default)]
107 pub full_hash: String,
108
109 /// The actual code content (normalized)
110 pub content: String,
111
112 /// Token count for the target model
113 pub tokens: u32,
114
115 /// Symbol kind
116 pub kind: ChunkKind,
117
118 /// Source location metadata
119 pub source: ChunkSource,
120
121 /// Enriched context for better retrieval
122 pub context: ChunkContext,
123
124 /// IDs of child chunks (methods inside a class, etc.)
125 /// Sorted for determinism. Enables hierarchical navigation in RAG systems.
126 #[serde(default, skip_serializing_if = "Vec::is_empty")]
127 pub children_ids: Vec<String>,
128
129 /// Representation type: "code" (default) or "signature"
130 ///
131 /// Code chunks contain the full implementation. Signature chunks contain only
132 /// the declaration/signature, enabling tiered retrieval: search signatures
133 /// broadly, then fetch full code for top matches.
134 #[serde(default = "default_repr")]
135 pub repr: String,
136
137 /// For non-code representations, the ID of the full code chunk
138 ///
139 /// This links a signature chunk back to its corresponding code chunk,
140 /// enabling two-phase retrieval workflows.
141 #[serde(default, skip_serializing_if = "Option::is_none")]
142 pub code_chunk_id: Option<String>,
143
144 /// For split chunks: part N of M
145 #[serde(default, skip_serializing_if = "Option::is_none")]
146 pub part: Option<ChunkPart>,
147}
148
149/// Default representation type for chunks
150pub(super) fn default_repr() -> String {
151 "code".to_owned()
152}
153
154/// Source location metadata for a chunk
155///
156/// This metadata helps identify where the chunk originated, but importantly
157/// does NOT affect the chunk ID (which is based solely on content).
158#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
159pub struct ChunkSource {
160 /// Repository identifier for multi-tenant RAG
161 /// Essential for distinguishing chunks from different codebases
162 #[serde(default, skip_serializing_if = "is_default_repo")]
163 pub repo: RepoIdentifier,
164
165 /// Relative file path (from repo root, never absolute)
166 pub file: String,
167
168 /// Line range (1-indexed, inclusive)
169 pub lines: (u32, u32),
170
171 /// Symbol name
172 pub symbol: String,
173
174 /// Fully qualified name
175 #[serde(default, skip_serializing_if = "Option::is_none")]
176 pub fqn: Option<String>,
177
178 /// Programming language
179 pub language: String,
180
181 /// Parent symbol (for methods inside classes)
182 #[serde(default, skip_serializing_if = "Option::is_none")]
183 pub parent: Option<String>,
184
185 /// Visibility modifier
186 pub visibility: Visibility,
187
188 /// Whether this is test code
189 #[serde(default)]
190 pub is_test: bool,
191
192 /// Module path derived from file path and language conventions
193 /// e.g., "auth::jwt" for src/auth/jwt.rs in Rust
194 #[serde(default, skip_serializing_if = "Option::is_none")]
195 pub module_path: Option<String>,
196
197 /// Chunk ID of the parent container (class/struct/enum/trait/interface)
198 /// Enables hierarchical navigation in RAG systems
199 #[serde(default, skip_serializing_if = "Option::is_none")]
200 pub parent_chunk_id: Option<String>,
201}
202
203/// Helper for skip_serializing_if - skip if repo is default (empty)
204fn is_default_repo(repo: &RepoIdentifier) -> bool {
205 repo.namespace.is_none() && repo.name.is_empty()
206}
207
208/// Git metadata for a chunk's source file
209///
210/// Enriches chunks with version control history for temporal-aware retrieval.
211/// All fields are optional to gracefully handle non-git repos, shallow clones,
212/// and untracked files.
213#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
214pub struct GitMetadata {
215 /// ISO 8601 date of last modification
216 #[serde(default, skip_serializing_if = "Option::is_none")]
217 pub last_modified: Option<String>,
218
219 /// Number of commits touching this file in the lookback period (90 days)
220 #[serde(default, skip_serializing_if = "Option::is_none")]
221 pub change_frequency: Option<u32>,
222
223 /// Total commits ever touching this file
224 #[serde(default, skip_serializing_if = "Option::is_none")]
225 pub total_commits: Option<u32>,
226
227 /// Unique authors (sorted, deduplicated for determinism)
228 #[serde(default, skip_serializing_if = "Vec::is_empty")]
229 pub authors: Vec<String>,
230
231 /// Age in days since first commit touching this file
232 #[serde(default, skip_serializing_if = "Option::is_none")]
233 pub age_days: Option<u32>,
234}
235
236/// Context information extracted from the chunk for better retrieval
237///
238/// This metadata improves RAG recall by providing natural language descriptions,
239/// signatures for type matching, and relationship information.
240#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
241pub struct ChunkContext {
242 /// Extracted docstring (for natural language retrieval)
243 #[serde(skip_serializing_if = "Option::is_none")]
244 pub docstring: Option<String>,
245
246 /// Extracted comments within the chunk
247 #[serde(skip_serializing_if = "Vec::is_empty", default)]
248 pub comments: Vec<String>,
249
250 /// Function/class signature (always included, even in split parts)
251 #[serde(skip_serializing_if = "Option::is_none")]
252 pub signature: Option<String>,
253
254 /// Symbols this chunk calls
255 #[serde(skip_serializing_if = "Vec::is_empty", default)]
256 pub calls: Vec<String>,
257
258 /// Symbols that call this chunk
259 #[serde(skip_serializing_if = "Vec::is_empty", default)]
260 pub called_by: Vec<String>,
261
262 /// Import dependencies
263 #[serde(skip_serializing_if = "Vec::is_empty", default)]
264 pub imports: Vec<String>,
265
266 /// Auto-generated semantic tags
267 #[serde(skip_serializing_if = "Vec::is_empty", default)]
268 pub tags: Vec<String>,
269
270 // === RAG Retrieval Enhancements ===
271 /// Top keywords extracted from chunk content for BM25/sparse retrieval.
272 /// Identifiers are split on non-alphanumeric boundaries, filtered for stopwords,
273 /// and ranked by frequency (top 10).
274 #[serde(skip_serializing_if = "Vec::is_empty", default)]
275 pub keywords: Vec<String>,
276
277 /// Brief description of where the chunk fits in the codebase.
278 /// Generated from file path + parent context, e.g. "From src/auth.rs, in class AuthService:"
279 #[serde(skip_serializing_if = "Option::is_none", default)]
280 pub context_prefix: Option<String>,
281
282 /// Natural language summary for improved semantic search.
283 /// Generated from docstring (first line) or heuristic template.
284 #[serde(default, skip_serializing_if = "Option::is_none")]
285 pub summary: Option<String>,
286 /// Fully qualified calls resolved via import scope.
287 /// Each entry is a qualified name like "crate::auth::jwt::verify_token" or "auth.jwt::verify".
288 /// Populated by the import resolver for Rust, TypeScript, and Python files.
289 #[serde(default, skip_serializing_if = "Vec::is_empty")]
290 pub qualified_calls: Vec<String>,
291
292 /// Calls that could not be resolved via imports or same-file symbols.
293 /// These are raw call names that had no matching import or local definition.
294 #[serde(default, skip_serializing_if = "Vec::is_empty")]
295 pub unresolved_calls: Vec<String>,
296 /// Space-separated string of all unique identifiers extracted from the chunk,
297 /// optimized for BM25/sparse text indexing. Includes both original identifiers
298 /// and their camelCase/snake_case split parts, all lowercased.
299 /// Language keywords and single-character identifiers are filtered out.
300 #[serde(default, skip_serializing_if = "Option::is_none")]
301 pub identifiers: Option<String>,
302
303 /// Full type signature: "fn verify_token(token: &str) -> Result<Claims, AuthError>"
304 #[serde(default, skip_serializing_if = "Option::is_none")]
305 pub type_signature: Option<String>,
306
307 /// Individual parameter types: ["i32", "&str"]
308 #[serde(default, skip_serializing_if = "Vec::is_empty")]
309 pub parameter_types: Vec<String>,
310
311 /// Return type: "Result<Claims, AuthError>"
312 #[serde(default, skip_serializing_if = "Option::is_none")]
313 pub return_type: Option<String>,
314
315 /// Error/exception types: ["AuthError"]
316 #[serde(default, skip_serializing_if = "Vec::is_empty")]
317 pub error_types: Vec<String>,
318 // === Complexity Metrics ===
319 // These enable filtering by code complexity in RAG applications
320 /// Lines of code in this chunk (excluding blank lines and comments)
321 /// Useful for filtering out trivial one-liners vs substantial implementations
322 #[serde(skip_serializing_if = "is_zero", default)]
323 pub lines_of_code: u32,
324
325 /// Maximum nesting depth (control flow, blocks)
326 /// Higher values indicate more complex logic; useful for prioritizing review
327 #[serde(skip_serializing_if = "is_zero", default)]
328 pub max_nesting_depth: u32,
329 // === Git Metadata ===
330 /// Git version control metadata (change frequency, authors, last modified)
331 /// Only populated when `EmbedSettings::git_metadata` is enabled
332 #[serde(default, skip_serializing_if = "Option::is_none")]
333 pub git: Option<GitMetadata>,
334 /// Cyclomatic complexity score (1 + number of branch points)
335 /// Computed via Tree-sitter AST analysis of the chunk's source code.
336 /// A score of 1 means linear code with no branching.
337 /// Higher scores indicate more complex control flow; useful for filtering
338 /// and prioritizing code review in RAG applications.
339 #[serde(default, skip_serializing_if = "Option::is_none")]
340 pub complexity_score: Option<u32>,
341
342 /// Number of symbols that call/depend on this chunk
343 /// Derived from the bidirectional call graph (called_by.len())
344 #[serde(default, skip_serializing_if = "Option::is_none")]
345 pub dependents_count: Option<u32>,
346}
347
/// Predicate for serde's `skip_serializing_if`: true when the value is zero.
fn is_zero(n: &u32) -> bool {
    matches!(*n, 0)
}
352
/// Serde default for `hierarchy_min_children`.
fn default_hierarchy_min_children() -> usize {
    const MIN_CHILDREN: usize = 2;
    MIN_CHILDREN
}
357
/// Serde default for `batch_size`.
fn default_batch_size() -> usize {
    const FILES_PER_BATCH: usize = 500;
    FILES_PER_BATCH
}
362
363/// Kind of code symbol represented by a chunk
364#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
365#[serde(rename_all = "snake_case")]
366pub enum ChunkKind {
367 #[default]
368 Function,
369 Method,
370 Class,
371 Struct,
372 Enum,
373 Interface,
374 Trait,
375 Module,
376 Constant,
377 Variable,
378 Imports,
379 TopLevel,
380 FunctionPart,
381 ClassPart,
382}
383
384impl ChunkKind {
385 /// Get human-readable name for the chunk kind
386 pub fn name(&self) -> &'static str {
387 match self {
388 Self::Function => "function",
389 Self::Method => "method",
390 Self::Class => "class",
391 Self::Struct => "struct",
392 Self::Enum => "enum",
393 Self::Interface => "interface",
394 Self::Trait => "trait",
395 Self::Module => "module",
396 Self::Constant => "constant",
397 Self::Variable => "variable",
398 Self::Imports => "imports",
399 Self::TopLevel => "top_level",
400 Self::FunctionPart => "function_part",
401 Self::ClassPart => "class_part",
402 }
403 }
404
405 /// Check if this is a partial chunk (split from a larger symbol)
406 pub fn is_part(&self) -> bool {
407 matches!(self, Self::FunctionPart | Self::ClassPart)
408 }
409}
410
411/// Visibility modifier for symbols
412#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
413#[serde(rename_all = "snake_case")]
414pub enum Visibility {
415 #[default]
416 Public,
417 Private,
418 Protected,
419 Internal,
420}
421
422impl Visibility {
423 /// Get the visibility name
424 pub fn name(&self) -> &'static str {
425 match self {
426 Self::Public => "public",
427 Self::Private => "private",
428 Self::Protected => "protected",
429 Self::Internal => "internal",
430 }
431 }
432}
433
434/// Information about a chunk that was split from a larger symbol
435#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
436pub struct ChunkPart {
437 /// Part number (1-indexed)
438 pub part: u32,
439
440 /// Total number of parts
441 pub of: u32,
442
443 /// ID of the logical parent (full symbol hash)
444 pub parent_id: String,
445
446 /// Signature repeated for context
447 pub parent_signature: String,
448
449 /// Number of overlapping lines from the previous chunk (for context continuity)
450 /// This is 0 for the first part, and > 0 for subsequent parts when overlap is enabled.
451 #[serde(skip_serializing_if = "is_zero", default)]
452 pub overlap_lines: u32,
453}
454
455/// Settings that control chunk generation
456///
457/// These settings affect the output of chunk generation. Changing settings
458/// will result in different chunk IDs, so the manifest tracks settings
459/// to detect when a full rebuild is needed.
460#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
461pub struct EmbedSettings {
462 /// Maximum tokens per chunk (default: 1000 for code models)
463 pub max_tokens: u32,
464
465 /// Minimum tokens per chunk (smaller merged, default: 50)
466 pub min_tokens: u32,
467
468 /// Overlap tokens between sequential chunks (default: 100)
469 pub overlap_tokens: u32,
470
471 /// Lines of context around symbols (default: 5)
472 pub context_lines: u32,
473
474 /// Include import statements as separate chunks
475 pub include_imports: bool,
476
477 /// Include top-level code outside symbols
478 pub include_top_level: bool,
479
480 /// Token counting model
481 pub token_model: String,
482
483 /// Version of chunking algorithm (for compatibility)
484 pub algorithm_version: u32,
485
486 /// Enable secret scanning
487 pub scan_secrets: bool,
488
489 /// Fail if secrets detected (CI mode)
490 pub fail_on_secrets: bool,
491
492 /// Redact detected secrets
493 pub redact_secrets: bool,
494
495 /// Include glob patterns (e.g., ["*.rs", "src/**"])
496 /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
497 #[serde(default)]
498 pub include_patterns: Vec<String>,
499
500 /// Exclude glob patterns (e.g., ["tests/*", "*.test.*"])
501 /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
502 #[serde(default)]
503 pub exclude_patterns: Vec<String>,
504
505 /// Include test files (default: false)
506 #[serde(default)]
507 pub include_tests: bool,
508
509 /// Generate signature-only chunks alongside full code chunks
510 ///
511 /// When enabled, each code chunk that has a signature in its context will
512 /// produce an additional compact signature-only chunk. This enables tiered
513 /// retrieval: search signatures broadly, then fetch full code for top matches.
514 ///
515 /// Signature chunks have `repr: "signature"` and link back to the code chunk
516 /// via the `code_chunk_id` field.
517 #[serde(default)]
518 pub include_signatures: bool,
519
520 /// Enable hierarchical chunking for improved RAG recall
521 ///
522 /// When enabled, generates summary chunks for container types (classes, structs)
523 /// that list their children with signatures and brief descriptions. This enables
524 /// RAG systems to retrieve both high-level overviews and specific implementations.
525 ///
526 /// Recommended for object-oriented codebases (Java, Python, TypeScript).
527 #[serde(default)]
528 pub enable_hierarchy: bool,
529
530 /// Minimum number of children required to generate a summary chunk
531 /// (default: 2, only relevant when enable_hierarchy is true)
532 #[serde(default = "default_hierarchy_min_children")]
533 pub hierarchy_min_children: usize,
534
535 /// Enrich chunks with git metadata (change frequency, authors, last modified)
536 /// Requires the repository to be a git repository. Disabled by default.
537 #[serde(default)]
538 pub git_metadata: bool,
539
540 /// Repository namespace for cross-repository identity (e.g., "github.com/myorg")
541 /// Used to prefix FQNs and populate RepoIdentifier on generated chunks.
542 #[serde(default)]
543 pub repo_namespace: Option<String>,
544
545 /// Repository name override (e.g., "auth-service")
546 /// If not set, defaults to the directory name of the repository root.
547 #[serde(default)]
548 pub repo_name: Option<String>,
549
550 /// Enable streaming output mode for memory-efficient large repo processing
551 ///
552 /// When enabled, files are processed in batches and chunks are written to the
553 /// output as they are generated, rather than collecting all chunks into memory
554 /// first. This reduces peak memory from O(all chunks) to O(batch size).
555 ///
556 /// Trade-offs vs non-streaming mode:
557 /// - `called_by` is populated within each batch only, not globally
558 /// - Ordering is deterministic within batches but not globally sorted across
559 /// batch boundaries (files within each batch are sorted, and batches are
560 /// processed in lexicographic file order)
561 #[serde(default)]
562 pub streaming: bool,
563
564 /// Number of files to process per batch in streaming mode (default: 500)
565 ///
566 /// Larger batches improve `called_by` coverage and reduce overhead, but use
567 /// more memory. Only relevant when `streaming` is true.
568 #[serde(default = "default_batch_size")]
569 pub batch_size: usize,
570}
571
572impl Default for EmbedSettings {
573 fn default() -> Self {
574 Self {
575 max_tokens: 1000, // Optimized for code embedding models
576 min_tokens: 50, // Minimum meaningful chunk size
577 overlap_tokens: 100, // Context continuity between chunks
578 context_lines: 5, // Capture docstrings above functions
579 include_imports: true, // Track dependencies
580 include_top_level: true, // Include module-level code
581 token_model: "claude".to_owned(),
582 algorithm_version: 1,
583 scan_secrets: true, // Safe default
584 fail_on_secrets: false,
585 redact_secrets: true, // Safe default
586 include_patterns: Vec::new(),
587 exclude_patterns: Vec::new(),
588 include_tests: false,
589 include_signatures: false, // Off by default for backward compatibility
590 enable_hierarchy: false, // Off by default for backward compatibility
591 hierarchy_min_children: 2, // Minimum children for summary generation
592 git_metadata: false, // Off by default (requires git repo)
593 repo_namespace: None,
594 repo_name: None,
595 streaming: false, // Off by default for full determinism
596 batch_size: 500, // Files per batch in streaming mode
597 }
598 }
599}
600
601impl EmbedSettings {
602 /// Current algorithm version
603 pub const CURRENT_ALGORITHM_VERSION: u32 = 1;
604
605 /// Maximum tokens limit (DoS protection)
606 pub const MAX_TOKENS_LIMIT: u32 = 100_000;
607
608 /// Get recommended settings for specific embedding model
609 ///
610 /// Different embedding models have different optimal chunk sizes:
611 /// - voyage-code-2/3: 1500 tokens (large context window)
612 /// - cohere-embed-v3: 400 tokens (smaller model)
613 /// - openai-text-embedding-3: 800 tokens (balanced)
614 /// - sentence-transformers: 384 tokens (BERT-based)
615 pub fn for_embedding_model(model: &str) -> Self {
616 let mut settings = Self::default();
617 settings.max_tokens = match model.to_lowercase().as_str() {
618 "voyage-code-2" | "voyage-code-3" => 1500,
619 "cohere-embed-v3" | "cohere" => 400,
620 "openai-text-embedding-3-small" | "openai-text-embedding-3-large" | "openai" => 800,
621 "sentence-transformers" | "all-minilm" | "minilm" => 384,
622 _ => 1000, // Default for most code models
623 };
624 settings
625 }
626
627 /// Validate settings, return error if invalid
628 pub fn validate(&self) -> Result<(), EmbedError> {
629 if self.max_tokens > Self::MAX_TOKENS_LIMIT {
630 return Err(EmbedError::InvalidSettings {
631 field: "max_tokens".to_owned(),
632 reason: format!("exceeds limit of {}", Self::MAX_TOKENS_LIMIT),
633 });
634 }
635 if self.min_tokens > self.max_tokens {
636 return Err(EmbedError::InvalidSettings {
637 field: "min_tokens".to_owned(),
638 reason: "cannot exceed max_tokens".to_owned(),
639 });
640 }
641 if self.algorithm_version > Self::CURRENT_ALGORITHM_VERSION {
642 return Err(EmbedError::UnsupportedAlgorithmVersion {
643 found: self.algorithm_version,
644 max_supported: Self::CURRENT_ALGORITHM_VERSION,
645 });
646 }
647 Ok(())
648 }
649
650 /// Create settings optimized for CI/CD pipelines
651 ///
652 /// These settings fail on secrets and use stricter validation.
653 pub fn for_ci() -> Self {
654 Self {
655 fail_on_secrets: true,
656 scan_secrets: true,
657 redact_secrets: false, // Fail instead of redact
658 ..Self::default()
659 }
660 }
661}
662
663/// Convert from the parser's SymbolKind to our ChunkKind
664impl From<crate::types::SymbolKind> for ChunkKind {
665 fn from(kind: crate::types::SymbolKind) -> Self {
666 match kind {
667 crate::types::SymbolKind::Function => ChunkKind::Function,
668 crate::types::SymbolKind::Method => ChunkKind::Method,
669 crate::types::SymbolKind::Class => ChunkKind::Class,
670 crate::types::SymbolKind::Struct => ChunkKind::Struct,
671 crate::types::SymbolKind::Enum => ChunkKind::Enum,
672 crate::types::SymbolKind::Interface => ChunkKind::Interface,
673 crate::types::SymbolKind::Trait => ChunkKind::Trait,
674 crate::types::SymbolKind::Import => ChunkKind::Imports,
675 crate::types::SymbolKind::Constant => ChunkKind::Constant,
676 crate::types::SymbolKind::Variable => ChunkKind::Variable,
677 crate::types::SymbolKind::TypeAlias => ChunkKind::Struct, // Map type aliases to struct
678 crate::types::SymbolKind::Export => ChunkKind::Imports, // Map exports to imports
679 crate::types::SymbolKind::Module => ChunkKind::Module,
680 crate::types::SymbolKind::Macro => ChunkKind::Function, // Map macros to functions
681 }
682 }
683}
684
685/// Convert from the parser's Visibility to our Visibility
686impl From<crate::types::Visibility> for Visibility {
687 fn from(vis: crate::types::Visibility) -> Self {
688 match vis {
689 crate::types::Visibility::Public => Visibility::Public,
690 crate::types::Visibility::Private => Visibility::Private,
691 crate::types::Visibility::Protected => Visibility::Protected,
692 crate::types::Visibility::Internal => Visibility::Internal,
693 }
694 }
695}
696
#[cfg(test)]
mod tests {
    use super::*;

    /// The defaults expose the documented token limits and keep secret scanning on.
    #[test]
    fn test_default_settings() {
        let defaults = EmbedSettings::default();
        assert_eq!(defaults.max_tokens, 1000);
        assert_eq!(defaults.min_tokens, 50);
        assert_eq!(defaults.overlap_tokens, 100);
        assert!(defaults.scan_secrets);
    }

    /// `validate` accepts the defaults and rejects out-of-range token limits.
    #[test]
    fn test_validate_settings() {
        assert!(EmbedSettings::default().validate().is_ok());

        // Invalid: max_tokens too large
        let oversized = EmbedSettings { max_tokens: 200_000, ..EmbedSettings::default() };
        assert!(oversized.validate().is_err());

        // Invalid: min > max
        let inverted =
            EmbedSettings { max_tokens: 100, min_tokens: 200, ..EmbedSettings::default() };
        assert!(inverted.validate().is_err());
    }

    /// Known model names select their chunk size; unknown names fall back to 1000.
    #[test]
    fn test_for_embedding_model() {
        let expectations = [("voyage-code-2", 1500), ("cohere", 400), ("unknown-model", 1000)];
        for (model, expected) in expectations {
            assert_eq!(EmbedSettings::for_embedding_model(model).max_tokens, expected);
        }
    }

    #[test]
    fn test_chunk_kind_name() {
        assert_eq!(ChunkKind::Function.name(), "function");
        assert_eq!(ChunkKind::FunctionPart.name(), "function_part");
    }

    #[test]
    fn test_chunk_kind_is_part() {
        for kind in [ChunkKind::FunctionPart, ChunkKind::ClassPart] {
            assert!(kind.is_part());
        }
        assert!(!ChunkKind::Function.is_part());
    }

    #[test]
    fn test_visibility_name() {
        assert_eq!(Visibility::Public.name(), "public");
        assert_eq!(Visibility::Private.name(), "private");
    }

    /// Settings survive a JSON round trip unchanged.
    #[test]
    fn test_settings_serialization() {
        let original = EmbedSettings::default();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: EmbedSettings = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// CI settings scan for secrets and fail rather than redact.
    #[test]
    fn test_ci_settings() {
        let ci = EmbedSettings::for_ci();
        assert!(ci.fail_on_secrets);
        assert!(ci.scan_secrets);
        assert!(!ci.redact_secrets);
    }
}