infiniloom_engine/embedding/
types.rs

//! Core types for embedding chunk generation
//!
//! This module defines the data structures used throughout the embedding system,
//! including chunks, settings, and metadata types.
5
6use serde::{Deserialize, Serialize};
7
8use super::error::EmbedError;
9
/// Repository identifier for multi-tenant RAG systems
///
/// This enables embedding multiple codebases into a single vector database
/// while maintaining clear isolation and traceability. Essential for:
/// - Multi-repository search with proper attribution
/// - Access control based on repository ownership
/// - Cross-repository dependency tracking
/// - Audit trails for compliance (SOC2, GDPR)
///
/// The derived `Default` value has no namespace, an empty `name`, and no
/// version, branch, or commit.
///
/// # Example
///
/// ```
/// use infiniloom_engine::embedding::RepoIdentifier;
///
/// let repo = RepoIdentifier {
///     namespace: Some("github.com/myorg".to_string()),
///     name: "auth-service".to_string(),
///     version: Some("v2.1.0".to_string()),
///     branch: Some("main".to_string()),
///     commit: Some("abc123def".to_string()),
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub struct RepoIdentifier {
    /// Namespace/organization (e.g., "github.com/myorg", "gitlab.com/team")
    /// Used for grouping and access control.
    ///
    /// Unlike the optional fields below, this one carries no
    /// `skip_serializing_if`, so it is written out even when it is `None`.
    pub namespace: Option<String>,

    /// Repository name (e.g., "auth-service", "frontend")
    pub name: String,

    /// Semantic version or tag (e.g., "v2.1.0", "release-2024.01")
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,

    /// Branch name (e.g., "main", "feature/new-auth")
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub branch: Option<String>,

    /// Git commit hash (short or full)
    /// Omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub commit: Option<String>,
}
53
54impl RepoIdentifier {
55    /// Create a new repository identifier
56    pub fn new(namespace: impl Into<String>, name: impl Into<String>) -> Self {
57        let ns: String = namespace.into();
58        Self {
59            namespace: if ns.is_empty() { None } else { Some(ns) },
60            name: name.into(),
61            version: None,
62            branch: None,
63            commit: None,
64        }
65    }
66
67    /// Create with full details including version and commit
68    pub fn full(
69        namespace: Option<String>,
70        name: impl Into<String>,
71        version: Option<String>,
72        branch: Option<String>,
73        commit: Option<String>,
74    ) -> Self {
75        Self { namespace, name: name.into(), version, branch, commit }
76    }
77
78    /// Get fully qualified repository name (namespace/name)
79    pub fn qualified_name(&self) -> String {
80        match &self.namespace {
81            Some(ns) if !ns.is_empty() => format!("{}/{}", ns, self.name),
82            _ => self.name.clone(),
83        }
84    }
85
86    /// Check if this identifier represents the same repository (ignores version/commit)
87    pub fn same_repo(&self, other: &Self) -> bool {
88        self.namespace == other.namespace && self.name == other.name
89    }
90}
91
/// A single embedding chunk with stable, content-addressable ID
///
/// Each chunk represents a semantic unit of code (function, class, etc.) with
/// a deterministic ID derived from its normalized content. This enables:
/// - Cross-repository deduplication (same code = same ID)
/// - Incremental updates (compare IDs to detect changes)
/// - Stable references for vector databases
///
/// Serialization notes: `children_ids`, `code_chunk_id`, and `part` are left
/// out of the output when empty/`None`, and every field marked
/// `#[serde(default)]` may be absent on input.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EmbedChunk {
    /// Content-addressable ID: BLAKE3 hash of normalized content
    /// Format: "ec_" + 32 hex chars (128 bits) - collision-resistant for enterprise scale
    pub id: String,

    /// Full 256-bit hash for collision verification
    /// Defaults to an empty string when missing from the input.
    #[serde(default)]
    pub full_hash: String,

    /// The actual code content (normalized)
    pub content: String,

    /// Token count for the target model
    pub tokens: u32,

    /// Symbol kind
    pub kind: ChunkKind,

    /// Source location metadata
    pub source: ChunkSource,

    /// Enriched context for better retrieval
    pub context: ChunkContext,

    /// IDs of child chunks (methods inside a class, etc.)
    /// Sorted for determinism. Enables hierarchical navigation in RAG systems.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub children_ids: Vec<String>,

    /// Representation type: "code" (default) or "signature"
    ///
    /// Code chunks contain the full implementation. Signature chunks contain only
    /// the declaration/signature, enabling tiered retrieval: search signatures
    /// broadly, then fetch full code for top matches.
    ///
    /// Always serialized; when missing on input it is filled in by
    /// `default_repr`.
    #[serde(default = "default_repr")]
    pub repr: String,

    /// For non-code representations, the ID of the full code chunk
    ///
    /// This links a signature chunk back to its corresponding code chunk,
    /// enabling two-phase retrieval workflows.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub code_chunk_id: Option<String>,

    /// For split chunks: part N of M
    /// `None` for chunks that were not split.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub part: Option<ChunkPart>,
}
148
149/// Default representation type for chunks
150pub(super) fn default_repr() -> String {
151    "code".to_owned()
152}
153
/// Source location metadata for a chunk
///
/// This metadata helps identify where the chunk originated, but importantly
/// does NOT affect the chunk ID (which is based solely on content).
///
/// Fields tagged `skip_serializing_if` are omitted from the output when they
/// hold their empty value; all of them also carry `default`, so they may be
/// absent on input.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkSource {
    /// Repository identifier for multi-tenant RAG
    /// Essential for distinguishing chunks from different codebases
    ///
    /// Omitted from serialized output when `is_default_repo` reports the
    /// identifier as empty.
    #[serde(default, skip_serializing_if = "is_default_repo")]
    pub repo: RepoIdentifier,

    /// Relative file path (from repo root, never absolute)
    pub file: String,

    /// Line range (1-indexed, inclusive)
    /// NOTE(review): presumably `(start, end)` — confirm against the chunker.
    pub lines: (u32, u32),

    /// Symbol name
    pub symbol: String,

    /// Fully qualified name
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub fqn: Option<String>,

    /// Programming language
    pub language: String,

    /// Parent symbol (for methods inside classes)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub parent: Option<String>,

    /// Visibility modifier
    pub visibility: Visibility,

    /// Whether this is test code
    /// Defaults to `false` when missing from the input.
    #[serde(default)]
    pub is_test: bool,

    /// Module path derived from file path and language conventions
    /// e.g., "auth::jwt" for src/auth/jwt.rs in Rust
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub module_path: Option<String>,

    /// Chunk ID of the parent container (class/struct/enum/trait/interface)
    /// Enables hierarchical navigation in RAG systems
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub parent_chunk_id: Option<String>,
}
202
203/// Helper for skip_serializing_if - skip if repo is default (empty)
204fn is_default_repo(repo: &RepoIdentifier) -> bool {
205    repo.namespace.is_none() && repo.name.is_empty()
206}
207
/// Git metadata for a chunk's source file
///
/// Enriches chunks with version control history for temporal-aware retrieval.
/// All fields are optional to gracefully handle non-git repos, shallow clones,
/// and untracked files.
///
/// Every field is omitted from serialized output when unset/empty and may be
/// absent on input; the derived `Default` is the "nothing known" value.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq, Eq)]
pub struct GitMetadata {
    /// ISO 8601 date of last modification
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub last_modified: Option<String>,

    /// Number of commits touching this file in the lookback period (90 days)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub change_frequency: Option<u32>,

    /// Total commits ever touching this file
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub total_commits: Option<u32>,

    /// Unique authors (sorted, deduplicated for determinism)
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub authors: Vec<String>,

    /// Age in days since first commit touching this file
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub age_days: Option<u32>,
}
235
/// Context information extracted from the chunk for better retrieval
///
/// This metadata improves RAG recall by providing natural language descriptions,
/// signatures for type matching, and relationship information.
///
/// Every field is skipped during serialization when it holds its empty value
/// (`None`, an empty `Vec`, or `0`), so a default `ChunkContext` serializes to
/// an empty object.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkContext {
    // === Documentation and Relationships ===
    /// Extracted docstring (for natural language retrieval)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub docstring: Option<String>,

    /// Extracted comments within the chunk
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub comments: Vec<String>,

    /// Function/class signature (always included, even in split parts)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub signature: Option<String>,

    /// Symbols this chunk calls
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub calls: Vec<String>,

    /// Symbols that call this chunk
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub called_by: Vec<String>,

    /// Import dependencies
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub imports: Vec<String>,

    /// Auto-generated semantic tags
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub tags: Vec<String>,

    // === RAG Retrieval Enhancements ===
    /// Top keywords extracted from chunk content for BM25/sparse retrieval.
    /// Identifiers are split on non-alphanumeric boundaries, filtered for stopwords,
    /// and ranked by frequency (top 10).
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub keywords: Vec<String>,

    /// Brief description of where the chunk fits in the codebase.
    /// Generated from file path + parent context, e.g. "From src/auth.rs, in class AuthService:"
    #[serde(skip_serializing_if = "Option::is_none", default)]
    pub context_prefix: Option<String>,

    /// Natural language summary for improved semantic search.
    /// Generated from docstring (first line) or heuristic template.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub summary: Option<String>,

    // === Call Resolution ===
    /// Fully qualified calls resolved via import scope.
    /// Each entry is a qualified name like "crate::auth::jwt::verify_token" or "auth.jwt::verify".
    /// Populated by the import resolver for Rust, TypeScript, and Python files.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub qualified_calls: Vec<String>,

    /// Calls that could not be resolved via imports or same-file symbols.
    /// These are raw call names that had no matching import or local definition.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub unresolved_calls: Vec<String>,

    // === Identifiers and Type Information ===
    /// Space-separated string of all unique identifiers extracted from the chunk,
    /// optimized for BM25/sparse text indexing. Includes both original identifiers
    /// and their camelCase/snake_case split parts, all lowercased.
    /// Language keywords and single-character identifiers are filtered out.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub identifiers: Option<String>,

    /// Full type signature: "fn verify_token(token: &str) -> Result<Claims, AuthError>"
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub type_signature: Option<String>,

    /// Individual parameter types: ["i32", "&str"]
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub parameter_types: Vec<String>,

    /// Return type: "Result<Claims, AuthError>"
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub return_type: Option<String>,

    /// Error/exception types: ["AuthError"]
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub error_types: Vec<String>,

    // === Complexity Metrics ===
    // These enable filtering by code complexity in RAG applications
    /// Lines of code in this chunk (excluding blank lines and comments)
    /// Useful for filtering out trivial one-liners vs substantial implementations
    /// A value of 0 is not serialized.
    #[serde(skip_serializing_if = "is_zero", default)]
    pub lines_of_code: u32,

    /// Maximum nesting depth (control flow, blocks)
    /// Higher values indicate more complex logic; useful for prioritizing review
    /// A value of 0 is not serialized.
    #[serde(skip_serializing_if = "is_zero", default)]
    pub max_nesting_depth: u32,

    // === Git Metadata ===
    /// Git version control metadata (change frequency, authors, last modified)
    /// Only populated when `EmbedSettings::git_metadata` is enabled
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub git: Option<GitMetadata>,

    // === Complexity and Dependency Metrics (continued) ===
    /// Cyclomatic complexity score (1 + number of branch points)
    /// Computed via Tree-sitter AST analysis of the chunk's source code.
    /// A score of 1 means linear code with no branching.
    /// Higher scores indicate more complex control flow; useful for filtering
    /// and prioritizing code review in RAG applications.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub complexity_score: Option<u32>,

    /// Number of symbols that call/depend on this chunk
    /// Derived from the bidirectional call graph (called_by.len())
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dependents_count: Option<u32>,
}
347
/// Predicate for serde's `skip_serializing_if`: true when the counter is 0.
///
/// Takes `&u32` rather than `u32` because serde passes the field by reference.
fn is_zero(n: &u32) -> bool {
    matches!(*n, 0)
}
352
/// Serde fallback for `EmbedSettings::hierarchy_min_children`.
///
/// Used when the field is missing from deserialized settings.
fn default_hierarchy_min_children() -> usize {
    const MIN_CHILDREN: usize = 2;
    MIN_CHILDREN
}
357
/// Serde fallback for `EmbedSettings::batch_size`.
///
/// Used when the field is missing from deserialized settings.
fn default_batch_size() -> usize {
    const FILES_PER_BATCH: usize = 500;
    FILES_PER_BATCH
}
362
/// Kind of code symbol represented by a chunk
///
/// Serialized in `snake_case` (e.g. `FunctionPart` becomes `"function_part"`).
/// The default kind is `Function`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum ChunkKind {
    /// Free-standing function (also the target for parser macros).
    #[default]
    Function,
    /// Method defined inside a container such as a class.
    Method,
    /// Class definition.
    Class,
    /// Struct definition (also the target for parser type aliases).
    Struct,
    /// Enum definition.
    Enum,
    /// Interface definition.
    Interface,
    /// Trait definition.
    Trait,
    /// Module.
    Module,
    /// Constant.
    Constant,
    /// Variable.
    Variable,
    /// Import statements (also the target for parser exports).
    Imports,
    /// Top-level code outside of any symbol.
    TopLevel,
    /// One part of a function that was split into several chunks.
    FunctionPart,
    /// One part of a class that was split into several chunks.
    ClassPart,
}
383
384impl ChunkKind {
385    /// Get human-readable name for the chunk kind
386    pub fn name(&self) -> &'static str {
387        match self {
388            Self::Function => "function",
389            Self::Method => "method",
390            Self::Class => "class",
391            Self::Struct => "struct",
392            Self::Enum => "enum",
393            Self::Interface => "interface",
394            Self::Trait => "trait",
395            Self::Module => "module",
396            Self::Constant => "constant",
397            Self::Variable => "variable",
398            Self::Imports => "imports",
399            Self::TopLevel => "top_level",
400            Self::FunctionPart => "function_part",
401            Self::ClassPart => "class_part",
402        }
403    }
404
405    /// Check if this is a partial chunk (split from a larger symbol)
406    pub fn is_part(&self) -> bool {
407        matches!(self, Self::FunctionPart | Self::ClassPart)
408    }
409}
410
/// Visibility modifier for symbols
///
/// Serialized in `snake_case` (e.g. `"public"`). The default is `Public`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum Visibility {
    /// Public symbol (the default).
    #[default]
    Public,
    /// Private symbol.
    Private,
    /// Protected symbol.
    Protected,
    /// Internal symbol.
    Internal,
}
421
422impl Visibility {
423    /// Get the visibility name
424    pub fn name(&self) -> &'static str {
425        match self {
426            Self::Public => "public",
427            Self::Private => "private",
428            Self::Protected => "protected",
429            Self::Internal => "internal",
430        }
431    }
432}
433
/// Information about a chunk that was split from a larger symbol
///
/// Attached to a chunk through `EmbedChunk::part`; describes which slice of
/// the original symbol the chunk holds.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkPart {
    /// Part number (1-indexed)
    pub part: u32,

    /// Total number of parts
    pub of: u32,

    /// ID of the logical parent (full symbol hash)
    pub parent_id: String,

    /// Signature repeated for context
    pub parent_signature: String,

    /// Number of overlapping lines from the previous chunk (for context continuity)
    /// This is 0 for the first part, and > 0 for subsequent parts when overlap is enabled.
    /// A value of 0 is omitted from serialized output and is the default on input.
    #[serde(skip_serializing_if = "is_zero", default)]
    pub overlap_lines: u32,
}
454
/// Settings that control chunk generation
///
/// These settings affect the output of chunk generation. Changing settings
/// will result in different chunk IDs, so the manifest tracks settings
/// to detect when a full rebuild is needed.
///
/// No field uses `skip_serializing_if`: every field is always written, which
/// keeps the type usable with formats such as bincode that require all fields.
/// Fields marked `#[serde(default)]` may still be absent on input.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EmbedSettings {
    /// Maximum tokens per chunk (default: 1000 for code models)
    pub max_tokens: u32,

    /// Minimum tokens per chunk (smaller merged, default: 50)
    pub min_tokens: u32,

    /// Overlap tokens between sequential chunks (default: 100)
    pub overlap_tokens: u32,

    /// Lines of context around symbols (default: 5)
    pub context_lines: u32,

    /// Include import statements as separate chunks
    pub include_imports: bool,

    /// Include top-level code outside symbols
    pub include_top_level: bool,

    /// Token counting model
    pub token_model: String,

    /// Version of chunking algorithm (for compatibility)
    pub algorithm_version: u32,

    /// Enable secret scanning
    pub scan_secrets: bool,

    /// Fail if secrets detected (CI mode)
    pub fail_on_secrets: bool,

    /// Redact detected secrets
    pub redact_secrets: bool,

    /// Include glob patterns (e.g., ["*.rs", "src/**"])
    /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
    #[serde(default)]
    pub include_patterns: Vec<String>,

    /// Exclude glob patterns (e.g., ["tests/*", "*.test.*"])
    /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
    #[serde(default)]
    pub exclude_patterns: Vec<String>,

    /// Include test files (default: false)
    #[serde(default)]
    pub include_tests: bool,

    /// Generate signature-only chunks alongside full code chunks
    ///
    /// When enabled, each code chunk that has a signature in its context will
    /// produce an additional compact signature-only chunk. This enables tiered
    /// retrieval: search signatures broadly, then fetch full code for top matches.
    ///
    /// Signature chunks have `repr: "signature"` and link back to the code chunk
    /// via the `code_chunk_id` field.
    #[serde(default)]
    pub include_signatures: bool,

    /// Enable hierarchical chunking for improved RAG recall
    ///
    /// When enabled, generates summary chunks for container types (classes, structs)
    /// that list their children with signatures and brief descriptions. This enables
    /// RAG systems to retrieve both high-level overviews and specific implementations.
    ///
    /// Recommended for object-oriented codebases (Java, Python, TypeScript).
    #[serde(default)]
    pub enable_hierarchy: bool,

    /// Minimum number of children required to generate a summary chunk
    /// (default: 2, only relevant when enable_hierarchy is true)
    ///
    /// When missing on input the value comes from
    /// `default_hierarchy_min_children`.
    #[serde(default = "default_hierarchy_min_children")]
    pub hierarchy_min_children: usize,

    /// Enrich chunks with git metadata (change frequency, authors, last modified)
    /// Requires the repository to be a git repository. Disabled by default.
    #[serde(default)]
    pub git_metadata: bool,

    /// Repository namespace for cross-repository identity (e.g., "github.com/myorg")
    /// Used to prefix FQNs and populate RepoIdentifier on generated chunks.
    #[serde(default)]
    pub repo_namespace: Option<String>,

    /// Repository name override (e.g., "auth-service")
    /// If not set, defaults to the directory name of the repository root.
    #[serde(default)]
    pub repo_name: Option<String>,

    /// Enable streaming output mode for memory-efficient large repo processing
    ///
    /// When enabled, files are processed in batches and chunks are written to the
    /// output as they are generated, rather than collecting all chunks into memory
    /// first. This reduces peak memory from O(all chunks) to O(batch size).
    ///
    /// Trade-offs vs non-streaming mode:
    /// - `called_by` is populated within each batch only, not globally
    /// - Ordering is deterministic within batches but not globally sorted across
    ///   batch boundaries (files within each batch are sorted, and batches are
    ///   processed in lexicographic file order)
    #[serde(default)]
    pub streaming: bool,

    /// Number of files to process per batch in streaming mode (default: 500)
    ///
    /// Larger batches improve `called_by` coverage and reduce overhead, but use
    /// more memory. Only relevant when `streaming` is true.
    ///
    /// When missing on input the value comes from `default_batch_size`.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
571
572impl Default for EmbedSettings {
573    fn default() -> Self {
574        Self {
575            max_tokens: 1000,        // Optimized for code embedding models
576            min_tokens: 50,          // Minimum meaningful chunk size
577            overlap_tokens: 100,     // Context continuity between chunks
578            context_lines: 5,        // Capture docstrings above functions
579            include_imports: true,   // Track dependencies
580            include_top_level: true, // Include module-level code
581            token_model: "claude".to_owned(),
582            algorithm_version: 1,
583            scan_secrets: true, // Safe default
584            fail_on_secrets: false,
585            redact_secrets: true, // Safe default
586            include_patterns: Vec::new(),
587            exclude_patterns: Vec::new(),
588            include_tests: false,
589            include_signatures: false, // Off by default for backward compatibility
590            enable_hierarchy: false,   // Off by default for backward compatibility
591            hierarchy_min_children: 2, // Minimum children for summary generation
592            git_metadata: false,       // Off by default (requires git repo)
593            repo_namespace: None,
594            repo_name: None,
595            streaming: false, // Off by default for full determinism
596            batch_size: 500,  // Files per batch in streaming mode
597        }
598    }
599}
600
601impl EmbedSettings {
602    /// Current algorithm version
603    pub const CURRENT_ALGORITHM_VERSION: u32 = 1;
604
605    /// Maximum tokens limit (DoS protection)
606    pub const MAX_TOKENS_LIMIT: u32 = 100_000;
607
608    /// Get recommended settings for specific embedding model
609    ///
610    /// Different embedding models have different optimal chunk sizes:
611    /// - voyage-code-2/3: 1500 tokens (large context window)
612    /// - cohere-embed-v3: 400 tokens (smaller model)
613    /// - openai-text-embedding-3: 800 tokens (balanced)
614    /// - sentence-transformers: 384 tokens (BERT-based)
615    pub fn for_embedding_model(model: &str) -> Self {
616        let mut settings = Self::default();
617        settings.max_tokens = match model.to_lowercase().as_str() {
618            "voyage-code-2" | "voyage-code-3" => 1500,
619            "cohere-embed-v3" | "cohere" => 400,
620            "openai-text-embedding-3-small" | "openai-text-embedding-3-large" | "openai" => 800,
621            "sentence-transformers" | "all-minilm" | "minilm" => 384,
622            _ => 1000, // Default for most code models
623        };
624        settings
625    }
626
627    /// Validate settings, return error if invalid
628    pub fn validate(&self) -> Result<(), EmbedError> {
629        if self.max_tokens > Self::MAX_TOKENS_LIMIT {
630            return Err(EmbedError::InvalidSettings {
631                field: "max_tokens".to_owned(),
632                reason: format!("exceeds limit of {}", Self::MAX_TOKENS_LIMIT),
633            });
634        }
635        if self.min_tokens > self.max_tokens {
636            return Err(EmbedError::InvalidSettings {
637                field: "min_tokens".to_owned(),
638                reason: "cannot exceed max_tokens".to_owned(),
639            });
640        }
641        if self.algorithm_version > Self::CURRENT_ALGORITHM_VERSION {
642            return Err(EmbedError::UnsupportedAlgorithmVersion {
643                found: self.algorithm_version,
644                max_supported: Self::CURRENT_ALGORITHM_VERSION,
645            });
646        }
647        Ok(())
648    }
649
650    /// Create settings optimized for CI/CD pipelines
651    ///
652    /// These settings fail on secrets and use stricter validation.
653    pub fn for_ci() -> Self {
654        Self {
655            fail_on_secrets: true,
656            scan_secrets: true,
657            redact_secrets: false, // Fail instead of redact
658            ..Self::default()
659        }
660    }
661}
662
663/// Convert from the parser's SymbolKind to our ChunkKind
664impl From<crate::types::SymbolKind> for ChunkKind {
665    fn from(kind: crate::types::SymbolKind) -> Self {
666        match kind {
667            crate::types::SymbolKind::Function => ChunkKind::Function,
668            crate::types::SymbolKind::Method => ChunkKind::Method,
669            crate::types::SymbolKind::Class => ChunkKind::Class,
670            crate::types::SymbolKind::Struct => ChunkKind::Struct,
671            crate::types::SymbolKind::Enum => ChunkKind::Enum,
672            crate::types::SymbolKind::Interface => ChunkKind::Interface,
673            crate::types::SymbolKind::Trait => ChunkKind::Trait,
674            crate::types::SymbolKind::Import => ChunkKind::Imports,
675            crate::types::SymbolKind::Constant => ChunkKind::Constant,
676            crate::types::SymbolKind::Variable => ChunkKind::Variable,
677            crate::types::SymbolKind::TypeAlias => ChunkKind::Struct, // Map type aliases to struct
678            crate::types::SymbolKind::Export => ChunkKind::Imports,   // Map exports to imports
679            crate::types::SymbolKind::Module => ChunkKind::Module,
680            crate::types::SymbolKind::Macro => ChunkKind::Function, // Map macros to functions
681        }
682    }
683}
684
685/// Convert from the parser's Visibility to our Visibility
686impl From<crate::types::Visibility> for Visibility {
687    fn from(vis: crate::types::Visibility) -> Self {
688        match vis {
689            crate::types::Visibility::Public => Visibility::Public,
690            crate::types::Visibility::Private => Visibility::Private,
691            crate::types::Visibility::Protected => Visibility::Protected,
692            crate::types::Visibility::Internal => Visibility::Internal,
693        }
694    }
695}
696
#[cfg(test)]
mod tests {
    use super::*;

    /// Defaults carry the documented values and pass validation.
    #[test]
    fn test_default_settings() {
        let settings = EmbedSettings::default();
        assert_eq!(settings.max_tokens, 1000);
        assert_eq!(settings.min_tokens, 50);
        assert_eq!(settings.overlap_tokens, 100);
        assert!(settings.scan_secrets);

        // `Default` and the serde fallbacks must describe the same values.
        assert_eq!(settings.algorithm_version, EmbedSettings::CURRENT_ALGORITHM_VERSION);
        assert_eq!(settings.hierarchy_min_children, default_hierarchy_min_children());
        assert_eq!(settings.batch_size, default_batch_size());
        assert!(settings.validate().is_ok());
    }

    /// Token limits are enforced and reported against the offending field.
    #[test]
    fn test_validate_settings() {
        let mut settings = EmbedSettings::default();
        assert!(settings.validate().is_ok());

        // Boundary: exactly the limit is still valid
        settings.max_tokens = EmbedSettings::MAX_TOKENS_LIMIT;
        assert!(settings.validate().is_ok());

        // Invalid: max_tokens too large
        settings.max_tokens = 200_000;
        assert!(matches!(
            settings.validate(),
            Err(EmbedError::InvalidSettings { field, .. }) if field == "max_tokens"
        ));

        // Invalid: min > max
        settings.max_tokens = 100;
        settings.min_tokens = 200;
        assert!(matches!(
            settings.validate(),
            Err(EmbedError::InvalidSettings { field, .. }) if field == "min_tokens"
        ));

        // Boundary: min == max is valid
        settings.min_tokens = 100;
        assert!(settings.validate().is_ok());
    }

    /// Settings written by a newer algorithm version are rejected.
    #[test]
    fn test_validate_rejects_future_algorithm_version() {
        let future = EmbedSettings::CURRENT_ALGORITHM_VERSION + 1;
        let settings = EmbedSettings { algorithm_version: future, ..EmbedSettings::default() };
        assert!(matches!(
            settings.validate(),
            Err(EmbedError::UnsupportedAlgorithmVersion { found, max_supported })
                if found == future && max_supported == EmbedSettings::CURRENT_ALGORITHM_VERSION
        ));
    }

    /// Model presets adjust `max_tokens`; lookup ignores letter case.
    #[test]
    fn test_for_embedding_model() {
        let voyage = EmbedSettings::for_embedding_model("voyage-code-2");
        assert_eq!(voyage.max_tokens, 1500);

        let voyage_mixed_case = EmbedSettings::for_embedding_model("Voyage-Code-3");
        assert_eq!(voyage_mixed_case.max_tokens, 1500);

        let cohere = EmbedSettings::for_embedding_model("cohere");
        assert_eq!(cohere.max_tokens, 400);

        let openai = EmbedSettings::for_embedding_model("openai");
        assert_eq!(openai.max_tokens, 800);

        let minilm = EmbedSettings::for_embedding_model("minilm");
        assert_eq!(minilm.max_tokens, 384);

        // Unknown models get plain defaults
        let unknown = EmbedSettings::for_embedding_model("unknown-model");
        assert_eq!(unknown.max_tokens, 1000);
        assert_eq!(unknown, EmbedSettings::default());
    }

    #[test]
    fn test_chunk_kind_name() {
        assert_eq!(ChunkKind::Function.name(), "function");
        assert_eq!(ChunkKind::FunctionPart.name(), "function_part");
    }

    /// `name()` must agree with the serde `snake_case` representation.
    #[test]
    fn test_chunk_kind_name_matches_serde() {
        let kinds = [
            ChunkKind::Function,
            ChunkKind::Method,
            ChunkKind::Class,
            ChunkKind::Struct,
            ChunkKind::Enum,
            ChunkKind::Interface,
            ChunkKind::Trait,
            ChunkKind::Module,
            ChunkKind::Constant,
            ChunkKind::Variable,
            ChunkKind::Imports,
            ChunkKind::TopLevel,
            ChunkKind::FunctionPart,
            ChunkKind::ClassPart,
        ];
        for kind in kinds {
            let json = serde_json::to_string(&kind).unwrap();
            assert_eq!(json, format!("\"{}\"", kind.name()));
        }
    }

    #[test]
    fn test_chunk_kind_is_part() {
        assert!(ChunkKind::FunctionPart.is_part());
        assert!(ChunkKind::ClassPart.is_part());
        assert!(!ChunkKind::Function.is_part());
    }

    /// Parser kinds without a dedicated chunk kind fold into the nearest one.
    #[test]
    fn test_chunk_kind_from_symbol_kind() {
        assert_eq!(ChunkKind::from(crate::types::SymbolKind::Function), ChunkKind::Function);
        assert_eq!(ChunkKind::from(crate::types::SymbolKind::Macro), ChunkKind::Function);
        assert_eq!(ChunkKind::from(crate::types::SymbolKind::TypeAlias), ChunkKind::Struct);
        assert_eq!(ChunkKind::from(crate::types::SymbolKind::Export), ChunkKind::Imports);
    }

    #[test]
    fn test_visibility_name() {
        assert_eq!(Visibility::Public.name(), "public");
        assert_eq!(Visibility::Private.name(), "private");
        assert_eq!(Visibility::Protected.name(), "protected");
        assert_eq!(Visibility::Internal.name(), "internal");
    }

    /// An empty namespace is dropped by `new` and ignored by `qualified_name`.
    #[test]
    fn test_repo_identifier_qualified_name() {
        let bare = RepoIdentifier::new("", "auth-service");
        assert_eq!(bare.namespace, None);
        assert_eq!(bare.qualified_name(), "auth-service");

        let namespaced = RepoIdentifier::new("github.com/myorg", "auth-service");
        assert_eq!(namespaced.qualified_name(), "github.com/myorg/auth-service");
        assert!(namespaced.same_repo(&RepoIdentifier::full(
            Some("github.com/myorg".to_owned()),
            "auth-service",
            Some("v2.1.0".to_owned()),
            None,
            None,
        )));
        assert!(!namespaced.same_repo(&bare));
    }

    #[test]
    fn test_settings_serialization() {
        let settings = EmbedSettings::default();
        let json = serde_json::to_string(&settings).unwrap();
        let deserialized: EmbedSettings = serde_json::from_str(&json).unwrap();
        assert_eq!(settings, deserialized);
    }

    #[test]
    fn test_ci_settings() {
        let ci = EmbedSettings::for_ci();
        assert!(ci.fail_on_secrets);
        assert!(ci.scan_secrets);
        assert!(!ci.redact_secrets);
        assert!(ci.validate().is_ok());
    }
}
infiniloom_engine/embedding/types.rs

infiniloom_engine/embedding/
types.rs