infiniloom_engine/embedding/
types.rs

1//! Core types for embedding chunk generation
2//!
3//! This module defines the data structures used throughout the embedding system,
4//! including chunks, settings, and metadata types.
5
6use serde::{Deserialize, Serialize};
7
8use super::error::EmbedError;
9
/// Repository identifier for multi-tenant RAG systems
///
/// This enables embedding multiple codebases into a single vector database
/// while maintaining clear isolation and traceability. Essential for:
/// - Multi-repository search with proper attribution
/// - Access control based on repository ownership
/// - Cross-repository dependency tracking
/// - Audit trails for compliance (SOC2, GDPR)
///
/// The derived `Default` produces empty `namespace` and `name` strings and
/// `None` for every optional field. Optional fields that are `None` are left
/// out of the serialized output.
///
/// # Example
///
/// ```
/// use infiniloom_engine::embedding::RepoIdentifier;
///
/// let repo = RepoIdentifier {
///     namespace: "github.com/myorg".to_string(),
///     name: "auth-service".to_string(),
///     version: Some("v2.1.0".to_string()),
///     branch: Some("main".to_string()),
///     commit: Some("abc123def".to_string()),
/// };
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
pub struct RepoIdentifier {
    /// Namespace/organization (e.g., "github.com/myorg", "gitlab.com/team")
    /// Used for grouping and access control.
    /// May be empty; `qualified_name` then returns just `name`.
    pub namespace: String,

    /// Repository name (e.g., "auth-service", "frontend")
    pub name: String,

    /// Semantic version or tag (e.g., "v2.1.0", "release-2024.01")
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub version: Option<String>,

    /// Branch name (e.g., "main", "feature/new-auth")
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub branch: Option<String>,

    /// Git commit hash (short or full)
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub commit: Option<String>,
}
53
54impl RepoIdentifier {
55    /// Create a new repository identifier
56    pub fn new(namespace: impl Into<String>, name: impl Into<String>) -> Self {
57        Self {
58            namespace: namespace.into(),
59            name: name.into(),
60            version: None,
61            branch: None,
62            commit: None,
63        }
64    }
65
66    /// Create with full details including version and commit
67    pub fn full(
68        namespace: impl Into<String>,
69        name: impl Into<String>,
70        version: Option<String>,
71        branch: Option<String>,
72        commit: Option<String>,
73    ) -> Self {
74        Self { namespace: namespace.into(), name: name.into(), version, branch, commit }
75    }
76
77    /// Get fully qualified repository name (namespace/name)
78    pub fn qualified_name(&self) -> String {
79        if self.namespace.is_empty() {
80            self.name.clone()
81        } else {
82            format!("{}/{}", self.namespace, self.name)
83        }
84    }
85
86    /// Check if this identifier represents the same repository (ignores version/commit)
87    pub fn same_repo(&self, other: &Self) -> bool {
88        self.namespace == other.namespace && self.name == other.name
89    }
90}
91
/// A single embedding chunk with stable, content-addressable ID
///
/// Each chunk represents a semantic unit of code (function, class, etc.) with
/// a deterministic ID derived from its normalized content. This enables:
/// - Cross-repository deduplication (same code = same ID)
/// - Incremental updates (compare IDs to detect changes)
/// - Stable references for vector databases
///
/// Serialization note: `part` is left out of the output when it is `None`;
/// all other fields are always written.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EmbedChunk {
    /// Content-addressable ID: BLAKE3 hash of normalized content
    /// Format: "ec_" + 32 hex chars (128 bits) - collision-resistant for enterprise scale
    pub id: String,

    /// Full 256-bit hash for collision verification
    pub full_hash: String,

    /// The actual code content (normalized)
    pub content: String,

    /// Token count for the target model
    pub tokens: u32,

    /// Symbol kind
    pub kind: ChunkKind,

    /// Source location metadata
    pub source: ChunkSource,

    /// Enriched context for better retrieval
    pub context: ChunkContext,

    /// For split chunks: part N of M
    /// `None` for chunks that were not split; omitted from serialized output in that case.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub part: Option<ChunkPart>,
}
127
/// Source location metadata for a chunk
///
/// This metadata helps identify where the chunk originated, but importantly
/// does NOT affect the chunk ID (which is based solely on content).
///
/// Serialization note: `repo` is skipped when `is_default_repo` returns true
/// and falls back to `RepoIdentifier::default()` when absent on input; `fqn`
/// and `parent` are skipped when `None`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkSource {
    /// Repository identifier for multi-tenant RAG
    /// Essential for distinguishing chunks from different codebases
    #[serde(default, skip_serializing_if = "is_default_repo")]
    pub repo: RepoIdentifier,

    /// Relative file path (from repo root, never absolute)
    pub file: String,

    /// Line range (1-indexed, inclusive)
    pub lines: (u32, u32),

    /// Symbol name
    pub symbol: String,

    /// Fully qualified name
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub fqn: Option<String>,

    /// Programming language
    pub language: String,

    /// Parent symbol (for methods inside classes)
    /// Not serialized when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub parent: Option<String>,

    /// Visibility modifier
    pub visibility: Visibility,

    /// Whether this is test code
    pub is_test: bool,
}
165
166/// Helper for skip_serializing_if - skip if repo is default (empty)
167fn is_default_repo(repo: &RepoIdentifier) -> bool {
168    repo.namespace.is_empty() && repo.name.is_empty()
169}
170
/// Context information extracted from the chunk for better retrieval
///
/// This metadata improves RAG recall by providing natural language descriptions,
/// signatures for type matching, and relationship information.
///
/// Every field is optional on the wire: `None` options, empty vectors and
/// zero-valued metrics are skipped when serializing, and the vector and metric
/// fields fall back to their defaults when missing on input.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkContext {
    /// Extracted docstring (for natural language retrieval)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub docstring: Option<String>,

    /// Extracted comments within the chunk
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub comments: Vec<String>,

    /// Function/class signature (always included, even in split parts)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub signature: Option<String>,

    /// Symbols this chunk calls
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub calls: Vec<String>,

    /// Symbols that call this chunk
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub called_by: Vec<String>,

    /// Import dependencies
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub imports: Vec<String>,

    /// Auto-generated semantic tags
    #[serde(skip_serializing_if = "Vec::is_empty", default)]
    pub tags: Vec<String>,

    // === Complexity Metrics ===
    // These enable filtering by code complexity in RAG applications
    /// Lines of code in this chunk (excluding blank lines and comments)
    /// Useful for filtering out trivial one-liners vs substantial implementations
    /// A value of 0 is not serialized, so 0 and "absent" are indistinguishable on the wire.
    #[serde(skip_serializing_if = "is_zero", default)]
    pub lines_of_code: u32,

    /// Maximum nesting depth (control flow, blocks)
    /// Higher values indicate more complex logic; useful for prioritizing review
    /// A value of 0 is not serialized, so 0 and "absent" are indistinguishable on the wire.
    #[serde(skip_serializing_if = "is_zero", default)]
    pub max_nesting_depth: u32,
}
217
/// Predicate for serde's `skip_serializing_if`: true when the counter is zero.
///
/// Takes a reference because serde passes the field by reference.
fn is_zero(n: &u32) -> bool {
    matches!(*n, 0)
}
222
/// Serde default provider for `hierarchy_min_children`.
///
/// Used when the field is missing from deserialized input.
fn default_hierarchy_min_children() -> usize {
    const MIN_CHILDREN: usize = 2;
    MIN_CHILDREN
}
227
/// Kind of code symbol represented by a chunk
///
/// Serialized in `snake_case` (e.g. `TopLevel` becomes `"top_level"`), which
/// matches the strings returned by `ChunkKind::name`. `Function` is the
/// `Default` variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum ChunkKind {
    /// Free function; also the target for parser macros (see the `From<SymbolKind>` impl).
    #[default]
    Function,
    /// Method.
    Method,
    /// Class.
    Class,
    /// Struct; parser type aliases are mapped here as well.
    Struct,
    /// Enum.
    Enum,
    /// Interface.
    Interface,
    /// Trait.
    Trait,
    /// Module.
    Module,
    /// Constant.
    Constant,
    /// Variable.
    Variable,
    /// Import statements; parser exports are mapped here as well.
    Imports,
    /// Top-level code — presumably code outside any symbol, as controlled by
    /// `EmbedSettings::include_top_level` (confirm against the chunker).
    TopLevel,
    /// One part of a function that was split; `is_part` returns true.
    FunctionPart,
    /// One part of a class that was split; `is_part` returns true.
    ClassPart,
}
248
249impl ChunkKind {
250    /// Get human-readable name for the chunk kind
251    pub fn name(&self) -> &'static str {
252        match self {
253            Self::Function => "function",
254            Self::Method => "method",
255            Self::Class => "class",
256            Self::Struct => "struct",
257            Self::Enum => "enum",
258            Self::Interface => "interface",
259            Self::Trait => "trait",
260            Self::Module => "module",
261            Self::Constant => "constant",
262            Self::Variable => "variable",
263            Self::Imports => "imports",
264            Self::TopLevel => "top_level",
265            Self::FunctionPart => "function_part",
266            Self::ClassPart => "class_part",
267        }
268    }
269
270    /// Check if this is a partial chunk (split from a larger symbol)
271    pub fn is_part(&self) -> bool {
272        matches!(self, Self::FunctionPart | Self::ClassPart)
273    }
274}
275
/// Visibility modifier for symbols
///
/// Serialized in `snake_case` (`"public"`, `"private"`, `"protected"`,
/// `"internal"`), matching `Visibility::name`. `Public` is the `Default`
/// variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[serde(rename_all = "snake_case")]
pub enum Visibility {
    /// Public visibility (the default).
    #[default]
    Public,
    /// Private visibility.
    Private,
    /// Protected visibility.
    Protected,
    /// Internal visibility.
    Internal,
}
286
287impl Visibility {
288    /// Get the visibility name
289    pub fn name(&self) -> &'static str {
290        match self {
291            Self::Public => "public",
292            Self::Private => "private",
293            Self::Protected => "protected",
294            Self::Internal => "internal",
295        }
296    }
297}
298
/// Information about a chunk that was split from a larger symbol
///
/// Stored in `EmbedChunk::part`; carries the position of this fragment and
/// enough parent information to tie the fragments back together.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkPart {
    /// Part number (1-indexed)
    pub part: u32,

    /// Total number of parts
    pub of: u32,

    /// ID of the logical parent (full symbol hash)
    pub parent_id: String,

    /// Signature repeated for context
    pub parent_signature: String,

    /// Number of overlapping lines from the previous chunk (for context continuity)
    /// This is 0 for the first part, and > 0 for subsequent parts when overlap is enabled.
    /// A value of 0 is not serialized and is restored as 0 when missing on input.
    #[serde(skip_serializing_if = "is_zero", default)]
    pub overlap_lines: u32,
}
319
/// Settings that control chunk generation
///
/// These settings affect the output of chunk generation. Changing settings
/// will result in different chunk IDs, so the manifest tracks settings
/// to detect when a full rebuild is needed.
///
/// Fields marked `#[serde(default)]` may be missing from deserialized input
/// and then take their type default (or the named default function); all
/// other fields are required. Every field is always serialized.
///
/// Use `EmbedSettings::validate` to check a value before use.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct EmbedSettings {
    /// Maximum tokens per chunk (default: 1000 for code models)
    /// `validate` rejects values above `EmbedSettings::MAX_TOKENS_LIMIT`.
    pub max_tokens: u32,

    /// Minimum tokens per chunk (smaller merged, default: 50)
    /// `validate` rejects values greater than `max_tokens`.
    pub min_tokens: u32,

    /// Overlap tokens between sequential chunks (default: 100)
    pub overlap_tokens: u32,

    /// Lines of context around symbols (default: 5)
    pub context_lines: u32,

    /// Include import statements as separate chunks
    pub include_imports: bool,

    /// Include top-level code outside symbols
    pub include_top_level: bool,

    /// Token counting model
    pub token_model: String,

    /// Version of chunking algorithm (for compatibility)
    /// `validate` rejects versions newer than `EmbedSettings::CURRENT_ALGORITHM_VERSION`.
    pub algorithm_version: u32,

    /// Enable secret scanning
    pub scan_secrets: bool,

    /// Fail if secrets detected (CI mode)
    pub fail_on_secrets: bool,

    /// Redact detected secrets
    pub redact_secrets: bool,

    /// Include glob patterns (e.g., ["*.rs", "src/**"])
    /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
    #[serde(default)]
    pub include_patterns: Vec<String>,

    /// Exclude glob patterns (e.g., ["tests/*", "*.test.*"])
    /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
    #[serde(default)]
    pub exclude_patterns: Vec<String>,

    /// Include test files (default: false)
    #[serde(default)]
    pub include_tests: bool,

    /// Enable hierarchical chunking for improved RAG recall
    ///
    /// When enabled, generates summary chunks for container types (classes, structs)
    /// that list their children with signatures and brief descriptions. This enables
    /// RAG systems to retrieve both high-level overviews and specific implementations.
    ///
    /// Recommended for object-oriented codebases (Java, Python, TypeScript).
    #[serde(default)]
    pub enable_hierarchy: bool,

    /// Minimum number of children required to generate a summary chunk
    /// (default: 2, only relevant when enable_hierarchy is true)
    /// Missing input values are filled by `default_hierarchy_min_children`.
    #[serde(default = "default_hierarchy_min_children")]
    pub hierarchy_min_children: usize,
}
389
390impl Default for EmbedSettings {
391    fn default() -> Self {
392        Self {
393            max_tokens: 1000,        // Optimized for code embedding models
394            min_tokens: 50,          // Minimum meaningful chunk size
395            overlap_tokens: 100,     // Context continuity between chunks
396            context_lines: 5,        // Capture docstrings above functions
397            include_imports: true,   // Track dependencies
398            include_top_level: true, // Include module-level code
399            token_model: "claude".to_owned(),
400            algorithm_version: 1,
401            scan_secrets: true, // Safe default
402            fail_on_secrets: false,
403            redact_secrets: true, // Safe default
404            include_patterns: Vec::new(),
405            exclude_patterns: Vec::new(),
406            include_tests: false,
407            enable_hierarchy: false, // Off by default for backward compatibility
408            hierarchy_min_children: 2, // Minimum children for summary generation
409        }
410    }
411}
412
413impl EmbedSettings {
414    /// Current algorithm version
415    pub const CURRENT_ALGORITHM_VERSION: u32 = 1;
416
417    /// Maximum tokens limit (DoS protection)
418    pub const MAX_TOKENS_LIMIT: u32 = 100_000;
419
420    /// Get recommended settings for specific embedding model
421    ///
422    /// Different embedding models have different optimal chunk sizes:
423    /// - voyage-code-2/3: 1500 tokens (large context window)
424    /// - cohere-embed-v3: 400 tokens (smaller model)
425    /// - openai-text-embedding-3: 800 tokens (balanced)
426    /// - sentence-transformers: 384 tokens (BERT-based)
427    pub fn for_embedding_model(model: &str) -> Self {
428        let mut settings = Self::default();
429        settings.max_tokens = match model.to_lowercase().as_str() {
430            "voyage-code-2" | "voyage-code-3" => 1500,
431            "cohere-embed-v3" | "cohere" => 400,
432            "openai-text-embedding-3-small" | "openai-text-embedding-3-large" | "openai" => 800,
433            "sentence-transformers" | "all-minilm" | "minilm" => 384,
434            _ => 1000, // Default for most code models
435        };
436        settings
437    }
438
439    /// Validate settings, return error if invalid
440    pub fn validate(&self) -> Result<(), EmbedError> {
441        if self.max_tokens > Self::MAX_TOKENS_LIMIT {
442            return Err(EmbedError::InvalidSettings {
443                field: "max_tokens".to_owned(),
444                reason: format!("exceeds limit of {}", Self::MAX_TOKENS_LIMIT),
445            });
446        }
447        if self.min_tokens > self.max_tokens {
448            return Err(EmbedError::InvalidSettings {
449                field: "min_tokens".to_owned(),
450                reason: "cannot exceed max_tokens".to_owned(),
451            });
452        }
453        if self.algorithm_version > Self::CURRENT_ALGORITHM_VERSION {
454            return Err(EmbedError::UnsupportedAlgorithmVersion {
455                found: self.algorithm_version,
456                max_supported: Self::CURRENT_ALGORITHM_VERSION,
457            });
458        }
459        Ok(())
460    }
461
462    /// Create settings optimized for CI/CD pipelines
463    ///
464    /// These settings fail on secrets and use stricter validation.
465    pub fn for_ci() -> Self {
466        Self {
467            fail_on_secrets: true,
468            scan_secrets: true,
469            redact_secrets: false, // Fail instead of redact
470            ..Self::default()
471        }
472    }
473}
474
475/// Convert from the parser's SymbolKind to our ChunkKind
476impl From<crate::types::SymbolKind> for ChunkKind {
477    fn from(kind: crate::types::SymbolKind) -> Self {
478        match kind {
479            crate::types::SymbolKind::Function => ChunkKind::Function,
480            crate::types::SymbolKind::Method => ChunkKind::Method,
481            crate::types::SymbolKind::Class => ChunkKind::Class,
482            crate::types::SymbolKind::Struct => ChunkKind::Struct,
483            crate::types::SymbolKind::Enum => ChunkKind::Enum,
484            crate::types::SymbolKind::Interface => ChunkKind::Interface,
485            crate::types::SymbolKind::Trait => ChunkKind::Trait,
486            crate::types::SymbolKind::Import => ChunkKind::Imports,
487            crate::types::SymbolKind::Constant => ChunkKind::Constant,
488            crate::types::SymbolKind::Variable => ChunkKind::Variable,
489            crate::types::SymbolKind::TypeAlias => ChunkKind::Struct, // Map type aliases to struct
490            crate::types::SymbolKind::Export => ChunkKind::Imports,   // Map exports to imports
491            crate::types::SymbolKind::Module => ChunkKind::Module,
492            crate::types::SymbolKind::Macro => ChunkKind::Function, // Map macros to functions
493        }
494    }
495}
496
497/// Convert from the parser's Visibility to our Visibility
498impl From<crate::types::Visibility> for Visibility {
499    fn from(vis: crate::types::Visibility) -> Self {
500        match vis {
501            crate::types::Visibility::Public => Visibility::Public,
502            crate::types::Visibility::Private => Visibility::Private,
503            crate::types::Visibility::Protected => Visibility::Protected,
504            crate::types::Visibility::Internal => Visibility::Internal,
505        }
506    }
507}
508
#[cfg(test)]
mod tests {
    use super::*;

    /// The documented defaults for the token budget and secret scanning.
    #[test]
    fn test_default_settings() {
        let EmbedSettings { max_tokens, min_tokens, overlap_tokens, scan_secrets, .. } =
            EmbedSettings::default();
        assert_eq!(max_tokens, 1000);
        assert_eq!(min_tokens, 50);
        assert_eq!(overlap_tokens, 100);
        assert!(scan_secrets);
    }

    /// Defaults validate; an oversized budget and an inverted min/max do not.
    #[test]
    fn test_validate_settings() {
        assert!(EmbedSettings::default().validate().is_ok());

        // Invalid: max_tokens too large
        let oversized = EmbedSettings { max_tokens: 200_000, ..EmbedSettings::default() };
        assert!(oversized.validate().is_err());

        // Invalid: min > max
        let inverted =
            EmbedSettings { max_tokens: 100, min_tokens: 200, ..EmbedSettings::default() };
        assert!(inverted.validate().is_err());
    }

    /// Known model names pick their preset; unknown names fall back to 1000.
    #[test]
    fn test_for_embedding_model() {
        let presets = [("voyage-code-2", 1500), ("cohere", 400), ("unknown-model", 1000)];
        for (model, expected_max) in presets {
            assert_eq!(EmbedSettings::for_embedding_model(model).max_tokens, expected_max);
        }
    }

    #[test]
    fn test_chunk_kind_name() {
        let cases = [(ChunkKind::Function, "function"), (ChunkKind::FunctionPart, "function_part")];
        for (kind, label) in cases {
            assert_eq!(kind.name(), label);
        }
    }

    #[test]
    fn test_chunk_kind_is_part() {
        for part in [ChunkKind::FunctionPart, ChunkKind::ClassPart] {
            assert!(part.is_part());
        }
        assert!(!ChunkKind::Function.is_part());
    }

    #[test]
    fn test_visibility_name() {
        let cases = [(Visibility::Public, "public"), (Visibility::Private, "private")];
        for (visibility, label) in cases {
            assert_eq!(visibility.name(), label);
        }
    }

    /// Settings survive a JSON round trip unchanged.
    #[test]
    fn test_settings_serialization() {
        let original = EmbedSettings::default();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: EmbedSettings = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// CI settings fail on secrets rather than redacting them.
    #[test]
    fn test_ci_settings() {
        let EmbedSettings { fail_on_secrets, scan_secrets, redact_secrets, .. } =
            EmbedSettings::for_ci();
        assert!(fail_on_secrets);
        assert!(scan_secrets);
        assert!(!redact_secrets);
    }
}