infiniloom_engine/embedding/
types.rs

1//! Core types for embedding chunk generation
2//!
3//! This module defines the data structures used throughout the embedding system,
4//! including chunks, settings, and metadata types.
5
6use serde::{Deserialize, Serialize};
7
8use super::error::EmbedError;
9
10/// Repository identifier for multi-tenant RAG systems
11///
12/// This enables embedding multiple codebases into a single vector database
13/// while maintaining clear isolation and traceability. Essential for:
14/// - Multi-repository search with proper attribution
15/// - Access control based on repository ownership
16/// - Cross-repository dependency tracking
17/// - Audit trails for compliance (SOC2, GDPR)
18///
19/// # Example
20///
21/// ```
22/// use infiniloom_engine::embedding::RepoIdentifier;
23///
24/// let repo = RepoIdentifier {
25///     namespace: "github.com/myorg".to_string(),
26///     name: "auth-service".to_string(),
27///     version: Some("v2.1.0".to_string()),
28///     branch: Some("main".to_string()),
29///     commit: Some("abc123def".to_string()),
30/// };
31/// ```
32#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
33pub struct RepoIdentifier {
34    /// Namespace/organization (e.g., "github.com/myorg", "gitlab.com/team")
35    /// Used for grouping and access control
36    pub namespace: String,
37
38    /// Repository name (e.g., "auth-service", "frontend")
39    pub name: String,
40
41    /// Semantic version or tag (e.g., "v2.1.0", "release-2024.01")
42    #[serde(skip_serializing_if = "Option::is_none")]
43    pub version: Option<String>,
44
45    /// Branch name (e.g., "main", "feature/new-auth")
46    #[serde(skip_serializing_if = "Option::is_none")]
47    pub branch: Option<String>,
48
49    /// Git commit hash (short or full)
50    #[serde(skip_serializing_if = "Option::is_none")]
51    pub commit: Option<String>,
52}
53
54impl RepoIdentifier {
55    /// Create a new repository identifier
56    pub fn new(namespace: impl Into<String>, name: impl Into<String>) -> Self {
57        Self {
58            namespace: namespace.into(),
59            name: name.into(),
60            version: None,
61            branch: None,
62            commit: None,
63        }
64    }
65
66    /// Create with full details including version and commit
67    pub fn full(
68        namespace: impl Into<String>,
69        name: impl Into<String>,
70        version: Option<String>,
71        branch: Option<String>,
72        commit: Option<String>,
73    ) -> Self {
74        Self {
75            namespace: namespace.into(),
76            name: name.into(),
77            version,
78            branch,
79            commit,
80        }
81    }
82
83    /// Get fully qualified repository name (namespace/name)
84    pub fn qualified_name(&self) -> String {
85        if self.namespace.is_empty() {
86            self.name.clone()
87        } else {
88            format!("{}/{}", self.namespace, self.name)
89        }
90    }
91
92    /// Check if this identifier represents the same repository (ignores version/commit)
93    pub fn same_repo(&self, other: &Self) -> bool {
94        self.namespace == other.namespace && self.name == other.name
95    }
96}
97
98/// A single embedding chunk with stable, content-addressable ID
99///
100/// Each chunk represents a semantic unit of code (function, class, etc.) with
101/// a deterministic ID derived from its normalized content. This enables:
102/// - Cross-repository deduplication (same code = same ID)
103/// - Incremental updates (compare IDs to detect changes)
104/// - Stable references for vector databases
105#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
106pub struct EmbedChunk {
107    /// Content-addressable ID: BLAKE3 hash of normalized content
108    /// Format: "ec_" + 32 hex chars (128 bits) - collision-resistant for enterprise scale
109    pub id: String,
110
111    /// Full 256-bit hash for collision verification
112    pub full_hash: String,
113
114    /// The actual code content (normalized)
115    pub content: String,
116
117    /// Token count for the target model
118    pub tokens: u32,
119
120    /// Symbol kind
121    pub kind: ChunkKind,
122
123    /// Source location metadata
124    pub source: ChunkSource,
125
126    /// Enriched context for better retrieval
127    pub context: ChunkContext,
128
129    /// For split chunks: part N of M
130    #[serde(skip_serializing_if = "Option::is_none")]
131    pub part: Option<ChunkPart>,
132}
133
134/// Source location metadata for a chunk
135///
136/// This metadata helps identify where the chunk originated, but importantly
137/// does NOT affect the chunk ID (which is based solely on content).
138#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
139pub struct ChunkSource {
140    /// Repository identifier for multi-tenant RAG
141    /// Essential for distinguishing chunks from different codebases
142    #[serde(default, skip_serializing_if = "is_default_repo")]
143    pub repo: RepoIdentifier,
144
145    /// Relative file path (from repo root, never absolute)
146    pub file: String,
147
148    /// Line range (1-indexed, inclusive)
149    pub lines: (u32, u32),
150
151    /// Symbol name
152    pub symbol: String,
153
154    /// Fully qualified name
155    #[serde(skip_serializing_if = "Option::is_none")]
156    pub fqn: Option<String>,
157
158    /// Programming language
159    pub language: String,
160
161    /// Parent symbol (for methods inside classes)
162    #[serde(skip_serializing_if = "Option::is_none")]
163    pub parent: Option<String>,
164
165    /// Visibility modifier
166    pub visibility: Visibility,
167
168    /// Whether this is test code
169    pub is_test: bool,
170}
171
172/// Helper for skip_serializing_if - skip if repo is default (empty)
173fn is_default_repo(repo: &RepoIdentifier) -> bool {
174    repo.namespace.is_empty() && repo.name.is_empty()
175}
176
177/// Context information extracted from the chunk for better retrieval
178///
179/// This metadata improves RAG recall by providing natural language descriptions,
180/// signatures for type matching, and relationship information.
181#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
182pub struct ChunkContext {
183    /// Extracted docstring (for natural language retrieval)
184    #[serde(skip_serializing_if = "Option::is_none")]
185    pub docstring: Option<String>,
186
187    /// Extracted comments within the chunk
188    #[serde(skip_serializing_if = "Vec::is_empty", default)]
189    pub comments: Vec<String>,
190
191    /// Function/class signature (always included, even in split parts)
192    #[serde(skip_serializing_if = "Option::is_none")]
193    pub signature: Option<String>,
194
195    /// Symbols this chunk calls
196    #[serde(skip_serializing_if = "Vec::is_empty", default)]
197    pub calls: Vec<String>,
198
199    /// Symbols that call this chunk
200    #[serde(skip_serializing_if = "Vec::is_empty", default)]
201    pub called_by: Vec<String>,
202
203    /// Import dependencies
204    #[serde(skip_serializing_if = "Vec::is_empty", default)]
205    pub imports: Vec<String>,
206
207    /// Auto-generated semantic tags
208    #[serde(skip_serializing_if = "Vec::is_empty", default)]
209    pub tags: Vec<String>,
210
211    // === Complexity Metrics ===
212    // These enable filtering by code complexity in RAG applications
213
214    /// Lines of code in this chunk (excluding blank lines and comments)
215    /// Useful for filtering out trivial one-liners vs substantial implementations
216    #[serde(skip_serializing_if = "is_zero", default)]
217    pub lines_of_code: u32,
218
219    /// Maximum nesting depth (control flow, blocks)
220    /// Higher values indicate more complex logic; useful for prioritizing review
221    #[serde(skip_serializing_if = "is_zero", default)]
222    pub max_nesting_depth: u32,
223}
224
/// Predicate for serde's `skip_serializing_if`: `true` when the value is 0.
///
/// Takes a reference because that is the shape serde calls it with.
fn is_zero(n: &u32) -> bool {
    matches!(*n, 0)
}
229
/// Serde default for `hierarchy_min_children`: a container needs at least
/// this many children before a summary chunk is generated.
fn default_hierarchy_min_children() -> usize {
    const MIN_CHILDREN: usize = 2;
    MIN_CHILDREN
}
234
235/// Kind of code symbol represented by a chunk
236#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
237#[serde(rename_all = "snake_case")]
238pub enum ChunkKind {
239    #[default]
240    Function,
241    Method,
242    Class,
243    Struct,
244    Enum,
245    Interface,
246    Trait,
247    Module,
248    Constant,
249    Variable,
250    Imports,
251    TopLevel,
252    FunctionPart,
253    ClassPart,
254}
255
256impl ChunkKind {
257    /// Get human-readable name for the chunk kind
258    pub fn name(&self) -> &'static str {
259        match self {
260            Self::Function => "function",
261            Self::Method => "method",
262            Self::Class => "class",
263            Self::Struct => "struct",
264            Self::Enum => "enum",
265            Self::Interface => "interface",
266            Self::Trait => "trait",
267            Self::Module => "module",
268            Self::Constant => "constant",
269            Self::Variable => "variable",
270            Self::Imports => "imports",
271            Self::TopLevel => "top_level",
272            Self::FunctionPart => "function_part",
273            Self::ClassPart => "class_part",
274        }
275    }
276
277    /// Check if this is a partial chunk (split from a larger symbol)
278    pub fn is_part(&self) -> bool {
279        matches!(self, Self::FunctionPart | Self::ClassPart)
280    }
281}
282
283/// Visibility modifier for symbols
284#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
285#[serde(rename_all = "snake_case")]
286pub enum Visibility {
287    #[default]
288    Public,
289    Private,
290    Protected,
291    Internal,
292}
293
294impl Visibility {
295    /// Get the visibility name
296    pub fn name(&self) -> &'static str {
297        match self {
298            Self::Public => "public",
299            Self::Private => "private",
300            Self::Protected => "protected",
301            Self::Internal => "internal",
302        }
303    }
304}
305
306/// Information about a chunk that was split from a larger symbol
307#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
308pub struct ChunkPart {
309    /// Part number (1-indexed)
310    pub part: u32,
311
312    /// Total number of parts
313    pub of: u32,
314
315    /// ID of the logical parent (full symbol hash)
316    pub parent_id: String,
317
318    /// Signature repeated for context
319    pub parent_signature: String,
320
321    /// Number of overlapping lines from the previous chunk (for context continuity)
322    /// This is 0 for the first part, and > 0 for subsequent parts when overlap is enabled.
323    #[serde(skip_serializing_if = "is_zero", default)]
324    pub overlap_lines: u32,
325}
326
327/// Settings that control chunk generation
328///
329/// These settings affect the output of chunk generation. Changing settings
330/// will result in different chunk IDs, so the manifest tracks settings
331/// to detect when a full rebuild is needed.
332#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
333pub struct EmbedSettings {
334    /// Maximum tokens per chunk (default: 1000 for code models)
335    pub max_tokens: u32,
336
337    /// Minimum tokens per chunk (smaller merged, default: 50)
338    pub min_tokens: u32,
339
340    /// Overlap tokens between sequential chunks (default: 100)
341    pub overlap_tokens: u32,
342
343    /// Lines of context around symbols (default: 5)
344    pub context_lines: u32,
345
346    /// Include import statements as separate chunks
347    pub include_imports: bool,
348
349    /// Include top-level code outside symbols
350    pub include_top_level: bool,
351
352    /// Token counting model
353    pub token_model: String,
354
355    /// Version of chunking algorithm (for compatibility)
356    pub algorithm_version: u32,
357
358    /// Enable secret scanning
359    pub scan_secrets: bool,
360
361    /// Fail if secrets detected (CI mode)
362    pub fail_on_secrets: bool,
363
364    /// Redact detected secrets
365    pub redact_secrets: bool,
366
367    /// Include glob patterns (e.g., ["*.rs", "src/**"])
368    /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
369    #[serde(default)]
370    pub include_patterns: Vec<String>,
371
372    /// Exclude glob patterns (e.g., ["tests/*", "*.test.*"])
373    /// Note: skip_serializing_if removed for bincode compatibility (requires all fields)
374    #[serde(default)]
375    pub exclude_patterns: Vec<String>,
376
377    /// Include test files (default: false)
378    #[serde(default)]
379    pub include_tests: bool,
380
381    /// Enable hierarchical chunking for improved RAG recall
382    ///
383    /// When enabled, generates summary chunks for container types (classes, structs)
384    /// that list their children with signatures and brief descriptions. This enables
385    /// RAG systems to retrieve both high-level overviews and specific implementations.
386    ///
387    /// Recommended for object-oriented codebases (Java, Python, TypeScript).
388    #[serde(default)]
389    pub enable_hierarchy: bool,
390
391    /// Minimum number of children required to generate a summary chunk
392    /// (default: 2, only relevant when enable_hierarchy is true)
393    #[serde(default = "default_hierarchy_min_children")]
394    pub hierarchy_min_children: usize,
395}
396
397impl Default for EmbedSettings {
398    fn default() -> Self {
399        Self {
400            max_tokens: 1000,        // Optimized for code embedding models
401            min_tokens: 50,          // Minimum meaningful chunk size
402            overlap_tokens: 100,     // Context continuity between chunks
403            context_lines: 5,        // Capture docstrings above functions
404            include_imports: true,   // Track dependencies
405            include_top_level: true, // Include module-level code
406            token_model: "claude".to_string(),
407            algorithm_version: 1,
408            scan_secrets: true,  // Safe default
409            fail_on_secrets: false,
410            redact_secrets: true, // Safe default
411            include_patterns: Vec::new(),
412            exclude_patterns: Vec::new(),
413            include_tests: false,
414            enable_hierarchy: false, // Off by default for backward compatibility
415            hierarchy_min_children: 2, // Minimum children for summary generation
416        }
417    }
418}
419
420impl EmbedSettings {
421    /// Current algorithm version
422    pub const CURRENT_ALGORITHM_VERSION: u32 = 1;
423
424    /// Maximum tokens limit (DoS protection)
425    pub const MAX_TOKENS_LIMIT: u32 = 100_000;
426
427    /// Get recommended settings for specific embedding model
428    ///
429    /// Different embedding models have different optimal chunk sizes:
430    /// - voyage-code-2/3: 1500 tokens (large context window)
431    /// - cohere-embed-v3: 400 tokens (smaller model)
432    /// - openai-text-embedding-3: 800 tokens (balanced)
433    /// - sentence-transformers: 384 tokens (BERT-based)
434    pub fn for_embedding_model(model: &str) -> Self {
435        let mut settings = Self::default();
436        settings.max_tokens = match model.to_lowercase().as_str() {
437            "voyage-code-2" | "voyage-code-3" => 1500,
438            "cohere-embed-v3" | "cohere" => 400,
439            "openai-text-embedding-3-small" | "openai-text-embedding-3-large" | "openai" => 800,
440            "sentence-transformers" | "all-minilm" | "minilm" => 384,
441            _ => 1000, // Default for most code models
442        };
443        settings
444    }
445
446    /// Validate settings, return error if invalid
447    pub fn validate(&self) -> Result<(), EmbedError> {
448        if self.max_tokens > Self::MAX_TOKENS_LIMIT {
449            return Err(EmbedError::InvalidSettings {
450                field: "max_tokens".to_string(),
451                reason: format!("exceeds limit of {}", Self::MAX_TOKENS_LIMIT),
452            });
453        }
454        if self.min_tokens > self.max_tokens {
455            return Err(EmbedError::InvalidSettings {
456                field: "min_tokens".to_string(),
457                reason: "cannot exceed max_tokens".to_string(),
458            });
459        }
460        if self.algorithm_version > Self::CURRENT_ALGORITHM_VERSION {
461            return Err(EmbedError::UnsupportedAlgorithmVersion {
462                found: self.algorithm_version,
463                max_supported: Self::CURRENT_ALGORITHM_VERSION,
464            });
465        }
466        Ok(())
467    }
468
469    /// Create settings optimized for CI/CD pipelines
470    ///
471    /// These settings fail on secrets and use stricter validation.
472    pub fn for_ci() -> Self {
473        Self {
474            fail_on_secrets: true,
475            scan_secrets: true,
476            redact_secrets: false, // Fail instead of redact
477            ..Self::default()
478        }
479    }
480}
481
482/// Convert from the parser's SymbolKind to our ChunkKind
483impl From<crate::types::SymbolKind> for ChunkKind {
484    fn from(kind: crate::types::SymbolKind) -> Self {
485        match kind {
486            crate::types::SymbolKind::Function => ChunkKind::Function,
487            crate::types::SymbolKind::Method => ChunkKind::Method,
488            crate::types::SymbolKind::Class => ChunkKind::Class,
489            crate::types::SymbolKind::Struct => ChunkKind::Struct,
490            crate::types::SymbolKind::Enum => ChunkKind::Enum,
491            crate::types::SymbolKind::Interface => ChunkKind::Interface,
492            crate::types::SymbolKind::Trait => ChunkKind::Trait,
493            crate::types::SymbolKind::Import => ChunkKind::Imports,
494            crate::types::SymbolKind::Constant => ChunkKind::Constant,
495            crate::types::SymbolKind::Variable => ChunkKind::Variable,
496            crate::types::SymbolKind::TypeAlias => ChunkKind::Struct, // Map type aliases to struct
497            crate::types::SymbolKind::Export => ChunkKind::Imports, // Map exports to imports
498            crate::types::SymbolKind::Module => ChunkKind::Module,
499            crate::types::SymbolKind::Macro => ChunkKind::Function, // Map macros to functions
500        }
501    }
502}
503
504/// Convert from the parser's Visibility to our Visibility
505impl From<crate::types::Visibility> for Visibility {
506    fn from(vis: crate::types::Visibility) -> Self {
507        match vis {
508            crate::types::Visibility::Public => Visibility::Public,
509            crate::types::Visibility::Private => Visibility::Private,
510            crate::types::Visibility::Protected => Visibility::Protected,
511            crate::types::Visibility::Internal => Visibility::Internal,
512        }
513    }
514}
515
#[cfg(test)]
mod tests {
    use super::*;

    /// The default token budget and secret scanning flag must stay stable.
    #[test]
    fn test_default_settings() {
        let EmbedSettings {
            max_tokens,
            min_tokens,
            overlap_tokens,
            scan_secrets,
            ..
        } = EmbedSettings::default();

        assert_eq!(max_tokens, 1000);
        assert_eq!(min_tokens, 50);
        assert_eq!(overlap_tokens, 100);
        assert!(scan_secrets);
    }

    /// Defaults validate; an oversized or inverted token range does not.
    #[test]
    fn test_validate_settings() {
        let valid = EmbedSettings::default();
        assert!(valid.validate().is_ok());

        // Invalid: max_tokens above the hard limit.
        let oversized = EmbedSettings {
            max_tokens: 200_000,
            ..EmbedSettings::default()
        };
        assert!(oversized.validate().is_err());

        // Invalid: min_tokens larger than max_tokens.
        let inverted = EmbedSettings {
            max_tokens: 100,
            min_tokens: 200,
            ..EmbedSettings::default()
        };
        assert!(inverted.validate().is_err());
    }

    /// Known models get their preset; unknown ones fall back to 1000.
    #[test]
    fn test_for_embedding_model() {
        let cases = [
            ("voyage-code-2", 1500),
            ("cohere", 400),
            ("unknown-model", 1000),
        ];
        for &(model, expected) in cases.iter() {
            assert_eq!(EmbedSettings::for_embedding_model(model).max_tokens, expected);
        }
    }

    #[test]
    fn test_chunk_kind_name() {
        let function = ChunkKind::Function;
        let function_part = ChunkKind::FunctionPart;
        assert_eq!(function.name(), "function");
        assert_eq!(function_part.name(), "function_part");
    }

    #[test]
    fn test_chunk_kind_is_part() {
        for kind in [ChunkKind::FunctionPart, ChunkKind::ClassPart].iter() {
            assert!(kind.is_part());
        }
        assert!(!ChunkKind::Function.is_part());
    }

    #[test]
    fn test_visibility_name() {
        let public = Visibility::Public;
        let private = Visibility::Private;
        assert_eq!(public.name(), "public");
        assert_eq!(private.name(), "private");
    }

    /// Settings must survive a JSON round trip unchanged.
    #[test]
    fn test_settings_serialization() {
        let original = EmbedSettings::default();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: EmbedSettings = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// The CI preset scans, fails on secrets and does not redact.
    #[test]
    fn test_ci_settings() {
        let EmbedSettings {
            fail_on_secrets,
            scan_secrets,
            redact_secrets,
            ..
        } = EmbedSettings::for_ci();

        assert!(fail_on_secrets);
        assert!(scan_secrets);
        assert!(!redact_secrets);
    }
}