infiniloom_engine/
semantic.rs

1//! Semantic analysis and compression module
2//!
3//! This module provides semantic code understanding through embeddings,
4//! enabling similarity search and intelligent code compression.
5//!
6//! # Feature: `embeddings`
7//!
8//! When the `embeddings` feature is enabled, this module provides:
9//! - Embedding generation for code content (currently uses character-frequency heuristics)
10//! - Cosine similarity computation between code snippets
11//! - Clustering-based compression that groups similar code chunks
12//!
13//! ## Current Implementation Status
14//!
15//! **Important**: The current embeddings implementation uses a simple character-frequency
16//! based algorithm, NOT neural network embeddings. This is a lightweight placeholder that
17//! provides reasonable results for basic similarity detection without requiring external
18//! model dependencies.
19//!
20//! Future versions may integrate actual transformer-based embeddings via:
21//! - Candle (Rust-native ML framework)
22//! - ONNX Runtime for pre-trained models
23//! - External embedding services (OpenAI, Cohere, etc.)
24//!
25//! ## Without `embeddings` Feature
26//!
27//! Falls back to heuristic-based compression that:
28//! - Splits content at paragraph boundaries
29//! - Keeps every Nth chunk based on budget ratio
30//! - No similarity computation (all operations return 0.0)
31
32#[cfg(feature = "embeddings")]
33use std::collections::HashMap;
34
/// Result type for semantic operations.
///
/// Shorthand for `std::result::Result` with [`SemanticError`] as the error type.
pub type Result<T> = std::result::Result<T, SemanticError>;

/// Errors that can occur during semantic operations.
///
/// The `Display` text of each variant comes from its `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    /// An embedding model could not be loaded; the payload describes the failure.
    ///
    /// NOTE(review): nothing in this module constructs this variant - presumably
    /// reserved for a model-backed implementation; confirm against other callers.
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    /// Embedding generation failed; the payload describes the failure.
    ///
    /// NOTE(review): not constructed anywhere in this module as shown.
    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    /// Clustering failed; the payload describes the failure.
    ///
    /// Produced by the embeddings-based clustering step when a chunk reaches it
    /// without an embedding ("Missing embedding").
    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    /// The `embeddings` feature is required but was not compiled in.
    ///
    /// NOTE(review): the feature-disabled stubs in this module return zero
    /// values instead of this error, so it is not constructed here.
    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}
53
54// ============================================================================
55// Semantic Analyzer (for similarity and embeddings)
56// ============================================================================
57
58/// Semantic analyzer using code embeddings
59///
60/// When the `embeddings` feature is enabled, uses the configured model path
61/// for neural network-based embeddings. Without the feature, provides
62/// heuristic-based similarity estimates.
63#[derive(Debug)]
64pub struct SemanticAnalyzer {
65    /// Path to the embedding model (used when embeddings feature is enabled)
66    #[cfg(feature = "embeddings")]
67    model_path: Option<String>,
68    /// Placeholder for non-embeddings build (maintains API compatibility)
69    #[cfg(not(feature = "embeddings"))]
70    _model_path: Option<String>,
71}
72
73impl SemanticAnalyzer {
74    /// Create a new semantic analyzer
75    pub fn new() -> Self {
76        Self {
77            #[cfg(feature = "embeddings")]
78            model_path: None,
79            #[cfg(not(feature = "embeddings"))]
80            _model_path: None,
81        }
82    }
83
84    /// Create a semantic analyzer with a custom model path
85    ///
86    /// The model path is used when the `embeddings` feature is enabled.
87    /// Without the feature, the path is stored but not used.
88    pub fn with_model(model_path: &str) -> Self {
89        Self {
90            #[cfg(feature = "embeddings")]
91            model_path: Some(model_path.to_owned()),
92            #[cfg(not(feature = "embeddings"))]
93            _model_path: Some(model_path.to_owned()),
94        }
95    }
96
97    /// Get the configured model path (if any)
98    #[cfg(feature = "embeddings")]
99    pub fn model_path(&self) -> Option<&str> {
100        self.model_path.as_deref()
101    }
102
103    /// Generate embeddings for code content
104    ///
105    /// # Current Implementation
106    ///
107    /// Uses a character-frequency based embedding algorithm that:
108    /// 1. Creates a 384-dimensional vector (matching common transformer output size)
109    /// 2. Accumulates weighted character frequencies based on position
110    /// 3. Normalizes to unit length for cosine similarity
111    ///
112    /// This is a **lightweight placeholder** that provides reasonable similarity
113    /// estimates for code without requiring ML model dependencies. It captures:
114    /// - Character distribution patterns
115    /// - Position-weighted frequency (earlier chars weighted more)
116    /// - Basic structural patterns through punctuation distribution
117    ///
118    /// For production use cases requiring high accuracy, consider integrating
119    /// actual transformer embeddings.
120    #[cfg(feature = "embeddings")]
121    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122        // Character-frequency based embedding (see doc comment for rationale)
123        let mut embedding = vec![0.0f32; 384];
124        for (i, c) in content.chars().enumerate() {
125            let idx = (c as usize) % 384;
126            // Position-weighted contribution: earlier characters contribute more
127            embedding[idx] += 1.0 / ((i + 1) as f32);
128        }
129        // L2 normalize for cosine similarity
130        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131        if norm > 0.0 {
132            for x in &mut embedding {
133                *x /= norm;
134            }
135        }
136        Ok(embedding)
137    }
138
139    /// Generate embeddings (stub when feature disabled)
140    #[cfg(not(feature = "embeddings"))]
141    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142        Ok(vec![0.0; 384])
143    }
144
145    /// Calculate similarity between two code snippets
146    #[cfg(feature = "embeddings")]
147    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148        let emb_a = self.embed(a)?;
149        let emb_b = self.embed(b)?;
150        Ok(cosine_similarity(&emb_a, &emb_b))
151    }
152
153    /// Calculate similarity (stub when feature disabled)
154    #[cfg(not(feature = "embeddings"))]
155    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156        Ok(0.0)
157    }
158}
159
160impl Default for SemanticAnalyzer {
161    fn default() -> Self {
162        Self::new()
163    }
164}
165
166// ============================================================================
167// Semantic Compressor (for reducing content while preserving meaning)
168// ============================================================================
169
/// Tunable parameters for chunking and chunk selection
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Minimum cosine similarity (0.0 - 1.0) for a chunk to join an existing cluster
    pub similarity_threshold: f32,
    /// Minimum chunk size, measured in bytes (`str::len`), so multi-byte
    /// characters count for more than one
    pub min_chunk_size: usize,
    /// Maximum chunk size, measured in bytes (`str::len`)
    pub max_chunk_size: usize,
    /// Budget ratio (0.0 - 1.0): the fraction of chunks the heuristic
    /// compressor keeps
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    /// Defaults: threshold 0.7, chunks of 100..=2000 bytes, keep half the chunks
    fn default() -> Self {
        SemanticConfig {
            budget_ratio: 0.5,
            max_chunk_size: 2000,
            min_chunk_size: 100,
            similarity_threshold: 0.7,
        }
    }
}
193
/// A chunk of code: one contiguous slice of the content being compressed
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Owned copy of the chunk text, i.e. `original[start..end]`
    pub content: String,
    /// Byte offset in the original content where the chunk starts (inclusive)
    pub start: usize,
    /// Byte offset in the original content where the chunk ends (exclusive)
    pub end: usize,
    /// Embedding vector; `None` when the chunk is created and filled in by the
    /// embeddings-based compression path
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment
    ///
    /// NOTE(review): every chunk is created with `None` here and no code visible
    /// in this module ever assigns a value - confirm whether other callers do.
    pub cluster_id: Option<usize>,
}
208
209/// Semantic compressor for code content
210///
211/// Uses embeddings-based clustering when the `embeddings` feature is enabled,
212/// otherwise falls back to heuristic-based compression.
213pub struct SemanticCompressor {
214    config: SemanticConfig,
215    /// Semantic analyzer for generating embeddings and computing similarity
216    analyzer: SemanticAnalyzer,
217}
218
219impl SemanticCompressor {
220    /// Create a new semantic compressor with default config
221    pub fn new() -> Self {
222        Self::with_config(SemanticConfig::default())
223    }
224
225    /// Create a new semantic compressor with custom config
226    pub fn with_config(config: SemanticConfig) -> Self {
227        Self { config, analyzer: SemanticAnalyzer::new() }
228    }
229
230    /// Get a reference to the internal semantic analyzer
231    ///
232    /// This allows access to the analyzer for similarity computations
233    /// or custom embedding operations.
234    pub fn analyzer(&self) -> &SemanticAnalyzer {
235        &self.analyzer
236    }
237
238    /// Compress content semantically
239    ///
240    /// When the `embeddings` feature is enabled, uses neural embeddings
241    /// to cluster similar code chunks and select representatives.
242    ///
243    /// Without the feature, falls back to heuristic-based compression.
244    pub fn compress(&self, content: &str) -> Result<String> {
245        #[cfg(feature = "embeddings")]
246        {
247            return self.compress_with_embeddings(content);
248        }
249
250        #[cfg(not(feature = "embeddings"))]
251        {
252            self.compress_heuristic(content)
253        }
254    }
255
256    /// Split content into semantic chunks (Bug #6 fix - handles content without \n\n)
257    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
258        let mut chunks = Vec::new();
259        let mut current_start = 0;
260
261        // First try: Split on double newlines (paragraph-like boundaries)
262        for (i, _) in content.match_indices("\n\n") {
263            if i > current_start && i - current_start >= self.config.min_chunk_size {
264                let chunk_content = &content[current_start..i];
265                if chunk_content.len() <= self.config.max_chunk_size {
266                    chunks.push(CodeChunk {
267                        content: chunk_content.to_owned(),
268                        start: current_start,
269                        end: i,
270                        embedding: None,
271                        cluster_id: None,
272                    });
273                }
274                current_start = i + 2;
275            }
276        }
277
278        // Handle remaining content
279        if current_start < content.len() {
280            let remaining = &content[current_start..];
281            if remaining.len() >= self.config.min_chunk_size {
282                chunks.push(CodeChunk {
283                    content: remaining.to_owned(),
284                    start: current_start,
285                    end: content.len(),
286                    embedding: None,
287                    cluster_id: None,
288                });
289            }
290        }
291
292        // Fallback: If no chunks found (no \n\n separators), try single newlines
293        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
294            current_start = 0;
295            for (i, _) in content.match_indices('\n') {
296                if i > current_start && i - current_start >= self.config.min_chunk_size {
297                    let chunk_content = &content[current_start..i];
298                    if chunk_content.len() <= self.config.max_chunk_size {
299                        chunks.push(CodeChunk {
300                            content: chunk_content.to_owned(),
301                            start: current_start,
302                            end: i,
303                            embedding: None,
304                            cluster_id: None,
305                        });
306                    }
307                    current_start = i + 1;
308                }
309            }
310            // Handle remaining after single newline split
311            if current_start < content.len() {
312                let remaining = &content[current_start..];
313                if remaining.len() >= self.config.min_chunk_size {
314                    chunks.push(CodeChunk {
315                        content: remaining.to_owned(),
316                        start: current_start,
317                        end: content.len(),
318                        embedding: None,
319                        cluster_id: None,
320                    });
321                }
322            }
323        }
324
325        // Second fallback: If still no chunks, split by sentence boundaries (. followed by space)
326        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
327            current_start = 0;
328            for (i, _) in content.match_indices(". ") {
329                if i > current_start && i - current_start >= self.config.min_chunk_size {
330                    let chunk_content = &content[current_start..=i]; // include the period
331                    if chunk_content.len() <= self.config.max_chunk_size {
332                        chunks.push(CodeChunk {
333                            content: chunk_content.to_owned(),
334                            start: current_start,
335                            end: i + 1,
336                            embedding: None,
337                            cluster_id: None,
338                        });
339                    }
340                    current_start = i + 2;
341                }
342            }
343            // Handle remaining
344            if current_start < content.len() {
345                let remaining = &content[current_start..];
346                if remaining.len() >= self.config.min_chunk_size {
347                    chunks.push(CodeChunk {
348                        content: remaining.to_owned(),
349                        start: current_start,
350                        end: content.len(),
351                        embedding: None,
352                        cluster_id: None,
353                    });
354                }
355            }
356        }
357
358        // Final fallback: If content is large but can't be split, force split by max_chunk_size
359        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
360            let mut pos = 0;
361            while pos < content.len() {
362                let end = (pos + self.config.max_chunk_size).min(content.len());
363                chunks.push(CodeChunk {
364                    content: content[pos..end].to_owned(),
365                    start: pos,
366                    end,
367                    embedding: None,
368                    cluster_id: None,
369                });
370                pos = end;
371            }
372        }
373
374        chunks
375    }
376
377    /// Compress using heuristic methods (fallback when embeddings unavailable)
378    fn compress_heuristic(&self, content: &str) -> Result<String> {
379        let chunks = self.split_into_chunks(content);
380
381        if chunks.is_empty() {
382            return Ok(content.to_owned());
383        }
384
385        // Keep every Nth chunk based on budget ratio
386        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
387        let step = chunks.len() / target_chunks.max(1);
388
389        let mut result = String::new();
390        let mut kept = 0;
391
392        for (i, chunk) in chunks.iter().enumerate() {
393            if i % step.max(1) == 0 && kept < target_chunks {
394                if !result.is_empty() {
395                    result.push_str("\n\n");
396                }
397                result.push_str(&chunk.content);
398                kept += 1;
399            }
400        }
401
402        // Add truncation marker if we removed content
403        if kept < chunks.len() {
404            result.push_str(&format!(
405                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
406                chunks.len() - kept,
407                (kept as f32 / chunks.len() as f32) * 100.0
408            ));
409        }
410
411        Ok(result)
412    }
413
414    /// Compress using neural embeddings
415    #[cfg(feature = "embeddings")]
416    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
417        let mut chunks = self.split_into_chunks(content);
418
419        if chunks.is_empty() {
420            return Ok(content.to_owned());
421        }
422
423        // Generate embeddings for each chunk
424        for chunk in &mut chunks {
425            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
426        }
427
428        // Cluster similar chunks
429        let clusters = self.cluster_chunks(&chunks)?;
430
431        // Select representative from each cluster
432        let mut result = String::new();
433        for cluster in clusters.values() {
434            if let Some(representative) = self.select_representative(cluster) {
435                if !result.is_empty() {
436                    result.push_str("\n\n");
437                }
438                result.push_str(&representative.content);
439            }
440        }
441
442        Ok(result)
443    }
444
445    /// Cluster chunks by embedding similarity
446    #[cfg(feature = "embeddings")]
447    fn cluster_chunks<'a>(
448        &self,
449        chunks: &'a [CodeChunk],
450    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
451        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
452        let mut next_cluster = 0;
453
454        for chunk in chunks {
455            let embedding = chunk
456                .embedding
457                .as_ref()
458                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
459
460            // Find existing cluster with similar embedding
461            let mut assigned = false;
462            for (&cluster_id, cluster_chunks) in &clusters {
463                if let Some(first) = cluster_chunks.first() {
464                    if let Some(ref first_emb) = first.embedding {
465                        let similarity = cosine_similarity(embedding, first_emb);
466                        if similarity >= self.config.similarity_threshold {
467                            clusters.get_mut(&cluster_id).unwrap().push(chunk);
468                            assigned = true;
469                            break;
470                        }
471                    }
472                }
473            }
474
475            if !assigned {
476                clusters.insert(next_cluster, vec![chunk]);
477                next_cluster += 1;
478            }
479        }
480
481        Ok(clusters)
482    }
483
484    /// Select the best representative from a cluster
485    #[cfg(feature = "embeddings")]
486    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
487        // Select the longest chunk as representative (most informative)
488        chunks.iter().max_by_key(|c| c.content.len()).copied()
489    }
490}
491
492impl Default for SemanticCompressor {
493    fn default() -> Self {
494        Self::new()
495    }
496}
497
498// ============================================================================
499// Honest Type Aliases
500// ============================================================================
501// The names below more accurately describe the implementation:
502// - "Semantic" implies neural/ML understanding, but we use heuristics
503// - These aliases are provided for clarity and recommended for new code
504
/// Alias for `SemanticAnalyzer` - more honest name reflecting the actual implementation.
///
/// With the `embeddings` feature this analyzer uses character-frequency
/// heuristics for similarity detection, NOT neural network embeddings; without
/// the feature its `embed` and `similarity` methods are stubs that return
/// zeros. Use this alias when you want to be explicit about the implementation
/// approach.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alias for `SemanticCompressor` - more honest name reflecting the actual implementation.
///
/// This compressor uses chunk-based heuristics with optional character-frequency
/// clustering (the `embeddings` feature), NOT neural semantic understanding.
/// Use this alias when you want to be explicit about the implementation approach.
pub type HeuristicCompressor = SemanticCompressor;

/// Alias for `SemanticConfig` - more honest name.
///
/// Being a type alias, it is interchangeable with `SemanticConfig` everywhere.
pub type HeuristicCompressionConfig = SemanticConfig;
521
522// ============================================================================
523// Utility Functions
524// ============================================================================
525
/// Cosine of the angle between vectors `a` and `b`
///
/// The result lies between -1.0 (opposite direction) and 1.0 (same
/// direction), with 0.0 meaning orthogonal. Degenerate inputs - empty slices,
/// slices of different length, or a vector of zero magnitude - yield 0.0
/// instead of an error or a NaN.
///
/// # Note
/// Only the embeddings feature calls this outside of tests, hence the
/// conditional `allow(dead_code)` for builds without that feature.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Euclidean length of a vector
    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();

    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let (norm_a, norm_b) = (magnitude(a), magnitude(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
    dot / (norm_a * norm_b)
}
552
553// ============================================================================
554// Tests
555// ============================================================================
556
#[cfg(test)]
mod tests {
    use super::*;

    /// Compressor with the given chunk-size bounds (in bytes) and default ratios.
    fn compressor(min_chunk_size: usize, max_chunk_size: usize) -> SemanticCompressor {
        SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size,
            max_chunk_size,
            ..Default::default()
        })
    }

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        // Model path is None by default (only observable when embeddings are enabled)
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[cfg(feature = "embeddings")]
    #[test]
    fn test_embedding_is_unit_length() {
        let analyzer = SemanticAnalyzer::new();
        let embedding = analyzer.embed("fn main() {}").unwrap();
        assert_eq!(embedding.len(), 384);
        let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-4);
        // Identical text must be maximally similar to itself.
        let sim = analyzer.similarity("let x = 1;", "let x = 1;").unwrap();
        assert!((sim - 1.0).abs() < 1e-4);
    }

    #[cfg(not(feature = "embeddings"))]
    #[test]
    fn test_stubs_return_zeros() {
        let analyzer = SemanticAnalyzer::new();
        let embedding = analyzer.embed("fn main() {}").unwrap();
        assert_eq!(embedding.len(), 384);
        assert!(embedding.iter().all(|x| *x == 0.0));
        assert_eq!(analyzer.similarity("a", "a").unwrap(), 0.0);
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        // Verify we can access the analyzer through the compressor
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.min_chunk_size, 100);
        assert_eq!(config.max_chunk_size, 2000);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor(10, 1000).split_into_chunks(content);

        let texts: Vec<&str> = chunks.iter().map(|c| c.content.as_str()).collect();
        assert_eq!(texts, ["First chunk here", "Second chunk here", "Third chunk"]);

        // Offsets are byte positions into the original content.
        for chunk in &chunks {
            assert_eq!(&content[chunk.start..chunk.end], chunk.content);
            assert!(chunk.embedding.is_none());
            assert!(chunk.cluster_id.is_none());
        }
    }

    #[test]
    fn test_split_content_shorter_than_min_chunk() {
        // Below the minimum size and below the maximum size: nothing to chunk.
        let chunks = compressor(100, 2000).split_into_chunks("tiny");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_heuristic_compression() {
        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor(5, 100).compress_heuristic(content).unwrap();
        // Four chunks at a 0.5 budget: keep every second one and report the rest.
        assert_eq!(
            result,
            "Chunk 1\n\nChunk 3\n\n/* ... 2 chunks compressed (50% of original) ... */"
        );
    }

    #[test]
    fn test_heuristic_returns_unchunkable_content_unchanged() {
        let result = compressor(100, 2000).compress_heuristic("tiny").unwrap();
        assert_eq!(result, "tiny");
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![3.0, 4.0];
        let b = vec![-3.0, -4.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim + 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    #[test]
    fn test_cosine_similarity_degenerate_inputs() {
        // Length mismatch and zero-magnitude vectors both map to 0.0.
        assert_eq!(cosine_similarity(&[1.0], &[1.0, 2.0]), 0.0);
        assert_eq!(cosine_similarity(&[0.0, 0.0], &[1.0, 2.0]), 0.0);
    }
}