infiniloom_engine/semantic.rs
//! Semantic analysis and compression module
//!
//! This module provides semantic code understanding through embeddings,
//! enabling similarity search and intelligent code compression.
//!
//! # Feature: `embeddings`
//!
//! When the `embeddings` feature is enabled, this module provides:
//! - Embedding generation for code content (currently uses character-frequency heuristics)
//! - Cosine similarity computation between code snippets
//! - Clustering-based compression that groups similar code chunks
//!
//! ## Current Implementation Status
//!
//! **Important**: The current embeddings implementation uses a simple character-frequency
//! based algorithm, NOT neural network embeddings. This is a lightweight placeholder that
//! provides reasonable results for basic similarity detection without requiring external
//! model dependencies.
//!
//! Future versions may integrate actual transformer-based embeddings via:
//! - Candle (Rust-native ML framework)
//! - ONNX Runtime for pre-trained models
//! - External embedding services (OpenAI, Cohere, etc.)
//!
//! ## Without the `embeddings` Feature
//!
//! Falls back to heuristic-based compression that:
//! - Splits content at paragraph boundaries
//! - Keeps every Nth chunk, where N is derived from the budget ratio
//! - Performs no similarity computation (`similarity` returns 0.0 and `embed` returns a zero vector)
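//!
//! ## Usage Sketch
//!
//! A minimal, illustrative example of the compression entry point. It assumes
//! this module is exposed as `infiniloom_engine::semantic` (inferred from the
//! file location), so it is left as an ignored doc-test:
//!
//! ```ignore
//! use infiniloom_engine::semantic::SemanticCompressor;
//!
//! let compressor = SemanticCompressor::new();
//! let source = "fn a() {}\n\nfn b() {}\n\nfn c() {}";
//! let compressed = compressor.compress(source).unwrap();
//! println!("{compressed}");
//! ```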

#[cfg(feature = "embeddings")]
use std::collections::HashMap;

/// Result type for semantic operations
pub type Result<T> = std::result::Result<T, SemanticError>;

/// Errors that can occur during semantic operations
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}

// ============================================================================
// Semantic Analyzer (for similarity and embeddings)
// ============================================================================

/// Semantic analyzer using code embeddings
///
/// When the `embeddings` feature is enabled, generates character-frequency
/// embeddings and computes cosine similarity between them; the configured
/// model path is stored for future model integration but is not loaded by the
/// current implementation. Without the feature, `embed` and `similarity` are
/// stubs that return a zero vector and 0.0 respectively.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Path to the embedding model (used when embeddings feature is enabled)
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Placeholder for non-embeddings build (maintains API compatibility)
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}

impl SemanticAnalyzer {
    /// Create a new semantic analyzer
    pub fn new() -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: None,
            #[cfg(not(feature = "embeddings"))]
            _model_path: None,
        }
    }

    /// Create a semantic analyzer with a custom model path
    ///
    /// When the `embeddings` feature is enabled, the path is stored and exposed
    /// via `model_path()`; the current character-frequency embedding does not
    /// load a model from it. Without the feature, the path is stored but
    /// otherwise unused.
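    ///
    /// # Example (illustrative)
    ///
    /// A minimal sketch; the model path below is hypothetical and no model is
    /// actually loaded, so the example is marked `ignore`:
    ///
    /// ```ignore
    /// let analyzer = SemanticAnalyzer::with_model("/models/code-embedder.onnx");
    /// #[cfg(feature = "embeddings")]
    /// assert_eq!(analyzer.model_path(), Some("/models/code-embedder.onnx"));
    /// ```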
    pub fn with_model(model_path: &str) -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: Some(model_path.to_owned()),
            #[cfg(not(feature = "embeddings"))]
            _model_path: Some(model_path.to_owned()),
        }
    }

    /// Get the configured model path (if any)
    #[cfg(feature = "embeddings")]
    pub fn model_path(&self) -> Option<&str> {
        self.model_path.as_deref()
    }

    /// Generate embeddings for code content
    ///
    /// # Current Implementation
    ///
    /// Uses a character-frequency based embedding algorithm that:
    /// 1. Creates a 384-dimensional vector (matching common transformer output size)
    /// 2. Accumulates weighted character frequencies based on position
    /// 3. Normalizes to unit length for cosine similarity
    ///
    /// This is a **lightweight placeholder** that provides reasonable similarity
    /// estimates for code without requiring ML model dependencies. It captures:
    /// - Character distribution patterns
    /// - Position-weighted frequency (earlier chars weighted more)
    /// - Basic structural patterns through punctuation distribution
    ///
    /// For production use cases requiring high accuracy, consider integrating
    /// actual transformer embeddings.
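    ///
    /// # Example (illustrative)
    ///
    /// An ignored sketch showing the fixed output dimensionality:
    ///
    /// ```ignore
    /// let analyzer = SemanticAnalyzer::new();
    /// let embedding = analyzer.embed("fn main() {}").unwrap();
    /// assert_eq!(embedding.len(), 384);
    /// ```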
    #[cfg(feature = "embeddings")]
    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
        // Character-frequency based embedding (see doc comment for rationale)
        let mut embedding = vec![0.0f32; 384];
        for (i, c) in content.chars().enumerate() {
            let idx = (c as usize) % 384;
            // Position-weighted contribution: earlier characters contribute more
            embedding[idx] += 1.0 / ((i + 1) as f32);
        }
        // L2 normalize for cosine similarity
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in &mut embedding {
                *x /= norm;
            }
        }
        Ok(embedding)
    }

    /// Generate embeddings (stub when feature disabled)
    #[cfg(not(feature = "embeddings"))]
    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
        Ok(vec![0.0; 384])
    }

    /// Calculate similarity between two code snippets
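    ///
    /// Returns the cosine similarity of the two embeddings; identical
    /// non-empty inputs score 1.0. An ignored, illustrative sketch:
    ///
    /// ```ignore
    /// let analyzer = SemanticAnalyzer::new();
    /// let sim = analyzer.similarity("fn add(a: i32) {}", "fn add(a: i64) {}").unwrap();
    /// assert!(sim > 0.5); // near-identical character distributions score high
    /// ```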
    #[cfg(feature = "embeddings")]
    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
        let emb_a = self.embed(a)?;
        let emb_b = self.embed(b)?;
        Ok(cosine_similarity(&emb_a, &emb_b))
    }

    /// Calculate similarity (stub when feature disabled)
    #[cfg(not(feature = "embeddings"))]
    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
        Ok(0.0)
    }
}

impl Default for SemanticAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// Semantic Compressor (for reducing content while preserving meaning)
// ============================================================================

/// Configuration for semantic compression
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Similarity threshold for clustering (0.0 - 1.0)
    pub similarity_threshold: f32,
    /// Minimum chunk size in characters
    pub min_chunk_size: usize,
    /// Maximum chunk size in characters
    pub max_chunk_size: usize,
    /// Budget ratio (0.0 - 1.0) - target size relative to original
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}

/// A chunk of code
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// The original content
    pub content: String,
    /// Start offset in original content
    pub start: usize,
    /// End offset in original content
    pub end: usize,
    /// Embedding vector (when computed)
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment
    pub cluster_id: Option<usize>,
}

/// Semantic compressor for code content
///
/// Uses embeddings-based clustering when the `embeddings` feature is enabled,
/// otherwise falls back to heuristic-based compression.
pub struct SemanticCompressor {
    config: SemanticConfig,
    /// Semantic analyzer for generating embeddings and computing similarity
    analyzer: SemanticAnalyzer,
}

impl SemanticCompressor {
    /// Create a new semantic compressor with default config
    pub fn new() -> Self {
        Self::with_config(SemanticConfig::default())
    }

    /// Create a new semantic compressor with custom config
    pub fn with_config(config: SemanticConfig) -> Self {
        Self { config, analyzer: SemanticAnalyzer::new() }
    }

    /// Get a reference to the internal semantic analyzer
    ///
    /// This allows access to the analyzer for similarity computations
    /// or custom embedding operations.
    pub fn analyzer(&self) -> &SemanticAnalyzer {
        &self.analyzer
    }

    /// Compress content semantically
    ///
    /// When the `embeddings` feature is enabled, clusters chunks by the
    /// similarity of their character-frequency embeddings and keeps one
    /// representative per cluster. Without the feature, falls back to
    /// heuristic-based compression that keeps every Nth chunk.
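    ///
    /// # Example (illustrative)
    ///
    /// An ignored sketch of compressing with a custom budget:
    ///
    /// ```ignore
    /// let compressor = SemanticCompressor::with_config(SemanticConfig {
    ///     min_chunk_size: 10,
    ///     budget_ratio: 0.5,
    ///     ..SemanticConfig::default()
    /// });
    /// let text = "fn one() {}\n\nfn two() {}\n\nfn three() {}\n\nfn four() {}";
    /// let compressed = compressor.compress(text).unwrap();
    /// ```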
    pub fn compress(&self, content: &str) -> Result<String> {
        #[cfg(feature = "embeddings")]
        {
            return self.compress_with_embeddings(content);
        }

        #[cfg(not(feature = "embeddings"))]
        {
            self.compress_heuristic(content)
        }
    }

    /// Split content into semantic chunks
    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
        let mut chunks = Vec::new();
        let mut current_start = 0;

        // Split on double newlines (paragraph-like boundaries)
        for (i, _) in content.match_indices("\n\n") {
            if i > current_start && i - current_start >= self.config.min_chunk_size {
                let chunk_content = &content[current_start..i];
                // Note: segments longer than `max_chunk_size` are skipped entirely.
                if chunk_content.len() <= self.config.max_chunk_size {
                    chunks.push(CodeChunk {
                        content: chunk_content.to_owned(),
                        start: current_start,
                        end: i,
                        embedding: None,
                        cluster_id: None,
                    });
                }
                current_start = i + 2;
            }
        }

        // Handle remaining content after the last boundary
        if current_start < content.len() {
            let remaining = &content[current_start..];
            if remaining.len() >= self.config.min_chunk_size {
                chunks.push(CodeChunk {
                    content: remaining.to_owned(),
                    start: current_start,
                    end: content.len(),
                    embedding: None,
                    cluster_id: None,
                });
            }
        }

        chunks
    }

    /// Compress using heuristic methods (fallback when embeddings unavailable)
    // Only reachable from tests when the embeddings feature is enabled.
    #[cfg_attr(feature = "embeddings", allow(dead_code))]
    fn compress_heuristic(&self, content: &str) -> Result<String> {
        let chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        // Keep every Nth chunk based on budget ratio
        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
        let step = chunks.len() / target_chunks.max(1);

        let mut result = String::new();
        let mut kept = 0;

        for (i, chunk) in chunks.iter().enumerate() {
            if i % step.max(1) == 0 && kept < target_chunks {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&chunk.content);
                kept += 1;
            }
        }

        // Add a truncation marker if any chunks were dropped
        if kept < chunks.len() {
            result.push_str(&format!(
                "\n\n/* ... {} chunks omitted ({:.0}% of chunks kept) ... */",
                chunks.len() - kept,
                (kept as f32 / chunks.len() as f32) * 100.0
            ));
        }

        Ok(result)
    }


    /// Compress by clustering embedded chunks and keeping one representative
    /// per cluster (character-frequency embeddings; see module docs)
    #[cfg(feature = "embeddings")]
    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
        let mut chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        // Generate embeddings for each chunk
        for chunk in &mut chunks {
            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
        }

        // Cluster similar chunks
        let clusters = self.cluster_chunks(&chunks)?;

        // Select a representative from each cluster.
        // Note: HashMap iteration order is unspecified, so the output does not
        // necessarily preserve the original chunk order.
        let mut result = String::new();
        for cluster in clusters.values() {
            if let Some(representative) = self.select_representative(cluster) {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&representative.content);
            }
        }

        Ok(result)
    }

    /// Cluster chunks by embedding similarity
    ///
    /// Greedy single-pass clustering: each chunk joins the first existing
    /// cluster whose first member is at least `similarity_threshold` similar,
    /// otherwise it starts a new cluster.
    #[cfg(feature = "embeddings")]
    fn cluster_chunks<'a>(
        &self,
        chunks: &'a [CodeChunk],
    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
        let mut next_cluster = 0;

        for chunk in chunks {
            let embedding = chunk
                .embedding
                .as_ref()
                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;

            // Find an existing cluster whose first chunk is similar enough.
            // (Search immutably first, then mutate afterwards, to avoid
            // borrowing `clusters` mutably while iterating over it.)
            let mut assigned_to = None;
            for (&cluster_id, cluster_chunks) in &clusters {
                if let Some(first) = cluster_chunks.first() {
                    if let Some(ref first_emb) = first.embedding {
                        let similarity = cosine_similarity(embedding, first_emb);
                        if similarity >= self.config.similarity_threshold {
                            assigned_to = Some(cluster_id);
                            break;
                        }
                    }
                }
            }

            match assigned_to {
                Some(cluster_id) => {
                    clusters.get_mut(&cluster_id).unwrap().push(chunk);
                },
                None => {
                    clusters.insert(next_cluster, vec![chunk]);
                    next_cluster += 1;
                },
            }
        }

        Ok(clusters)
    }

    /// Select the best representative from a cluster
    #[cfg(feature = "embeddings")]
    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
        // Select the longest chunk as representative (most informative)
        chunks.iter().max_by_key(|c| c.content.len()).copied()
    }
}

impl Default for SemanticCompressor {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// Honest Type Aliases
// ============================================================================
// The names below more accurately describe the implementation:
// - "Semantic" implies neural/ML understanding, but we use heuristics
// - These aliases are provided for clarity and recommended for new code

/// Alias for `SemanticAnalyzer` - more honest name reflecting the actual implementation.
///
/// This analyzer uses character-frequency heuristics for similarity detection,
/// NOT neural network embeddings. Use this alias when you want to be explicit
/// about the implementation approach.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alias for `SemanticCompressor` - more honest name reflecting the actual implementation.
///
/// This compressor uses chunk-based heuristics with optional character-frequency
/// clustering, NOT neural semantic understanding. Use this alias when you want
/// to be explicit about the implementation approach.
pub type HeuristicCompressor = SemanticCompressor;

/// Alias for `SemanticConfig` - more honest name.
pub type HeuristicCompressionConfig = SemanticConfig;

// ============================================================================
// Utility Functions
// ============================================================================

/// Compute cosine similarity between two vectors
///
/// Returns a value between -1.0 and 1.0, where 1.0 indicates identical
/// direction, 0.0 indicates orthogonal vectors, and -1.0 indicates
/// opposite direction.
///
/// # Note
/// This function is used by the embeddings feature for clustering and
/// is also tested directly. The `#[cfg_attr]` suppresses warnings in
/// builds without the embeddings feature.
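///
/// For equal-length inputs the value is `dot(a, b) / (norm(a) * norm(b))`;
/// mismatched lengths, empty inputs, and zero-norm vectors all return 0.0.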
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        // Verify analyzer is created successfully
        // Model path is None by default (accessed via model_path() when embeddings enabled)
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        // Verify we can access the analyzer through the compressor
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        // Should complete without error
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }
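
    // Property checks for behavior documented above: cosine similarity of
    // opposite vectors, and the embedding's fixed dimensionality and unit
    // norm (the latter only when the `embeddings` feature is enabled).
    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![-1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim + 1.0).abs() < 0.001);
    }

    #[cfg(feature = "embeddings")]
    #[test]
    fn test_embed_dimensions_and_norm() {
        let analyzer = SemanticAnalyzer::new();
        let embedding = analyzer.embed("fn main() {}").unwrap();
        assert_eq!(embedding.len(), 384);
        // Non-empty input is normalized to (approximately) unit length
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 0.001);
    }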
}