infiniloom_engine/
semantic.rs

1//! Semantic analysis and compression module
2//!
3//! This module provides semantic code understanding through embeddings,
4//! enabling similarity search and intelligent code compression.
5//!
6//! # Feature: `embeddings`
7//!
8//! When the `embeddings` feature is enabled, this module provides:
9//! - Embedding generation for code content (currently uses character-frequency heuristics)
10//! - Cosine similarity computation between code snippets
11//! - Clustering-based compression that groups similar code chunks
12//!
13//! ## Current Implementation Status
14//!
15//! **Important**: The current embeddings implementation uses a simple character-frequency
16//! based algorithm, NOT neural network embeddings. This is a lightweight placeholder that
17//! provides reasonable results for basic similarity detection without requiring external
18//! model dependencies.
19//!
20//! Future versions may integrate actual transformer-based embeddings via:
21//! - Candle (Rust-native ML framework)
22//! - ONNX Runtime for pre-trained models
23//! - External embedding services (OpenAI, Cohere, etc.)
24//!
25//! ## Without `embeddings` Feature
26//!
27//! Falls back to heuristic-based compression that:
28//! - Splits content at paragraph boundaries
29//! - Keeps every Nth chunk based on budget ratio
30//! - No similarity computation (all operations return 0.0)
31
32#[cfg(feature = "embeddings")]
33use std::collections::HashMap;
34
35/// Result type for semantic operations
36pub type Result<T> = std::result::Result<T, SemanticError>;
37
38/// Errors that can occur during semantic operations
39#[derive(Debug, thiserror::Error)]
40pub enum SemanticError {
41    #[error("Model loading failed: {0}")]
42    ModelLoadError(String),
43
44    #[error("Embedding generation failed: {0}")]
45    EmbeddingError(String),
46
47    #[error("Clustering failed: {0}")]
48    ClusteringError(String),
49
50    #[error("Feature not available: embeddings feature not enabled")]
51    FeatureNotEnabled,
52}
53
54// ============================================================================
55// Semantic Analyzer (for similarity and embeddings)
56// ============================================================================
57
58/// Semantic analyzer using code embeddings
59///
60/// When the `embeddings` feature is enabled, uses the configured model path
61/// for neural network-based embeddings. Without the feature, provides
62/// heuristic-based similarity estimates.
63#[derive(Debug)]
64pub struct SemanticAnalyzer {
65    /// Path to the embedding model (used when embeddings feature is enabled)
66    #[cfg(feature = "embeddings")]
67    model_path: Option<String>,
68    /// Placeholder for non-embeddings build (maintains API compatibility)
69    #[cfg(not(feature = "embeddings"))]
70    _model_path: Option<String>,
71}
72
73impl SemanticAnalyzer {
74    /// Create a new semantic analyzer
75    pub fn new() -> Self {
76        Self {
77            #[cfg(feature = "embeddings")]
78            model_path: None,
79            #[cfg(not(feature = "embeddings"))]
80            _model_path: None,
81        }
82    }
83
84    /// Create a semantic analyzer with a custom model path
85    ///
86    /// The model path is used when the `embeddings` feature is enabled.
87    /// Without the feature, the path is stored but not used.
88    pub fn with_model(model_path: &str) -> Self {
89        Self {
90            #[cfg(feature = "embeddings")]
91            model_path: Some(model_path.to_owned()),
92            #[cfg(not(feature = "embeddings"))]
93            _model_path: Some(model_path.to_owned()),
94        }
95    }
96
97    /// Get the configured model path (if any)
98    #[cfg(feature = "embeddings")]
99    pub fn model_path(&self) -> Option<&str> {
100        self.model_path.as_deref()
101    }
102
103    /// Generate embeddings for code content
104    ///
105    /// # Current Implementation
106    ///
107    /// Uses a character-frequency based embedding algorithm that:
108    /// 1. Creates a 384-dimensional vector (matching common transformer output size)
109    /// 2. Accumulates weighted character frequencies based on position
110    /// 3. Normalizes to unit length for cosine similarity
111    ///
112    /// This is a **lightweight placeholder** that provides reasonable similarity
113    /// estimates for code without requiring ML model dependencies. It captures:
114    /// - Character distribution patterns
115    /// - Position-weighted frequency (earlier chars weighted more)
116    /// - Basic structural patterns through punctuation distribution
117    ///
118    /// For production use cases requiring high accuracy, consider integrating
119    /// actual transformer embeddings.
120    #[cfg(feature = "embeddings")]
121    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122        // Character-frequency based embedding (see doc comment for rationale)
123        let mut embedding = vec![0.0f32; 384];
124        for (i, c) in content.chars().enumerate() {
125            let idx = (c as usize) % 384;
126            // Position-weighted contribution: earlier characters contribute more
127            embedding[idx] += 1.0 / ((i + 1) as f32);
128        }
129        // L2 normalize for cosine similarity
130        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131        if norm > 0.0 {
132            for x in &mut embedding {
133                *x /= norm;
134            }
135        }
136        Ok(embedding)
137    }
138
139    /// Generate embeddings (stub when feature disabled)
140    #[cfg(not(feature = "embeddings"))]
141    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142        Ok(vec![0.0; 384])
143    }
144
145    /// Calculate similarity between two code snippets
146    #[cfg(feature = "embeddings")]
147    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148        let emb_a = self.embed(a)?;
149        let emb_b = self.embed(b)?;
150        Ok(cosine_similarity(&emb_a, &emb_b))
151    }
152
153    /// Calculate similarity (stub when feature disabled)
154    #[cfg(not(feature = "embeddings"))]
155    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156        Ok(0.0)
157    }
158}
159
160impl Default for SemanticAnalyzer {
161    fn default() -> Self {
162        Self::new()
163    }
164}
165
166// ============================================================================
167// Semantic Compressor (for reducing content while preserving meaning)
168// ============================================================================
169
170/// Configuration for semantic compression
171#[derive(Debug, Clone)]
172pub struct SemanticConfig {
173    /// Similarity threshold for clustering (0.0 - 1.0)
174    pub similarity_threshold: f32,
175    /// Minimum chunk size in characters
176    pub min_chunk_size: usize,
177    /// Maximum chunk size in characters
178    pub max_chunk_size: usize,
179    /// Budget ratio (0.0 - 1.0) - target size relative to original
180    pub budget_ratio: f32,
181}
182
183impl Default for SemanticConfig {
184    fn default() -> Self {
185        Self {
186            similarity_threshold: 0.7,
187            min_chunk_size: 100,
188            max_chunk_size: 2000,
189            budget_ratio: 0.5,
190        }
191    }
192}
193
194/// A chunk of code
195#[derive(Debug, Clone)]
196pub struct CodeChunk {
197    /// The original content
198    pub content: String,
199    /// Start offset in original content
200    pub start: usize,
201    /// End offset in original content
202    pub end: usize,
203    /// Embedding vector (when computed)
204    pub embedding: Option<Vec<f32>>,
205    /// Cluster assignment
206    pub cluster_id: Option<usize>,
207}
208
209/// Semantic compressor for code content
210///
211/// Uses embeddings-based clustering when the `embeddings` feature is enabled,
212/// otherwise falls back to heuristic-based compression.
213pub struct SemanticCompressor {
214    config: SemanticConfig,
215    /// Semantic analyzer for generating embeddings and computing similarity
216    analyzer: SemanticAnalyzer,
217}
218
219impl SemanticCompressor {
220    /// Create a new semantic compressor with default config
221    pub fn new() -> Self {
222        Self::with_config(SemanticConfig::default())
223    }
224
225    /// Create a new semantic compressor with custom config
226    pub fn with_config(config: SemanticConfig) -> Self {
227        Self { config, analyzer: SemanticAnalyzer::new() }
228    }
229
230    /// Get a reference to the internal semantic analyzer
231    ///
232    /// This allows access to the analyzer for similarity computations
233    /// or custom embedding operations.
234    pub fn analyzer(&self) -> &SemanticAnalyzer {
235        &self.analyzer
236    }
237
238    /// Compress content semantically
239    ///
240    /// When the `embeddings` feature is enabled, uses neural embeddings
241    /// to cluster similar code chunks and select representatives.
242    ///
243    /// Without the feature, falls back to heuristic-based compression.
244    pub fn compress(&self, content: &str) -> Result<String> {
245        // First, check for repetitive content (Bug #6 fix)
246        if let Some(compressed) = self.compress_repetitive(content) {
247            return Ok(compressed);
248        }
249
250        #[cfg(feature = "embeddings")]
251        {
252            return self.compress_with_embeddings(content);
253        }
254
255        #[cfg(not(feature = "embeddings"))]
256        {
257            self.compress_heuristic(content)
258        }
259    }
260
261    /// Detect and compress repetitive content (Bug #6 fix)
262    ///
263    /// Handles cases like "sentence ".repeat(500) by detecting the repeated pattern
264    /// and returning a compressed representation.
265    fn compress_repetitive(&self, content: &str) -> Option<String> {
266        // Only process content above a minimum threshold
267        if content.len() < 200 {
268            return None;
269        }
270
271        // Try to find a repeating pattern
272        // Start with small patterns and work up
273        for pattern_len in 1..=100.min(content.len() / 3) {
274            let pattern = &content[..pattern_len];
275
276            // Skip patterns that are just whitespace
277            if pattern.chars().all(|c| c.is_whitespace()) {
278                continue;
279            }
280
281            // Count how many times this pattern repeats consecutively
282            let mut count = 0;
283            let mut pos = 0;
284            while pos + pattern_len <= content.len() {
285                if &content[pos..pos + pattern_len] == pattern {
286                    count += 1;
287                    pos += pattern_len;
288                } else {
289                    break;
290                }
291            }
292
293            // If pattern repeats enough times and covers most of the content
294            let coverage = (count * pattern_len) as f32 / content.len() as f32;
295            if count >= 3 && coverage >= 0.8 {
296                // Calculate how many instances to keep based on budget_ratio
297                let instances_to_show = (count as f32 * self.config.budget_ratio)
298                    .ceil()
299                    .clamp(1.0, 5.0) as usize;
300
301                let shown_content = pattern.repeat(instances_to_show);
302                let remainder = &content[count * pattern_len..];
303
304                let result = if remainder.is_empty() {
305                    format!(
306                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
307                        shown_content.trim_end(),
308                        count,
309                        instances_to_show
310                    )
311                } else {
312                    format!(
313                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
314                        shown_content.trim_end(),
315                        count,
316                        instances_to_show,
317                        remainder.trim()
318                    )
319                };
320
321                return Some(result);
322            }
323        }
324
325        // Also detect line-based repetition (same line repeated many times)
326        let lines: Vec<&str> = content.lines().collect();
327        if lines.len() >= 3 {
328            let mut line_counts: std::collections::HashMap<&str, usize> =
329                std::collections::HashMap::new();
330            for line in &lines {
331                *line_counts.entry(*line).or_insert(0) += 1;
332            }
333
334            // Find the most repeated line
335            if let Some((repeated_line, count)) = line_counts
336                .iter()
337                .filter(|(line, _)| !line.trim().is_empty())
338                .max_by_key(|(_, count)| *count)
339            {
340                let repetition_ratio = *count as f32 / lines.len() as f32;
341                if *count >= 3 && repetition_ratio >= 0.5 {
342                    // Build compressed output preserving unique lines
343                    let mut result = String::new();
344                    let mut consecutive_count = 0;
345                    let mut last_was_repeated = false;
346
347                    for line in &lines {
348                        if *line == *repeated_line {
349                            consecutive_count += 1;
350                            if !last_was_repeated {
351                                if !result.is_empty() {
352                                    result.push('\n');
353                                }
354                                result.push_str(line);
355                            }
356                            last_was_repeated = true;
357                        } else {
358                            if last_was_repeated && consecutive_count > 1 {
359                                result.push_str(&format!(
360                                    "\n/* ... above line repeated {} times ... */",
361                                    consecutive_count
362                                ));
363                            }
364                            consecutive_count = 0;
365                            last_was_repeated = false;
366                            if !result.is_empty() {
367                                result.push('\n');
368                            }
369                            result.push_str(line);
370                        }
371                    }
372
373                    if last_was_repeated && consecutive_count > 1 {
374                        result.push_str(&format!(
375                            "\n/* ... above line repeated {} times ... */",
376                            consecutive_count
377                        ));
378                    }
379
380                    // Only return if we actually compressed significantly
381                    if result.len() < content.len() / 2 {
382                        return Some(result);
383                    }
384                }
385            }
386        }
387
388        None
389    }
390
391    /// Split content into semantic chunks (Bug #6 fix - handles content without \n\n)
392    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
393        let mut chunks = Vec::new();
394        let mut current_start = 0;
395
396        // First try: Split on double newlines (paragraph-like boundaries)
397        for (i, _) in content.match_indices("\n\n") {
398            if i > current_start && i - current_start >= self.config.min_chunk_size {
399                let chunk_content = &content[current_start..i];
400                if chunk_content.len() <= self.config.max_chunk_size {
401                    chunks.push(CodeChunk {
402                        content: chunk_content.to_owned(),
403                        start: current_start,
404                        end: i,
405                        embedding: None,
406                        cluster_id: None,
407                    });
408                }
409                current_start = i + 2;
410            }
411        }
412
413        // Handle remaining content
414        if current_start < content.len() {
415            let remaining = &content[current_start..];
416            if remaining.len() >= self.config.min_chunk_size {
417                chunks.push(CodeChunk {
418                    content: remaining.to_owned(),
419                    start: current_start,
420                    end: content.len(),
421                    embedding: None,
422                    cluster_id: None,
423                });
424            }
425        }
426
427        // Fallback: If no chunks found (no \n\n separators), try single newlines
428        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
429            current_start = 0;
430            for (i, _) in content.match_indices('\n') {
431                if i > current_start && i - current_start >= self.config.min_chunk_size {
432                    let chunk_content = &content[current_start..i];
433                    if chunk_content.len() <= self.config.max_chunk_size {
434                        chunks.push(CodeChunk {
435                            content: chunk_content.to_owned(),
436                            start: current_start,
437                            end: i,
438                            embedding: None,
439                            cluster_id: None,
440                        });
441                    }
442                    current_start = i + 1;
443                }
444            }
445            // Handle remaining after single newline split
446            if current_start < content.len() {
447                let remaining = &content[current_start..];
448                if remaining.len() >= self.config.min_chunk_size {
449                    chunks.push(CodeChunk {
450                        content: remaining.to_owned(),
451                        start: current_start,
452                        end: content.len(),
453                        embedding: None,
454                        cluster_id: None,
455                    });
456                }
457            }
458        }
459
460        // Second fallback: If still no chunks, split by sentence boundaries (. followed by space)
461        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
462            current_start = 0;
463            for (i, _) in content.match_indices(". ") {
464                if i > current_start && i - current_start >= self.config.min_chunk_size {
465                    let chunk_content = &content[current_start..=i]; // include the period
466                    if chunk_content.len() <= self.config.max_chunk_size {
467                        chunks.push(CodeChunk {
468                            content: chunk_content.to_owned(),
469                            start: current_start,
470                            end: i + 1,
471                            embedding: None,
472                            cluster_id: None,
473                        });
474                    }
475                    current_start = i + 2;
476                }
477            }
478            // Handle remaining
479            if current_start < content.len() {
480                let remaining = &content[current_start..];
481                if remaining.len() >= self.config.min_chunk_size {
482                    chunks.push(CodeChunk {
483                        content: remaining.to_owned(),
484                        start: current_start,
485                        end: content.len(),
486                        embedding: None,
487                        cluster_id: None,
488                    });
489                }
490            }
491        }
492
493        // Final fallback: If content is large but can't be split, force split by max_chunk_size
494        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
495            let mut pos = 0;
496            while pos < content.len() {
497                let end = (pos + self.config.max_chunk_size).min(content.len());
498                chunks.push(CodeChunk {
499                    content: content[pos..end].to_owned(),
500                    start: pos,
501                    end,
502                    embedding: None,
503                    cluster_id: None,
504                });
505                pos = end;
506            }
507        }
508
509        chunks
510    }
511
512    /// Compress using heuristic methods (fallback when embeddings unavailable)
513    fn compress_heuristic(&self, content: &str) -> Result<String> {
514        let chunks = self.split_into_chunks(content);
515
516        if chunks.is_empty() {
517            return Ok(content.to_owned());
518        }
519
520        // Keep every Nth chunk based on budget ratio
521        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
522        let step = chunks.len() / target_chunks.max(1);
523
524        let mut result = String::new();
525        let mut kept = 0;
526
527        for (i, chunk) in chunks.iter().enumerate() {
528            if i % step.max(1) == 0 && kept < target_chunks {
529                if !result.is_empty() {
530                    result.push_str("\n\n");
531                }
532                result.push_str(&chunk.content);
533                kept += 1;
534            }
535        }
536
537        // Add truncation marker if we removed content
538        if kept < chunks.len() {
539            result.push_str(&format!(
540                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
541                chunks.len() - kept,
542                (kept as f32 / chunks.len() as f32) * 100.0
543            ));
544        }
545
546        Ok(result)
547    }
548
549    /// Compress using neural embeddings
550    #[cfg(feature = "embeddings")]
551    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
552        let mut chunks = self.split_into_chunks(content);
553
554        if chunks.is_empty() {
555            return Ok(content.to_owned());
556        }
557
558        // Generate embeddings for each chunk
559        for chunk in &mut chunks {
560            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
561        }
562
563        // Cluster similar chunks
564        let clusters = self.cluster_chunks(&chunks)?;
565
566        // Select representative from each cluster
567        let mut result = String::new();
568        for cluster in clusters.values() {
569            if let Some(representative) = self.select_representative(cluster) {
570                if !result.is_empty() {
571                    result.push_str("\n\n");
572                }
573                result.push_str(&representative.content);
574            }
575        }
576
577        Ok(result)
578    }
579
580    /// Cluster chunks by embedding similarity
581    #[cfg(feature = "embeddings")]
582    fn cluster_chunks<'a>(
583        &self,
584        chunks: &'a [CodeChunk],
585    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
586        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
587        let mut next_cluster = 0;
588
589        for chunk in chunks {
590            let embedding = chunk
591                .embedding
592                .as_ref()
593                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
594
595            // Find existing cluster with similar embedding
596            let mut target_cluster = None;
597            for (&cluster_id, cluster_chunks) in &clusters {
598                if let Some(first) = cluster_chunks.first() {
599                    if let Some(ref first_emb) = first.embedding {
600                        let similarity = cosine_similarity(embedding, first_emb);
601                        if similarity >= self.config.similarity_threshold {
602                            target_cluster = Some(cluster_id);
603                            break;
604                        }
605                    }
606                }
607            }
608
609            if let Some(cluster_id) = target_cluster {
610                if let Some(cluster) = clusters.get_mut(&cluster_id) {
611                    cluster.push(chunk);
612                }
613            } else {
614                clusters.insert(next_cluster, vec![chunk]);
615                next_cluster += 1;
616            }
617        }
618
619        Ok(clusters)
620    }
621
622    /// Select the best representative from a cluster
623    #[cfg(feature = "embeddings")]
624    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
625        // Select the longest chunk as representative (most informative)
626        chunks.iter().max_by_key(|c| c.content.len()).copied()
627    }
628}
629
630impl Default for SemanticCompressor {
631    fn default() -> Self {
632        Self::new()
633    }
634}
635
636// ============================================================================
637// Honest Type Aliases
638// ============================================================================
639// The names below more accurately describe the implementation:
640// - "Semantic" implies neural/ML understanding, but we use heuristics
641// - These aliases are provided for clarity and recommended for new code
642
643/// Alias for `SemanticAnalyzer` - more honest name reflecting the actual implementation.
644///
645/// This analyzer uses character-frequency heuristics for similarity detection,
646/// NOT neural network embeddings. Use this alias when you want to be explicit
647/// about the implementation approach.
648pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;
649
650/// Alias for `SemanticCompressor` - more honest name reflecting the actual implementation.
651///
652/// This compressor uses chunk-based heuristics with optional character-frequency
653/// clustering, NOT neural semantic understanding. Use this alias when you want
654/// to be explicit about the implementation approach.
655pub type HeuristicCompressor = SemanticCompressor;
656
657/// Alias for `SemanticConfig` - more honest name.
658pub type HeuristicCompressionConfig = SemanticConfig;
659
660// ============================================================================
661// Utility Functions
662// ============================================================================
663
664/// Compute cosine similarity between two vectors
665///
666/// Returns a value between -1.0 and 1.0, where 1.0 indicates identical
667/// direction, 0.0 indicates orthogonal vectors, and -1.0 indicates
668/// opposite direction.
669///
670/// # Note
671/// This function is used by the embeddings feature for clustering and
672/// is also tested directly. The `#[cfg_attr]` suppresses warnings in
673/// builds without the embeddings feature.
674#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
675fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
676    if a.len() != b.len() || a.is_empty() {
677        return 0.0;
678    }
679
680    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
681    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
682    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
683
684    if norm_a == 0.0 || norm_b == 0.0 {
685        return 0.0;
686    }
687
688    dot / (norm_a * norm_b)
689}
690
691// ============================================================================
692// Tests
693// ============================================================================
694
695#[cfg(test)]
696mod tests {
697    use super::*;
698
699    #[test]
700    fn test_analyzer_creation() {
701        let analyzer = SemanticAnalyzer::new();
702        // Verify analyzer is created successfully
703        // Model path is None by default (accessed via model_path() when embeddings enabled)
704        #[cfg(feature = "embeddings")]
705        assert!(analyzer.model_path().is_none());
706        #[cfg(not(feature = "embeddings"))]
707        drop(analyzer); // Explicitly drop to satisfy lint
708    }
709
710    #[test]
711    fn test_analyzer_with_model() {
712        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
713        #[cfg(feature = "embeddings")]
714        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
715        #[cfg(not(feature = "embeddings"))]
716        drop(analyzer); // Explicitly drop to satisfy lint
717    }
718
719    #[test]
720    fn test_compressor_analyzer_access() {
721        let compressor = SemanticCompressor::new();
722        // Verify we can access the analyzer through the compressor
723        let _analyzer = compressor.analyzer();
724    }
725
726    #[test]
727    fn test_semantic_config_default() {
728        let config = SemanticConfig::default();
729        assert_eq!(config.similarity_threshold, 0.7);
730        assert_eq!(config.budget_ratio, 0.5);
731    }
732
733    #[test]
734    fn test_split_into_chunks() {
735        let compressor = SemanticCompressor::with_config(SemanticConfig {
736            min_chunk_size: 10,
737            max_chunk_size: 1000,
738            ..Default::default()
739        });
740
741        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
742        let chunks = compressor.split_into_chunks(content);
743        assert!(chunks.len() >= 2);
744    }
745
746    #[test]
747    fn test_heuristic_compression() {
748        let compressor = SemanticCompressor::with_config(SemanticConfig {
749            min_chunk_size: 5,
750            max_chunk_size: 100,
751            budget_ratio: 0.5,
752            ..Default::default()
753        });
754
755        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
756        let result = compressor.compress_heuristic(content).unwrap();
757        // Should complete without error
758        assert!(!result.is_empty() || content.is_empty());
759    }
760
761    #[test]
762    fn test_empty_content() {
763        let compressor = SemanticCompressor::new();
764        let result = compressor.compress("").unwrap();
765        assert_eq!(result, "");
766    }
767
768    #[test]
769    fn test_cosine_similarity_identical() {
770        let a = vec![1.0, 0.0, 0.0];
771        let b = vec![1.0, 0.0, 0.0];
772        let sim = cosine_similarity(&a, &b);
773        assert!((sim - 1.0).abs() < 0.001);
774    }
775
776    #[test]
777    fn test_cosine_similarity_orthogonal() {
778        let a = vec![1.0, 0.0, 0.0];
779        let c = vec![0.0, 1.0, 0.0];
780        let sim = cosine_similarity(&a, &c);
781        assert!(sim.abs() < 0.001);
782    }
783
784    #[test]
785    fn test_cosine_similarity_empty() {
786        let a: Vec<f32> = vec![];
787        let b: Vec<f32> = vec![];
788        assert_eq!(cosine_similarity(&a, &b), 0.0);
789    }
790
791    // Bug #6 tests - repetitive content compression
792    #[test]
793    fn test_repetitive_pattern_compression() {
794        let compressor = SemanticCompressor::new();
795        // Test "sentence ".repeat(500) - exactly the reported bug case
796        let content = "sentence ".repeat(500);
797        let result = compressor.compress(&content).unwrap();
798
799        // Result should be significantly smaller than original
800        assert!(
801            result.len() < content.len() / 2,
802            "Compressed size {} should be less than half of original {}",
803            result.len(),
804            content.len()
805        );
806
807        // Should contain the pattern and a compression marker
808        assert!(result.contains("sentence"));
809        assert!(
810            result.contains("repeated") || result.contains("pattern"),
811            "Should indicate compression occurred"
812        );
813    }
814
815    #[test]
816    fn test_repetitive_line_compression() {
817        let compressor = SemanticCompressor::new();
818        // Test repeated lines
819        let content = "same line\n".repeat(100);
820        let result = compressor.compress(&content).unwrap();
821
822        // Result should be significantly smaller
823        assert!(
824            result.len() < content.len() / 2,
825            "Compressed size {} should be less than half of original {}",
826            result.len(),
827            content.len()
828        );
829    }
830
831    #[test]
832    fn test_non_repetitive_content_unchanged() {
833        let compressor = SemanticCompressor::new();
834        // Non-repetitive content should not trigger repetition compression
835        let content = "This is some unique content that does not repeat.";
836        let result = compressor.compress(content).unwrap();
837
838        // Short non-repetitive content should be returned as-is
839        assert_eq!(result, content);
840    }
841
842    #[test]
843    fn test_repetitive_with_variation() {
844        let compressor = SemanticCompressor::with_config(SemanticConfig {
845            budget_ratio: 0.3,
846            ..Default::default()
847        });
848
849        // Content with some repetition mixed with unique parts
850        let mut content = String::new();
851        for i in 0..50 {
852            content.push_str(&format!("item {} ", i % 5)); // Repeated pattern with variation
853        }
854
855        let result = compressor.compress(&content).unwrap();
856        // This may or may not compress depending on pattern detection
857        // Just verify it doesn't panic
858        assert!(!result.is_empty());
859    }
860}