infiniloom_engine/
semantic.rs

//! Semantic analysis and compression module
//!
//! This module provides semantic code understanding through embeddings,
//! enabling similarity search and intelligent code compression.
//!
//! # Feature: `embeddings`
//!
//! When the `embeddings` feature is enabled, this module provides:
//! - Embedding generation for code content (currently uses character-frequency heuristics)
//! - Cosine similarity computation between code snippets
//! - Clustering-based compression that groups similar code chunks
//!
//! ## Current Implementation Status
//!
//! **Important**: The current embeddings implementation uses a simple character-frequency
//! based algorithm, NOT neural network embeddings. This is a lightweight placeholder that
//! provides reasonable results for basic similarity detection without requiring external
//! model dependencies.
//!
//! Future versions may integrate actual transformer-based embeddings via:
//! - Candle (Rust-native ML framework)
//! - ONNX Runtime for pre-trained models
//! - External embedding services (OpenAI, Cohere, etc.)
//!
//! ## Without `embeddings` Feature
//!
//! Falls back to heuristic-based compression that:
//! - Splits content at paragraph boundaries
//! - Keeps every Nth chunk based on budget ratio
//! - Performs no similarity computation (`similarity` always returns 0.0)
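//!
//! ## Example
//!
//! A minimal usage sketch; the crate/module path below is assumed from this
//! file's location, and the input is purely illustrative:
//!
//! ```
//! use infiniloom_engine::semantic::SemanticCompressor;
//!
//! let compressor = SemanticCompressor::new();
//! // Short, non-repetitive input passes through unchanged.
//! let output = compressor.compress("fn add(a: i32, b: i32) -> i32 { a + b }").unwrap();
//! assert_eq!(output, "fn add(a: i32, b: i32) -> i32 { a + b }");
//! ```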

#[cfg(feature = "embeddings")]
use std::collections::HashMap;

/// Result type for semantic operations
pub type Result<T> = std::result::Result<T, SemanticError>;

/// Errors that can occur during semantic operations
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}

// ============================================================================
// Semantic Analyzer (for similarity and embeddings)
// ============================================================================

/// Semantic analyzer using code embeddings
///
/// When the `embeddings` feature is enabled, generates character-frequency
/// embeddings (the configured model path is stored for future neural
/// backends but is not yet consulted). Without the feature, `embed` returns
/// a zero vector and `similarity` always returns 0.0.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Path to the embedding model (used when embeddings feature is enabled)
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Placeholder for non-embeddings build (maintains API compatibility)
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}

impl SemanticAnalyzer {
    /// Create a new semantic analyzer
    pub fn new() -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: None,
            #[cfg(not(feature = "embeddings"))]
            _model_path: None,
        }
    }

    /// Create a semantic analyzer with a custom model path
    ///
    /// The path is stored for future embedding backends; it does not affect
    /// the current character-frequency algorithm. When the `embeddings`
    /// feature is enabled, it can be read back via `model_path()`.
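    ///
    /// A usage sketch (crate path assumed from this file's location):
    ///
    /// ```
    /// use infiniloom_engine::semantic::SemanticAnalyzer;
    ///
    /// let analyzer = SemanticAnalyzer::with_model("/path/to/model");
    /// # #[cfg(feature = "embeddings")]
    /// # assert_eq!(analyzer.model_path(), Some("/path/to/model"));
    /// # let _ = &analyzer;
    /// ```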
    pub fn with_model(model_path: &str) -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: Some(model_path.to_owned()),
            #[cfg(not(feature = "embeddings"))]
            _model_path: Some(model_path.to_owned()),
        }
    }

    /// Get the configured model path (if any)
    #[cfg(feature = "embeddings")]
    pub fn model_path(&self) -> Option<&str> {
        self.model_path.as_deref()
    }

    /// Generate embeddings for code content
    ///
    /// # Current Implementation
    ///
    /// Uses a character-frequency based embedding algorithm that:
    /// 1. Creates a 384-dimensional vector (matching common transformer output size)
    /// 2. Accumulates weighted character frequencies based on position
    /// 3. Normalizes to unit length for cosine similarity
    ///
    /// This is a **lightweight placeholder** that provides reasonable similarity
    /// estimates for code without requiring ML model dependencies. It captures:
    /// - Character distribution patterns
    /// - Position-weighted frequency (earlier chars weighted more)
    /// - Basic structural patterns through punctuation distribution
    ///
    /// For production use cases requiring high accuracy, consider integrating
    /// actual transformer embeddings.
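    ///
    /// # Example
    ///
    /// A sketch of the expected output shape (crate path assumed):
    ///
    /// ```
    /// # #[cfg(feature = "embeddings")] {
    /// use infiniloom_engine::semantic::SemanticAnalyzer;
    ///
    /// let analyzer = SemanticAnalyzer::new();
    /// let emb = analyzer.embed("fn main() {}").unwrap();
    /// assert_eq!(emb.len(), 384);
    /// // L2-normalized: non-empty input yields a unit-length vector.
    /// let norm: f32 = emb.iter().map(|x| x * x).sum::<f32>().sqrt();
    /// assert!((norm - 1.0).abs() < 1e-3);
    /// # }
    /// ```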
    #[cfg(feature = "embeddings")]
    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
        // Character-frequency based embedding (see doc comment for rationale)
        let mut embedding = vec![0.0f32; 384];
        for (i, c) in content.chars().enumerate() {
            let idx = (c as usize) % 384;
            // Position-weighted contribution: earlier characters contribute more
            embedding[idx] += 1.0 / ((i + 1) as f32);
        }
        // L2 normalize for cosine similarity
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in &mut embedding {
                *x /= norm;
            }
        }
        Ok(embedding)
    }

    /// Generate embeddings (stub when feature disabled; returns a zero vector)
    #[cfg(not(feature = "embeddings"))]
    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
        Ok(vec![0.0; 384])
    }

    /// Calculate similarity between two code snippets
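    ///
    /// A behavior sketch (crate path assumed): identical inputs embed
    /// identically, so their cosine similarity is 1.0.
    ///
    /// ```
    /// # #[cfg(feature = "embeddings")] {
    /// use infiniloom_engine::semantic::SemanticAnalyzer;
    ///
    /// let analyzer = SemanticAnalyzer::new();
    /// let sim = analyzer.similarity("let x = 1;", "let x = 1;").unwrap();
    /// assert!((sim - 1.0).abs() < 1e-3);
    /// # }
    /// ```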
    #[cfg(feature = "embeddings")]
    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
        let emb_a = self.embed(a)?;
        let emb_b = self.embed(b)?;
        Ok(cosine_similarity(&emb_a, &emb_b))
    }

    /// Calculate similarity (stub when feature disabled)
    #[cfg(not(feature = "embeddings"))]
    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
        Ok(0.0)
    }
}

impl Default for SemanticAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// Semantic Compressor (for reducing content while preserving meaning)
// ============================================================================

/// Configuration for semantic compression
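///
/// # Example
///
/// Overriding one field while keeping the other defaults (crate path assumed):
///
/// ```
/// use infiniloom_engine::semantic::SemanticConfig;
///
/// let config = SemanticConfig { budget_ratio: 0.3, ..Default::default() };
/// assert_eq!(config.similarity_threshold, 0.7);
/// ```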
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Similarity threshold for clustering (0.0 - 1.0)
    pub similarity_threshold: f32,
    /// Minimum chunk size in bytes
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes
    pub max_chunk_size: usize,
    /// Budget ratio (0.0 - 1.0) - target size relative to original
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}

/// A chunk of code
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// The original content
    pub content: String,
    /// Start byte offset in original content
    pub start: usize,
    /// End byte offset in original content
    pub end: usize,
    /// Embedding vector (when computed)
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment
    pub cluster_id: Option<usize>,
}

/// Semantic compressor for code content
///
/// Uses embeddings-based clustering when the `embeddings` feature is enabled,
/// otherwise falls back to heuristic-based compression.
pub struct SemanticCompressor {
    config: SemanticConfig,
    /// Semantic analyzer for generating embeddings and computing similarity
    analyzer: SemanticAnalyzer,
}

impl SemanticCompressor {
    /// Create a new semantic compressor with default config
    pub fn new() -> Self {
        Self::with_config(SemanticConfig::default())
    }

    /// Create a new semantic compressor with custom config
    pub fn with_config(config: SemanticConfig) -> Self {
        Self { config, analyzer: SemanticAnalyzer::new() }
    }

    /// Get a reference to the internal semantic analyzer
    ///
    /// This allows access to the analyzer for similarity computations
    /// or custom embedding operations.
    pub fn analyzer(&self) -> &SemanticAnalyzer {
        &self.analyzer
    }

    /// Compress content semantically
    ///
    /// Repetitive content is detected and collapsed first. When the
    /// `embeddings` feature is enabled, the remaining content is clustered
    /// via character-frequency embeddings and one representative is kept per
    /// cluster; without the feature, falls back to heuristic chunk sampling.
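    ///
    /// A sketch of the repetitive-content path (crate path assumed); this
    /// mirrors the `"sentence ".repeat(500)` case from the tests below:
    ///
    /// ```
    /// use infiniloom_engine::semantic::SemanticCompressor;
    ///
    /// let compressor = SemanticCompressor::new();
    /// let content = "sentence ".repeat(500);
    /// let compressed = compressor.compress(&content).unwrap();
    /// assert!(compressed.len() < content.len() / 2);
    /// ```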
    pub fn compress(&self, content: &str) -> Result<String> {
        // First, check for repetitive content (Bug #6 fix)
        if let Some(compressed) = self.compress_repetitive(content) {
            return Ok(compressed);
        }

        #[cfg(feature = "embeddings")]
        {
            return self.compress_with_embeddings(content);
        }

        #[cfg(not(feature = "embeddings"))]
        {
            self.compress_heuristic(content)
        }
    }

    /// Detect and compress repetitive content (Bug #6 fix)
    ///
    /// Handles cases like `"sentence ".repeat(500)` by detecting the repeated pattern
    /// and returning a compressed representation.
    ///
    /// This function is UTF-8 safe - it only slices at valid character boundaries.
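    ///
    /// With the default config, `"sentence ".repeat(500)` compresses to roughly:
    ///
    /// ```text
    /// sentence sentence sentence sentence sentence
    /// /* ... pattern repeated 500 times (showing 5) ... */
    /// ```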
    fn compress_repetitive(&self, content: &str) -> Option<String> {
        // Only process content above a minimum threshold
        if content.len() < 200 {
            return None;
        }

        // Try to find a repeating pattern
        // Start with small patterns and work up
        // We iterate byte positions but only consider those that are valid UTF-8 boundaries
        for pattern_len in 1..=100.min(content.len() / 3) {
            // Skip if this byte position is not a valid UTF-8 character boundary
            if !content.is_char_boundary(pattern_len) {
                continue;
            }

            let pattern = &content[..pattern_len];

            // Skip patterns that are just whitespace
            if pattern.chars().all(|c| c.is_whitespace()) {
                continue;
            }

            // Count how many times this pattern repeats consecutively
            let mut count = 0;
            let mut pos = 0;
            while pos + pattern_len <= content.len() {
                // Ensure both slice boundaries are valid UTF-8
                if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
                    break;
                }
                if &content[pos..pos + pattern_len] == pattern {
                    count += 1;
                    pos += pattern_len;
                } else {
                    break;
                }
            }

            // If pattern repeats enough times and covers most of the content
            let coverage = (count * pattern_len) as f32 / content.len() as f32;
            if count >= 3 && coverage >= 0.8 {
                // Calculate how many instances to keep based on budget_ratio
                let instances_to_show = (count as f32 * self.config.budget_ratio)
                    .ceil()
                    .clamp(1.0, 5.0) as usize;

                let shown_content = pattern.repeat(instances_to_show);
                // Safe: count * pattern_len is already at a valid boundary (start of next pattern or end)
                let remainder_start = count * pattern_len;
                let remainder = if remainder_start <= content.len()
                    && content.is_char_boundary(remainder_start)
                {
                    &content[remainder_start..]
                } else {
                    ""
                };

                let result = if remainder.is_empty() {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
                        shown_content.trim_end(),
                        count,
                        instances_to_show
                    )
                } else {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
                        shown_content.trim_end(),
                        count,
                        instances_to_show,
                        remainder.trim()
                    )
                };

                return Some(result);
            }
        }

        // Also detect line-based repetition (same line repeated many times)
        let lines: Vec<&str> = content.lines().collect();
        if lines.len() >= 3 {
            let mut line_counts: std::collections::HashMap<&str, usize> =
                std::collections::HashMap::new();
            for line in &lines {
                *line_counts.entry(*line).or_insert(0) += 1;
            }

            // Find the most repeated line
            if let Some((repeated_line, count)) = line_counts
                .iter()
                .filter(|(line, _)| !line.trim().is_empty())
                .max_by_key(|(_, count)| *count)
            {
                let repetition_ratio = *count as f32 / lines.len() as f32;
                if *count >= 3 && repetition_ratio >= 0.5 {
                    // Build compressed output preserving unique lines
                    let mut result = String::new();
                    let mut consecutive_count = 0;
                    let mut last_was_repeated = false;

                    for line in &lines {
                        if *line == *repeated_line {
                            consecutive_count += 1;
                            if !last_was_repeated {
                                if !result.is_empty() {
                                    result.push('\n');
                                }
                                result.push_str(line);
                            }
                            last_was_repeated = true;
                        } else {
                            if last_was_repeated && consecutive_count > 1 {
                                result.push_str(&format!(
                                    "\n/* ... above line repeated {} times ... */",
                                    consecutive_count
                                ));
                            }
                            consecutive_count = 0;
                            last_was_repeated = false;
                            if !result.is_empty() {
                                result.push('\n');
                            }
                            result.push_str(line);
                        }
                    }

                    if last_was_repeated && consecutive_count > 1 {
                        result.push_str(&format!(
                            "\n/* ... above line repeated {} times ... */",
                            consecutive_count
                        ));
                    }

                    // Only return if we actually compressed significantly
                    if result.len() < content.len() / 2 {
                        return Some(result);
                    }
                }
            }
        }

        None
    }

    /// Split content into semantic chunks (Bug #6 fix - handles content without `\n\n`)
    ///
    /// Tries progressively weaker boundaries: paragraph breaks (`\n\n`), then
    /// single newlines, then sentence boundaries (`". "`), and finally a
    /// forced split every `max_chunk_size` bytes.
    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
        let mut chunks = Vec::new();
        let mut current_start = 0;

        // First try: Split on double newlines (paragraph-like boundaries)
        for (i, _) in content.match_indices("\n\n") {
            if i > current_start && i - current_start >= self.config.min_chunk_size {
                let chunk_content = &content[current_start..i];
                if chunk_content.len() <= self.config.max_chunk_size {
                    chunks.push(CodeChunk {
                        content: chunk_content.to_owned(),
                        start: current_start,
                        end: i,
                        embedding: None,
                        cluster_id: None,
                    });
                }
                current_start = i + 2;
            }
        }

        // Handle remaining content
        if current_start < content.len() {
            let remaining = &content[current_start..];
            if remaining.len() >= self.config.min_chunk_size {
                chunks.push(CodeChunk {
                    content: remaining.to_owned(),
                    start: current_start,
                    end: content.len(),
                    embedding: None,
                    cluster_id: None,
                });
            }
        }

        // Fallback: If no chunks found (no \n\n separators), try single newlines
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices('\n') {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 1;
                }
            }
            // Handle remaining after single newline split
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Second fallback: If still no chunks, split by sentence boundaries (. followed by space)
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices(". ") {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..=i]; // include the period
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i + 1,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 2;
                }
            }
            // Handle remaining
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Final fallback: If content is large but can't be split, force split by max_chunk_size
        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
            let mut pos = 0;
            while pos < content.len() {
                let mut end = (pos + self.config.max_chunk_size).min(content.len());
                // Advance to a valid UTF-8 boundary so the slice below cannot
                // panic on multi-byte characters (a chunk may exceed
                // max_chunk_size by up to three bytes).
                while !content.is_char_boundary(end) {
                    end += 1;
                }
                chunks.push(CodeChunk {
                    content: content[pos..end].to_owned(),
                    start: pos,
                    end,
                    embedding: None,
                    cluster_id: None,
                });
                pos = end;
            }
        }

        chunks
    }

    /// Compress using heuristic methods (fallback when embeddings unavailable)
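    ///
    /// For example, 10 chunks at the default `budget_ratio = 0.5` give
    /// `target_chunks = 5` and `step = 2`, so chunks 0, 2, 4, 6, and 8 are
    /// kept and a trailing marker reports the 5 that were dropped.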
    fn compress_heuristic(&self, content: &str) -> Result<String> {
        let chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        // Keep every Nth chunk based on budget ratio
        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
        let step = chunks.len() / target_chunks.max(1);

        let mut result = String::new();
        let mut kept = 0;

        for (i, chunk) in chunks.iter().enumerate() {
            if i % step.max(1) == 0 && kept < target_chunks {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&chunk.content);
                kept += 1;
            }
        }

        // Add truncation marker if we removed content
        if kept < chunks.len() {
            result.push_str(&format!(
                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
                chunks.len() - kept,
                (kept as f32 / chunks.len() as f32) * 100.0
            ));
        }

        Ok(result)
    }

    /// Compress using embedding-based clustering (character-frequency embeddings)
    #[cfg(feature = "embeddings")]
    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
        let mut chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        // Generate embeddings for each chunk
        for chunk in &mut chunks {
            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
        }

        // Cluster similar chunks
        let clusters = self.cluster_chunks(&chunks)?;

        // Select representative from each cluster
        let mut result = String::new();
        for cluster in clusters.values() {
            if let Some(representative) = self.select_representative(cluster) {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&representative.content);
            }
        }

        Ok(result)
    }

    /// Cluster chunks by embedding similarity
    ///
    /// Greedy single pass: each chunk joins the first existing cluster whose
    /// seed (first member) is at least `similarity_threshold` similar,
    /// otherwise it starts a new cluster.
    #[cfg(feature = "embeddings")]
    fn cluster_chunks<'a>(
        &self,
        chunks: &'a [CodeChunk],
    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
        let mut next_cluster = 0;

        for chunk in chunks {
            let embedding = chunk
                .embedding
                .as_ref()
                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;

            // Find existing cluster with similar embedding
            let mut target_cluster = None;
            for (&cluster_id, cluster_chunks) in &clusters {
                if let Some(first) = cluster_chunks.first() {
                    if let Some(ref first_emb) = first.embedding {
                        let similarity = cosine_similarity(embedding, first_emb);
                        if similarity >= self.config.similarity_threshold {
                            target_cluster = Some(cluster_id);
                            break;
                        }
                    }
                }
            }

            if let Some(cluster_id) = target_cluster {
                if let Some(cluster) = clusters.get_mut(&cluster_id) {
                    cluster.push(chunk);
                }
            } else {
                clusters.insert(next_cluster, vec![chunk]);
                next_cluster += 1;
            }
        }

        Ok(clusters)
    }

    /// Select the best representative from a cluster
    #[cfg(feature = "embeddings")]
    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
        // Select the longest chunk as representative (most informative)
        chunks.iter().max_by_key(|c| c.content.len()).copied()
    }
}

impl Default for SemanticCompressor {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// Honest Type Aliases
// ============================================================================
// The names below more accurately describe the implementation:
// - "Semantic" implies neural/ML understanding, but we use heuristics
// - These aliases are provided for clarity and recommended for new code

/// Alias for `SemanticAnalyzer` - more honest name reflecting the actual implementation.
///
/// This analyzer uses character-frequency heuristics for similarity detection,
/// NOT neural network embeddings. Use this alias when you want to be explicit
/// about the implementation approach.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alias for `SemanticCompressor` - more honest name reflecting the actual implementation.
///
/// This compressor uses chunk-based heuristics with optional character-frequency
/// clustering, NOT neural semantic understanding. Use this alias when you want
/// to be explicit about the implementation approach.
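///
/// A drop-in usage sketch (crate path assumed):
///
/// ```
/// use infiniloom_engine::semantic::HeuristicCompressor;
///
/// // Identical to `SemanticCompressor::new()`; only the name differs.
/// let compressor = HeuristicCompressor::new();
/// let out = compressor.compress("short input").unwrap();
/// assert_eq!(out, "short input");
/// ```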
pub type HeuristicCompressor = SemanticCompressor;

/// Alias for `SemanticConfig` - more honest name.
pub type HeuristicCompressionConfig = SemanticConfig;

// ============================================================================
// Utility Functions
// ============================================================================

/// Compute cosine similarity between two vectors
///
/// Returns a value between -1.0 and 1.0, where 1.0 indicates identical
/// direction, 0.0 indicates orthogonal vectors, and -1.0 indicates
/// opposite direction.
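///
/// Computed as `dot(a, b) / (norm(a) * norm(b))`; for the unit-normalized
/// vectors produced by `SemanticAnalyzer::embed`, this reduces to a plain
/// dot product.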

/// # Note
/// This function is used by the embeddings feature for clustering and
/// is also tested directly. The `#[cfg_attr]` suppresses warnings in
/// builds without the embeddings feature.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        // Verify analyzer is created successfully
        // Model path is None by default (accessed via model_path() when embeddings enabled)
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        // Verify we can access the analyzer through the compressor
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        // Should complete without error
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    // Bug #6 tests - repetitive content compression
    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        // Test "sentence ".repeat(500) - exactly the reported bug case
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        // Result should be significantly smaller than original
        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        // Should contain the pattern and a compression marker
        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        // Test repeated lines
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        // Result should be significantly smaller
        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::new();
        // Non-repetitive content should not trigger repetition compression
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        // Short non-repetitive content should be returned as-is
        assert_eq!(result, content);
    }

    #[test]
    fn test_repetitive_with_variation() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        // Content with some repetition mixed with unique parts
        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5)); // Repeated pattern with variation
        }

        let result = compressor.compress(&content).unwrap();
        // This may or may not compress depending on pattern detection
        // Just verify it doesn't panic
        assert!(!result.is_empty());
    }

    // UTF-8 boundary safety tests for compress_repetitive
    #[test]
    fn test_repetitive_unicode_chinese() {
        let compressor = SemanticCompressor::new();
        // Chinese characters are 3 bytes each
        // Create repeating Chinese pattern
        let content = "中文测试 ".repeat(100); // Each repeat is 13 bytes
        let result = compressor.compress(&content).unwrap();

        // Should not panic and should produce valid UTF-8
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());

        // Should compress or return unchanged (not panic)
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_emoji() {
        let compressor = SemanticCompressor::new();
        // Emoji are 4 bytes each
        let content = "🎉🎊🎁 ".repeat(80); // Each repeat is 13 bytes (three 4-byte emoji plus a space)

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_mixed() {
        let compressor = SemanticCompressor::new();
        // Mix of 1-, 3-, and 4-byte characters
        let content = "a中🎉 ".repeat(60); // Each repeat is 9 bytes

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_cyrillic() {
        let compressor = SemanticCompressor::new();
        // Cyrillic characters are 2 bytes each
        let content = "Привет ".repeat(50);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_non_repetitive_unicode_boundary() {
        let compressor = SemanticCompressor::new();
        // Content where pattern detection would try various byte lengths
        // that don't align with UTF-8 boundaries
        let content = "世界和平".repeat(60); // No spaces, pure multi-byte

        let result = compressor.compress(&content).unwrap();
        // Should not panic even when pattern length iteration
        // hits non-UTF-8 boundaries
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_repetitive_unicode_line_based() {
        let compressor = SemanticCompressor::new();
        // Test line-based repetition detection with Unicode
        let content = "中文行\n".repeat(100);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }
}
951}