infiniloom_engine/semantic.rs

//! Semantic analysis and compression module
//!
//! This module provides semantic code understanding through embeddings,
//! enabling similarity search and intelligent code compression.
//!
//! # Feature: `embeddings`
//!
//! When the `embeddings` feature is enabled, this module provides:
//! - Embedding generation for code content (currently uses character-frequency heuristics)
//! - Cosine similarity computation between code snippets
//! - Clustering-based compression that groups similar code chunks
//!
//! ## Current Implementation Status
//!
//! **Important**: The current embeddings implementation uses a simple character-frequency
//! based algorithm, NOT neural network embeddings. This is a lightweight placeholder that
//! provides reasonable results for basic similarity detection without requiring external
//! model dependencies.
//!
//! Future versions may integrate actual transformer-based embeddings via:
//! - Candle (Rust-native ML framework)
//! - ONNX Runtime for pre-trained models
//! - External embedding services (OpenAI, Cohere, etc.)
//!
//! ## Without `embeddings` Feature
//!
//! Falls back to heuristic-based compression that:
//! - Splits content at paragraph boundaries
//! - Keeps every Nth chunk based on budget ratio
//! - No similarity computation (all operations return 0.0)
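//!
//! # Example
//!
//! A minimal usage sketch (the `infiniloom_engine::semantic` import path is
//! assumed from this file's location; adjust to the actual crate layout):
//!
//! ```ignore
//! use infiniloom_engine::semantic::{SemanticCompressor, SemanticConfig};
//!
//! let compressor = SemanticCompressor::with_config(SemanticConfig {
//!     budget_ratio: 0.5, // target roughly half of the original size
//!     ..SemanticConfig::default()
//! });
//! let compressed = compressor.compress("fn a() {}\n\nfn b() {}").unwrap();
//! assert!(compressed.len() <= "fn a() {}\n\nfn b() {}".len());
//! ```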

#[cfg(feature = "embeddings")]
use std::collections::HashMap;

/// Result type for semantic operations
pub type Result<T> = std::result::Result<T, SemanticError>;

/// Errors that can occur during semantic operations
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}

// ============================================================================
// Semantic Analyzer (for similarity and embeddings)
// ============================================================================

/// Semantic analyzer using code embeddings
///
/// When the `embeddings` feature is enabled, generates character-frequency
/// embeddings (see the module docs) and real similarity scores; the configured
/// model path is stored for future neural backends. Without the feature, all
/// similarity operations return 0.0.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Path to the embedding model (used when embeddings feature is enabled)
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Placeholder for non-embeddings build (maintains API compatibility)
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}

impl SemanticAnalyzer {
    /// Create a new semantic analyzer
    pub fn new() -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: None,
            #[cfg(not(feature = "embeddings"))]
            _model_path: None,
        }
    }

    /// Create a semantic analyzer with a custom model path
    ///
    /// The model path is used when the `embeddings` feature is enabled.
    /// Without the feature, the path is stored but not used.
    pub fn with_model(model_path: &str) -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: Some(model_path.to_owned()),
            #[cfg(not(feature = "embeddings"))]
            _model_path: Some(model_path.to_owned()),
        }
    }

    /// Get the configured model path (if any)
    #[cfg(feature = "embeddings")]
    pub fn model_path(&self) -> Option<&str> {
        self.model_path.as_deref()
    }

    /// Generate embeddings for code content
    ///
    /// # Current Implementation
    ///
    /// Uses a character-frequency based embedding algorithm that:
    /// 1. Creates a 384-dimensional vector (matching common transformer output size)
    /// 2. Accumulates weighted character frequencies based on position
    /// 3. Normalizes to unit length for cosine similarity
    ///
    /// This is a **lightweight placeholder** that provides reasonable similarity
    /// estimates for code without requiring ML model dependencies. It captures:
    /// - Character distribution patterns
    /// - Position-weighted frequency (earlier chars weighted more)
    /// - Basic structural patterns through punctuation distribution
    ///
    /// For production use cases requiring high accuracy, consider integrating
    /// actual transformer embeddings.
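    ///
    /// # Example
    ///
    /// A minimal sketch of the embedding contract, assuming the `embeddings`
    /// feature is enabled (the assertions follow from the algorithm above):
    ///
    /// ```ignore
    /// let analyzer = SemanticAnalyzer::new();
    /// let v = analyzer.embed("fn main() {}").unwrap();
    /// assert_eq!(v.len(), 384);
    /// // Non-empty input yields a unit-length vector after L2 normalization
    /// let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
    /// assert!((norm - 1.0).abs() < 1e-5);
    /// ```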
    #[cfg(feature = "embeddings")]
    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
        // Character-frequency based embedding (see doc comment for rationale)
        let mut embedding = vec![0.0f32; 384];
        for (i, c) in content.chars().enumerate() {
            let idx = (c as usize) % 384;
            // Position-weighted contribution: earlier characters contribute more
            embedding[idx] += 1.0 / ((i + 1) as f32);
        }
        // L2 normalize for cosine similarity
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in &mut embedding {
                *x /= norm;
            }
        }
        Ok(embedding)
    }

    /// Generate embeddings (stub when feature disabled)
    #[cfg(not(feature = "embeddings"))]
    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
        Ok(vec![0.0; 384])
    }

    /// Calculate similarity between two code snippets
    #[cfg(feature = "embeddings")]
    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
        let emb_a = self.embed(a)?;
        let emb_b = self.embed(b)?;
        Ok(cosine_similarity(&emb_a, &emb_b))
    }

    /// Calculate similarity (stub when feature disabled)
    #[cfg(not(feature = "embeddings"))]
    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
        Ok(0.0)
    }
}

impl Default for SemanticAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// Semantic Compressor (for reducing content while preserving meaning)
// ============================================================================

/// Configuration for semantic compression
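///
/// # Example
///
/// A sketch of overriding selected fields via struct update syntax (the
/// values shown are illustrative, not recommendations):
///
/// ```ignore
/// let config = SemanticConfig {
///     budget_ratio: 0.3, // keep roughly 30% of the original
///     ..SemanticConfig::default()
/// };
/// ```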
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Similarity threshold for clustering (0.0 - 1.0)
    pub similarity_threshold: f32,
    /// Minimum chunk size in characters
    pub min_chunk_size: usize,
    /// Maximum chunk size in characters
    pub max_chunk_size: usize,
    /// Budget ratio (0.0 - 1.0) - target size relative to original
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}

/// A chunk of code
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// The original content
    pub content: String,
    /// Start offset in original content
    pub start: usize,
    /// End offset in original content
    pub end: usize,
    /// Embedding vector (when computed)
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment
    pub cluster_id: Option<usize>,
}

/// Semantic compressor for code content
///
/// Uses embeddings-based clustering when the `embeddings` feature is enabled,
/// otherwise falls back to heuristic-based compression.
pub struct SemanticCompressor {
    config: SemanticConfig,
    /// Semantic analyzer for generating embeddings and computing similarity
    analyzer: SemanticAnalyzer,
}

impl SemanticCompressor {
    /// Create a new semantic compressor with default config
    pub fn new() -> Self {
        Self::with_config(SemanticConfig::default())
    }

    /// Create a new semantic compressor with custom config
    pub fn with_config(config: SemanticConfig) -> Self {
        Self { config, analyzer: SemanticAnalyzer::new() }
    }

    /// Get a reference to the internal semantic analyzer
    ///
    /// This allows access to the analyzer for similarity computations
    /// or custom embedding operations.
    pub fn analyzer(&self) -> &SemanticAnalyzer {
        &self.analyzer
    }

    /// Compress content semantically
    ///
    /// When the `embeddings` feature is enabled, clusters similar code chunks
    /// by embedding similarity and keeps one representative per cluster (the
    /// embeddings themselves are character-frequency based; see module docs).
    ///
    /// Without the feature, falls back to heuristic-based compression.
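    ///
    /// # Example
    ///
    /// A sketch of the repetitive-content path (marker wording is an
    /// implementation detail of `compress_repetitive` below):
    ///
    /// ```ignore
    /// let compressor = SemanticCompressor::new();
    /// let input = "sentence ".repeat(500);
    /// let output = compressor.compress(&input).unwrap();
    /// assert!(output.len() < input.len() / 2);
    /// assert!(output.contains("repeated"));
    /// ```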
    pub fn compress(&self, content: &str) -> Result<String> {
        // First, check for repetitive content (Bug #6 fix)
        if let Some(compressed) = self.compress_repetitive(content) {
            return Ok(compressed);
        }

        #[cfg(feature = "embeddings")]
        {
            return self.compress_with_embeddings(content);
        }

        #[cfg(not(feature = "embeddings"))]
        {
            self.compress_heuristic(content)
        }
    }

    /// Detect and compress repetitive content (Bug #6 fix)
    ///
    /// Handles cases like "sentence ".repeat(500) by detecting the repeated pattern
    /// and returning a compressed representation.
    ///
    /// This function is UTF-8 safe - it only slices at valid character boundaries.
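    ///
    /// For example, with the default `budget_ratio` of 0.5, an input of
    /// `"abc ".repeat(100)` would collapse to five shown instances followed by
    /// a `/* ... pattern repeated 100 times (showing 5) ... */` marker
    /// (instances = ceil(100 * 0.5) clamped to at most 5, per the code below).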
    fn compress_repetitive(&self, content: &str) -> Option<String> {
        // Only process content above a minimum threshold
        if content.len() < 200 {
            return None;
        }

        // Try to find a repeating pattern
        // Start with small patterns and work up
        // We iterate byte positions but only consider those that are valid UTF-8 boundaries
        for pattern_len in 1..=100.min(content.len() / 3) {
            // Skip if this byte position is not a valid UTF-8 character boundary
            if !content.is_char_boundary(pattern_len) {
                continue;
            }

            let pattern = &content[..pattern_len];

            // Skip patterns that are just whitespace
            if pattern.chars().all(|c| c.is_whitespace()) {
                continue;
            }

            // Count how many times this pattern repeats consecutively
            let mut count = 0;
            let mut pos = 0;
            while pos + pattern_len <= content.len() {
                // Ensure both slice boundaries are valid UTF-8
                if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
                    break;
                }
                if &content[pos..pos + pattern_len] == pattern {
                    count += 1;
                    pos += pattern_len;
                } else {
                    break;
                }
            }

            // If pattern repeats enough times and covers most of the content
            let coverage = (count * pattern_len) as f32 / content.len() as f32;
            if count >= 3 && coverage >= 0.8 {
                // Calculate how many instances to keep based on budget_ratio
                let instances_to_show = (count as f32 * self.config.budget_ratio)
                    .ceil()
                    .clamp(1.0, 5.0) as usize;

                let shown_content = pattern.repeat(instances_to_show);
                // Safe: count * pattern_len is already at a valid boundary (start of next pattern or end)
                let remainder_start = count * pattern_len;
                let remainder = if remainder_start <= content.len()
                    && content.is_char_boundary(remainder_start)
                {
                    &content[remainder_start..]
                } else {
                    ""
                };

                let result = if remainder.is_empty() {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
                        shown_content.trim_end(),
                        count,
                        instances_to_show
                    )
                } else {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
                        shown_content.trim_end(),
                        count,
                        instances_to_show,
                        remainder.trim()
                    )
                };

                return Some(result);
            }
        }

        // Also detect line-based repetition (same line repeated many times)
        let lines: Vec<&str> = content.lines().collect();
        if lines.len() >= 3 {
            let mut line_counts: std::collections::HashMap<&str, usize> =
                std::collections::HashMap::new();
            for line in &lines {
                *line_counts.entry(*line).or_insert(0) += 1;
            }

            // Find the most repeated line
            if let Some((repeated_line, count)) = line_counts
                .iter()
                .filter(|(line, _)| !line.trim().is_empty())
                .max_by_key(|(_, count)| *count)
            {
                let repetition_ratio = *count as f32 / lines.len() as f32;
                if *count >= 3 && repetition_ratio >= 0.5 {
                    // Build compressed output preserving unique lines
                    let mut result = String::new();
                    let mut consecutive_count = 0;
                    let mut last_was_repeated = false;

                    for line in &lines {
                        if *line == *repeated_line {
                            consecutive_count += 1;
                            if !last_was_repeated {
                                if !result.is_empty() {
                                    result.push('\n');
                                }
                                result.push_str(line);
                            }
                            last_was_repeated = true;
                        } else {
                            if last_was_repeated && consecutive_count > 1 {
                                result.push_str(&format!(
                                    "\n/* ... above line repeated {} times ... */",
                                    consecutive_count
                                ));
                            }
                            consecutive_count = 0;
                            last_was_repeated = false;
                            if !result.is_empty() {
                                result.push('\n');
                            }
                            result.push_str(line);
                        }
                    }

                    if last_was_repeated && consecutive_count > 1 {
                        result.push_str(&format!(
                            "\n/* ... above line repeated {} times ... */",
                            consecutive_count
                        ));
                    }

                    // Only return if we actually compressed significantly
                    if result.len() < content.len() / 2 {
                        return Some(result);
                    }
                }
            }
        }

        None
    }

    /// Split content into semantic chunks (Bug #6 fix - handles content without \n\n)
    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
        let mut chunks = Vec::new();
        let mut current_start = 0;

        // First try: Split on double newlines (paragraph-like boundaries)
        for (i, _) in content.match_indices("\n\n") {
            if i > current_start && i - current_start >= self.config.min_chunk_size {
                let chunk_content = &content[current_start..i];
                if chunk_content.len() <= self.config.max_chunk_size {
                    chunks.push(CodeChunk {
                        content: chunk_content.to_owned(),
                        start: current_start,
                        end: i,
                        embedding: None,
                        cluster_id: None,
                    });
                }
                current_start = i + 2;
            }
        }

        // Handle remaining content
        if current_start < content.len() {
            let remaining = &content[current_start..];
            if remaining.len() >= self.config.min_chunk_size {
                chunks.push(CodeChunk {
                    content: remaining.to_owned(),
                    start: current_start,
                    end: content.len(),
                    embedding: None,
                    cluster_id: None,
                });
            }
        }

        // Fallback: If no chunks found (no \n\n separators), try single newlines
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices('\n') {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 1;
                }
            }
            // Handle remaining after single newline split
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Second fallback: If still no chunks, split by sentence boundaries (. followed by space)
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices(". ") {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..=i]; // include the period
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i + 1,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 2;
                }
            }
            // Handle remaining
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Final fallback: If content is large but can't be split, force split by max_chunk_size
        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
            let mut pos = 0;
            while pos < content.len() {
                // Nudge the end forward to the next UTF-8 boundary so the
                // slice below cannot panic on a multi-byte character that
                // straddles the max_chunk_size limit.
                let mut end = (pos + self.config.max_chunk_size).min(content.len());
                while end < content.len() && !content.is_char_boundary(end) {
                    end += 1;
                }
                chunks.push(CodeChunk {
                    content: content[pos..end].to_owned(),
                    start: pos,
                    end,
                    embedding: None,
                    cluster_id: None,
                });
                pos = end;
            }
        }

        chunks
    }

    /// Compress using heuristic methods (fallback when embeddings unavailable)
    ///
    /// Bug #4 fix: Make budget_ratio more effective for all content types
    fn compress_heuristic(&self, content: &str) -> Result<String> {
        let chunks = self.split_into_chunks(content);

        // Bug #4 fix: When no chunks can be created but content is large enough,
        // apply character-level truncation based on budget_ratio.
        // Only truncate if:
        // 1. No chunks could be created
        // 2. Content is larger than min_chunk_size (so small content passes through)
        // 3. budget_ratio would actually reduce the size meaningfully
        if chunks.is_empty() {
            // Only apply truncation for content larger than min_chunk_size
            // Small content should pass through unchanged
            if content.len() > self.config.min_chunk_size && self.config.budget_ratio < 1.0 {
                let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
                if target_len > 0 && target_len < content.len() {
                    // Find a safe truncation point (word/line boundary)
                    let truncate_at = find_safe_truncation_point(content, target_len);
                    if truncate_at < content.len() {
                        let truncated = &content[..truncate_at];
                        return Ok(format!(
                            "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
                            truncated.trim_end(),
                            self.config.budget_ratio * 100.0,
                            truncate_at,
                            content.len()
                        ));
                    }
                }
            }
            return Ok(content.to_owned());
        }

        // Keep every Nth chunk based on budget ratio
        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
        let step = chunks.len() / target_chunks.max(1);

        let mut result = String::new();
        let mut kept = 0;

        for (i, chunk) in chunks.iter().enumerate() {
            if i % step.max(1) == 0 && kept < target_chunks {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&chunk.content);
                kept += 1;
            }
        }

        // Add truncation marker if we removed content
        if kept < chunks.len() {
            result.push_str(&format!(
                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
                chunks.len() - kept,
                (kept as f32 / chunks.len() as f32) * 100.0
            ));
        }

        Ok(result)
    }

    /// Compress by clustering embeddings (character-frequency based; see module docs)
    #[cfg(feature = "embeddings")]
    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
        let mut chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        // Generate embeddings for each chunk
        for chunk in &mut chunks {
            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
        }

        // Cluster similar chunks
        let clusters = self.cluster_chunks(&chunks)?;

        // Select representative from each cluster
        let mut result = String::new();
        for cluster in clusters.values() {
            if let Some(representative) = self.select_representative(cluster) {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&representative.content);
            }
        }

        Ok(result)
    }

    /// Cluster chunks by embedding similarity
    #[cfg(feature = "embeddings")]
    fn cluster_chunks<'a>(
        &self,
        chunks: &'a [CodeChunk],
    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
        let mut next_cluster = 0;

        for chunk in chunks {
            let embedding = chunk
                .embedding
                .as_ref()
                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;

            // Find existing cluster with similar embedding
            let mut target_cluster = None;
            for (&cluster_id, cluster_chunks) in &clusters {
                if let Some(first) = cluster_chunks.first() {
                    if let Some(ref first_emb) = first.embedding {
                        let similarity = cosine_similarity(embedding, first_emb);
                        if similarity >= self.config.similarity_threshold {
                            target_cluster = Some(cluster_id);
                            break;
                        }
                    }
                }
            }

            if let Some(cluster_id) = target_cluster {
                if let Some(cluster) = clusters.get_mut(&cluster_id) {
                    cluster.push(chunk);
                }
            } else {
                clusters.insert(next_cluster, vec![chunk]);
                next_cluster += 1;
            }
        }

        Ok(clusters)
    }

    /// Select the best representative from a cluster
    #[cfg(feature = "embeddings")]
    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
        // Select the longest chunk as representative (most informative)
        chunks.iter().max_by_key(|c| c.content.len()).copied()
    }
}

impl Default for SemanticCompressor {
    fn default() -> Self {
        Self::new()
    }
}

// ============================================================================
// Honest Type Aliases
// ============================================================================
// The names below more accurately describe the implementation:
// - "Semantic" implies neural/ML understanding, but we use heuristics
// - These aliases are provided for clarity and recommended for new code

/// Alias for `SemanticAnalyzer` - more honest name reflecting the actual implementation.
///
/// This analyzer uses character-frequency heuristics for similarity detection,
/// NOT neural network embeddings. Use this alias when you want to be explicit
/// about the implementation approach.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alias for `SemanticCompressor` - more honest name reflecting the actual implementation.
///
/// This compressor uses chunk-based heuristics with optional character-frequency
/// clustering, NOT neural semantic understanding. Use this alias when you want
/// to be explicit about the implementation approach.
pub type HeuristicCompressor = SemanticCompressor;

/// Alias for `SemanticConfig` - more honest name.
pub type HeuristicCompressionConfig = SemanticConfig;

// ============================================================================
// Utility Functions
// ============================================================================

/// Find a safe truncation point in content (word or line boundary)
///
/// Used by compress_heuristic to ensure we don't cut in the middle of a word
/// or multi-byte UTF-8 character.
fn find_safe_truncation_point(content: &str, target_len: usize) -> usize {
    if target_len >= content.len() {
        return content.len();
    }

    // First, ensure we're at a valid UTF-8 boundary
    let mut truncate_at = target_len;
    while truncate_at > 0 && !content.is_char_boundary(truncate_at) {
        truncate_at -= 1;
    }

    // Try to find a line boundary (newline) near the target
    if let Some(newline_pos) = content[..truncate_at].rfind('\n') {
        if newline_pos > target_len / 2 {
            // Found a newline that's not too far back
            return newline_pos;
        }
    }

    // Fall back to word boundary (space)
    if let Some(space_pos) = content[..truncate_at].rfind(' ') {
        if space_pos > target_len / 2 {
            return space_pos;
        }
    }

    // No good boundary found, use the UTF-8 safe position
    truncate_at
}

/// Compute cosine similarity between two vectors
///
/// Returns a value between -1.0 and 1.0, where 1.0 indicates identical
/// direction, 0.0 indicates orthogonal vectors, and -1.0 indicates
/// opposite direction.
///
/// # Note
/// This function is used by the embeddings feature for clustering and
/// is also tested directly. The `#[cfg_attr]` suppresses warnings in
/// builds without the embeddings feature.
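///
/// Formally, for vectors `a` and `b`:
/// `cos(a, b) = (a · b) / (‖a‖ ‖b‖)`, with 0.0 returned for mismatched
/// lengths or zero-norm inputs (a defensive convention, not a mathematical one).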
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}

// ============================================================================
// Tests
// ============================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        // Verify analyzer is created successfully
        // Model path is None by default (accessed via model_path() when embeddings enabled)
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer); // Explicitly drop to satisfy lint
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        // Verify we can access the analyzer through the compressor
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        // Should complete without error
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    // Bug #6 tests - repetitive content compression
    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        // Test "sentence ".repeat(500) - exactly the reported bug case
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        // Result should be significantly smaller than original
        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        // Should contain the pattern and a compression marker
        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        // Test repeated lines
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        // Result should be significantly smaller
        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::new();
        // Non-repetitive content should not trigger repetition compression
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        // Short non-repetitive content should be returned as-is
        assert_eq!(result, content);
    }

    #[test]
    fn test_repetitive_with_variation() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        // Content with some repetition mixed with unique parts
        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5)); // Repeated pattern with variation
        }

        let result = compressor.compress(&content).unwrap();
        // This may or may not compress depending on pattern detection
        // Just verify it doesn't panic
        assert!(!result.is_empty());
    }

    // UTF-8 boundary safety tests for compress_repetitive
    #[test]
    fn test_repetitive_unicode_chinese() {
        let compressor = SemanticCompressor::new();
        // Chinese characters are 3 bytes each
        // Create repeating Chinese pattern
        let content = "ไธญๆ–‡ๆต‹่ฏ• ".repeat(100); // Each repeat is 13 bytes
        let result = compressor.compress(&content).unwrap();

        // Should not panic and should produce valid UTF-8
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());

        // Should compress or return unchanged (not panic)
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_emoji() {
        let compressor = SemanticCompressor::new();
        // Emoji are 4 bytes each
        let content = "๐ŸŽ‰๐ŸŽŠ๐ŸŽ ".repeat(80); // Each repeat is 13 bytes

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_mixed() {
        let compressor = SemanticCompressor::new();
        // Mix of 1-, 3-, and 4-byte characters
        let content = "aไธญ๐ŸŽ‰ ".repeat(60); // Each repeat is 9 bytes

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_cyrillic() {
        let compressor = SemanticCompressor::new();
        // Cyrillic characters are 2 bytes each
        let content = "ะŸั€ะธะฒะตั‚ ".repeat(50);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_non_repetitive_unicode_boundary() {
        let compressor = SemanticCompressor::new();
        // Content where pattern detection would try various byte lengths
        // that don't align with UTF-8 boundaries
        let content = "ไธ–็•Œๅ’Œๅนณ".repeat(60); // No spaces, pure multi-byte

        let result = compressor.compress(&content).unwrap();
        // Should not panic even when pattern length iteration
        // hits non-UTF-8 boundaries
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_repetitive_unicode_line_based() {
        let compressor = SemanticCompressor::new();
        // Test line-based repetition detection with Unicode
        let content = "ไธญๆ–‡่กŒ\n".repeat(100);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    // ==========================================================================
    // Additional coverage tests
    // ==========================================================================

    #[test]
    fn test_semantic_error_display() {
        let err1 = SemanticError::ModelLoadError("test error".to_string());
        assert!(err1.to_string().contains("Model loading failed"));
        assert!(err1.to_string().contains("test error"));

        let err2 = SemanticError::EmbeddingError("embed fail".to_string());
        assert!(err2.to_string().contains("Embedding generation failed"));

        let err3 = SemanticError::ClusteringError("cluster fail".to_string());
        assert!(err3.to_string().contains("Clustering failed"));

        let err4 = SemanticError::FeatureNotEnabled;
        assert!(err4.to_string().contains("embeddings feature not enabled"));
    }

    #[test]
    fn test_semantic_error_debug() {
        let err = SemanticError::ModelLoadError("debug test".to_string());
        let debug_str = format!("{:?}", err);
        assert!(debug_str.contains("ModelLoadError"));
    }

    #[test]
    fn test_semantic_analyzer_default() {
        let analyzer = SemanticAnalyzer::default();
        // Should work same as new()
        let result = analyzer.embed("test");
        assert!(result.is_ok());
    }

    #[test]
    fn test_semantic_analyzer_debug() {
        let analyzer = SemanticAnalyzer::new();
        let debug_str = format!("{:?}", analyzer);
        assert!(debug_str.contains("SemanticAnalyzer"));
    }

    #[test]
    fn test_semantic_analyzer_embed_empty() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_embed_produces_384_dims() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("some code content").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_similarity_same_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello world", "hello world").unwrap();
        // Same content should have high similarity (1.0 in embeddings mode, 0.0 in fallback)
        #[cfg(feature = "embeddings")]
        assert!((result - 1.0).abs() < 0.01);
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
    }

    #[test]
    fn test_semantic_analyzer_similarity_different_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello", "goodbye").unwrap();
        // Result should be valid (0.0 in fallback mode)
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
        #[cfg(feature = "embeddings")]
        assert!(result >= -1.0 && result <= 1.0);
    }

    #[test]
    fn test_semantic_config_custom() {
        let config = SemanticConfig {
            similarity_threshold: 0.9,
            min_chunk_size: 50,
            max_chunk_size: 5000,
            budget_ratio: 0.3,
        };
        assert_eq!(config.similarity_threshold, 0.9);
        assert_eq!(config.min_chunk_size, 50);
        assert_eq!(config.max_chunk_size, 5000);
        assert_eq!(config.budget_ratio, 0.3);
    }

    #[test]
    fn test_semantic_config_clone() {
        let config = SemanticConfig::default();
        let cloned = config.clone();
        assert_eq!(cloned.similarity_threshold, config.similarity_threshold);
        assert_eq!(cloned.budget_ratio, config.budget_ratio);
    }

    #[test]
    fn test_semantic_config_debug() {
        let config = SemanticConfig::default();
        let debug_str = format!("{:?}", config);
        assert!(debug_str.contains("SemanticConfig"));
        assert!(debug_str.contains("similarity_threshold"));
    }

    #[test]
    fn test_code_chunk_debug() {
        let chunk = CodeChunk {
            content: "test content".to_string(),
            start: 0,
            end: 12,
            embedding: None,
            cluster_id: None,
        };
        let debug_str = format!("{:?}", chunk);
        assert!(debug_str.contains("CodeChunk"));
        assert!(debug_str.contains("test content"));
    }

    #[test]
    fn test_code_chunk_clone() {
        let chunk = CodeChunk {
            content: "original".to_string(),
            start: 0,
            end: 8,
            embedding: Some(vec![0.1, 0.2, 0.3]),
            cluster_id: Some(5),
        };
        let cloned = chunk.clone();
        assert_eq!(cloned.content, "original");
        assert_eq!(cloned.start, 0);
        assert_eq!(cloned.end, 8);
        assert_eq!(cloned.embedding, Some(vec![0.1, 0.2, 0.3]));
        assert_eq!(cloned.cluster_id, Some(5));
    }

    #[test]
    fn test_semantic_compressor_default() {
        let compressor = SemanticCompressor::default();
        let result = compressor.compress("test").unwrap();
        assert_eq!(result, "test");
    }

    #[test]
    fn test_split_into_chunks_single_newline_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 1000,
            ..Default::default()
        });

        // Content with only single newlines (no \n\n)
        let content = "Line 1 with content\nLine 2 with content\nLine 3 with content";
        let chunks = compressor.split_into_chunks(content);
        // Should use single newline fallback
        assert!(!chunks.is_empty() || content.len() < 5);
    }

    #[test]
    fn test_split_into_chunks_sentence_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        // Content with sentences but no newlines
        let content = "First sentence here. Second sentence here. Third sentence here.";
        let chunks = compressor.split_into_chunks(content);
        // Should use sentence boundary fallback
        assert!(!chunks.is_empty() || content.len() < 10);
    }

    #[test]
    fn test_split_into_chunks_force_split() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100, // Higher than content length so normal chunking fails
            max_chunk_size: 20,  // Lower than content length to trigger force split
            ..Default::default()
        });

        // Content without any splitting characters, longer than max_chunk_size
        // but shorter than min_chunk_size (so normal chunking produces empty result)
        let content = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        let chunks = compressor.split_into_chunks(content);
        // Should force split by max_chunk_size when no other splitting works
        assert!(chunks.len() >= 2, "Expected at least 2 chunks from force split, got {}", chunks.len());
    }

    #[test]
    fn test_split_into_chunks_empty() {
        let compressor = SemanticCompressor::new();
        let chunks = compressor.split_into_chunks("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_split_into_chunks_below_min_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "short";
        let chunks = compressor.split_into_chunks(content);
        // Content too short for min_chunk_size
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_compress_heuristic_empty_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 1000, // Force no chunks to be created
            ..Default::default()
        });

        let content = "short content";
        let result = compressor.compress_heuristic(content).unwrap();
        // Should return original when no chunks created
        assert_eq!(result, content);
    }

    #[test]
    fn test_compress_heuristic_multiple_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 100,
            budget_ratio: 0.3,
            ..Default::default()
        });

        let content = "First chunk content here\n\nSecond chunk content here\n\nThird chunk content here\n\nFourth chunk content";
        let result = compressor.compress_heuristic(content).unwrap();
        // Should have compression marker if chunks were removed
        assert!(result.contains("chunk") || result.contains("compressed"));
    }

    #[test]
    fn test_cosine_similarity_different_lengths() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0); // Different lengths should return 0
    }

    #[test]
    fn test_cosine_similarity_zero_vectors() {
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![1.0, 2.0, 3.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0); // Zero norm should return 0
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![-1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim + 1.0).abs() < 0.001); // Opposite directions = -1.0
    }

    #[test]
    fn test_cosine_similarity_normalized() {
        let a = vec![0.6, 0.8, 0.0];
        let b = vec![0.6, 0.8, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_compress_repetitive_short_content() {
        let compressor = SemanticCompressor::new();
        // Content below 200 chars should not trigger repetition compression
        let content = "short ".repeat(10); // 60 chars
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_whitespace_only() {
        let compressor = SemanticCompressor::new();
        // Whitespace-only patterns should be skipped
        let content = "   ".repeat(100);
        let result = compressor.compress_repetitive(&content);
        // Should not compress whitespace-only patterns
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_low_coverage() {
        let compressor = SemanticCompressor::new();
        // Pattern that doesn't cover 80% of content
        let mut content = "pattern ".repeat(5);
        content.push_str(&"x".repeat(200)); // Add non-repeating content
        let result = compressor.compress_repetitive(&content);
        // Low coverage should not trigger compression
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_line_low_ratio() {
        let compressor = SemanticCompressor::new();
        // Lines where no single line repeats enough
        let content = (0..20).map(|i| format!("unique line {}", i)).collect::<Vec<_>>().join("\n");
        let result = compressor.compress_repetitive(&content);
        // No significant repetition
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_mixed_with_unique() {
        let compressor = SemanticCompressor::new();
        // Repeated line mixed with unique lines
        let mut lines = vec![];
        for i in 0..50 {
            if i % 2 == 0 {
                lines.push("repeated line");
            } else {
                lines.push("unique line");
            }
        }
        let content = lines.join("\n");
        let result = compressor.compress(&content).unwrap();
        // Should handle mixed content
        assert!(!result.is_empty());
    }

    #[test]
    fn test_compress_repeated_sentence_no_panic() {
        let compressor = SemanticCompressor::new();
        // The repeated sentence is itself a consecutive repeating pattern
        // (well within the 100-byte search window), so compress_repetitive
        // may detect and compress it; this test only verifies that detection
        // runs without panicking, whatever it returns.
        let content = "The quick brown fox jumps over the lazy dog. ".repeat(5);
        let result = compressor.compress_repetitive(&content);
        drop(result);
    }

    #[test]
    fn test_type_aliases() {
        // Test that type aliases work correctly
        let _analyzer: CharacterFrequencyAnalyzer = SemanticAnalyzer::new();
        let _compressor: HeuristicCompressor = SemanticCompressor::new();
        let _config: HeuristicCompressionConfig = SemanticConfig::default();
    }

    #[test]
    fn test_compress_preserves_content_structure() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 500,
            budget_ratio: 1.0, // Keep everything
            ..Default::default()
        });

        let content = "def foo():\n    pass\n\ndef bar():\n    pass";
        let result = compressor.compress(content).unwrap();
        // With budget_ratio 1.0, should keep most content
        assert!(result.contains("foo") || result.contains("bar"));
    }

    #[test]
    fn test_split_chunks_respects_max_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 50,
            ..Default::default()
        });

        let content = "A very long chunk that exceeds the max size limit\n\nAnother chunk";
        let chunks = compressor.split_into_chunks(content);

        for chunk in &chunks {
            assert!(
                chunk.content.len() <= 50,
                "Chunk size {} exceeds max 50",
                chunk.content.len()
            );
        }
    }

    #[test]
    fn test_compress_repetitive_with_remainder() {
        let compressor = SemanticCompressor::new();
        // Pattern that repeats but has a small remainder
        let mut content = "abc ".repeat(100);
        content.push_str("xyz"); // Add non-repeating remainder

        let result = compressor.compress(&content).unwrap();
        // Should compress and handle remainder
        assert!(!result.is_empty());
    }

    #[test]
    fn test_compressor_analyzer_method() {
        let compressor = SemanticCompressor::new();
        let analyzer = compressor.analyzer();

        // Verify the analyzer works
        let embed_result = analyzer.embed("test code");
        assert!(embed_result.is_ok());
    }

    #[test]
    fn test_code_chunk_with_embedding_and_cluster() {
        let chunk = CodeChunk {
            content: "fn main() {}".to_string(),
            start: 0,
            end: 12,
            embedding: Some(vec![0.5; 384]),
            cluster_id: Some(3),
        };

        assert_eq!(chunk.content, "fn main() {}");
        assert_eq!(chunk.start, 0);
        assert_eq!(chunk.end, 12);
        assert!(chunk.embedding.is_some());
        assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384);
        assert_eq!(chunk.cluster_id, Some(3));
    }

    #[test]
    fn test_compress_very_long_repetitive() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.2, // Aggressive compression
            ..Default::default()
        });

        // Very long repetitive content
        let content = "repeated_token ".repeat(1000);
        let result = compressor.compress(&content).unwrap();

        // Should significantly compress
        assert!(result.len() < content.len() / 3);
        assert!(result.contains("repeated"));
    }

    #[test]
    fn test_semantic_result_type_ok() {
        let result: Result<String> = Ok("success".to_string());
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "success");
    }

    #[test]
    fn test_semantic_result_type_err() {
        let result: Result<String> = Err(SemanticError::FeatureNotEnabled);
        assert!(result.is_err());
    }

    // Bug #4 fix tests - budget_ratio effectiveness
    #[test]
    fn test_find_safe_truncation_point_basic() {
        let content = "Hello world this is a test";
        let point = find_safe_truncation_point(content, 15);
        // Should find a word boundary
        assert!(content.is_char_boundary(point));
        assert!(point <= 15 || point == content.len());
    }

    #[test]
    fn test_find_safe_truncation_point_newline() {
        let content = "Line one\nLine two\nLine three";
        let point = find_safe_truncation_point(content, 20);
        // Should prefer newline boundary
        assert!(content.is_char_boundary(point));
    }

    #[test]
    fn test_find_safe_truncation_point_unicode() {
        let content = "Hello ไธ–็•Œ test";
        let point = find_safe_truncation_point(content, 10);
        // Should not cut in middle of UTF-8 character
        assert!(content.is_char_boundary(point));
    }

    #[test]
    fn test_find_safe_truncation_point_beyond_length() {
        let content = "short";
        let point = find_safe_truncation_point(content, 100);
        assert_eq!(point, content.len());
    }

    #[test]
    fn test_budget_ratio_affects_large_content() {
        // Test that budget_ratio affects compression of content with paragraph breaks
        // This tests the chunk-based compression path
        let content = (0..20)
            .map(|i| format!("This is paragraph number {} with some content to fill it out nicely.", i))
            .collect::<Vec<_>>()
            .join("\n\n");

        // Test with different budget ratios
        let compressor_30 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            min_chunk_size: 20,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.8,
            min_chunk_size: 20,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let result_30 = compressor_30.compress(&content).unwrap();
        let result_80 = compressor_80.compress(&content).unwrap();

        // Lower budget ratio should produce shorter output
        assert!(
            result_30.len() < result_80.len(),
            "30% budget ({}) should be smaller than 80% budget ({})",
            result_30.len(),
            result_80.len()
        );

        // Both should indicate compression occurred
        assert!(
            result_30.contains("compressed") || result_30.len() < content.len(),
            "30% should show compression indicator"
        );
    }

    #[test]
    fn test_budget_ratio_one_returns_original() {
        let content = "Some content without chunk boundaries";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 1.0, // Keep everything
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();
        // With budget_ratio 1.0, should return original content
        assert_eq!(result, content);
    }
}