infiniloom_engine/
semantic.rs

1//! Semantic analysis and compression module
2//!
3//! This module provides semantic code understanding through embeddings,
4//! enabling similarity search and intelligent code compression.
5//!
6//! # Feature: `embeddings`
7//!
8//! When the `embeddings` feature is enabled, this module provides:
9//! - Embedding generation for code content (currently uses character-frequency heuristics)
10//! - Cosine similarity computation between code snippets
11//! - Clustering-based compression that groups similar code chunks
12//!
13//! ## Current Implementation Status
14//!
15//! **Important**: The current embeddings implementation uses a simple character-frequency
16//! based algorithm, NOT neural network embeddings. This is a lightweight placeholder that
17//! provides reasonable results for basic similarity detection without requiring external
18//! model dependencies.
19//!
20//! Future versions may integrate actual transformer-based embeddings via:
21//! - Candle (Rust-native ML framework)
22//! - ONNX Runtime for pre-trained models
23//! - External embedding services (OpenAI, Cohere, etc.)
24//!
25//! ## Without `embeddings` Feature
26//!
27//! Falls back to heuristic-based compression that:
28//! - Splits content at paragraph boundaries
29//! - Keeps every Nth chunk based on budget ratio
30//! - No similarity computation (all operations return 0.0)
31
32#[cfg(feature = "embeddings")]
33use std::collections::HashMap;
34
35/// Result type for semantic operations
36pub type Result<T> = std::result::Result<T, SemanticError>;
37
38/// Errors that can occur during semantic operations
39#[derive(Debug, thiserror::Error)]
40pub enum SemanticError {
41    #[error("Model loading failed: {0}")]
42    ModelLoadError(String),
43
44    #[error("Embedding generation failed: {0}")]
45    EmbeddingError(String),
46
47    #[error("Clustering failed: {0}")]
48    ClusteringError(String),
49
50    #[error("Feature not available: embeddings feature not enabled")]
51    FeatureNotEnabled,
52}
53
54// ============================================================================
55// Semantic Analyzer (for similarity and embeddings)
56// ============================================================================
57
58/// Semantic analyzer using code embeddings
59///
60/// When the `embeddings` feature is enabled, uses the configured model path
61/// for neural network-based embeddings. Without the feature, provides
62/// heuristic-based similarity estimates.
63#[derive(Debug)]
64pub struct SemanticAnalyzer {
65    /// Path to the embedding model (used when embeddings feature is enabled)
66    #[cfg(feature = "embeddings")]
67    model_path: Option<String>,
68    /// Placeholder for non-embeddings build (maintains API compatibility)
69    #[cfg(not(feature = "embeddings"))]
70    _model_path: Option<String>,
71}
72
73impl SemanticAnalyzer {
74    /// Create a new semantic analyzer
75    pub fn new() -> Self {
76        Self {
77            #[cfg(feature = "embeddings")]
78            model_path: None,
79            #[cfg(not(feature = "embeddings"))]
80            _model_path: None,
81        }
82    }
83
84    /// Create a semantic analyzer with a custom model path
85    ///
86    /// The model path is used when the `embeddings` feature is enabled.
87    /// Without the feature, the path is stored but not used.
88    pub fn with_model(model_path: &str) -> Self {
89        Self {
90            #[cfg(feature = "embeddings")]
91            model_path: Some(model_path.to_owned()),
92            #[cfg(not(feature = "embeddings"))]
93            _model_path: Some(model_path.to_owned()),
94        }
95    }
96
97    /// Get the configured model path (if any)
98    #[cfg(feature = "embeddings")]
99    pub fn model_path(&self) -> Option<&str> {
100        self.model_path.as_deref()
101    }
102
103    /// Generate embeddings for code content
104    ///
105    /// # Current Implementation
106    ///
107    /// Uses a character-frequency based embedding algorithm that:
108    /// 1. Creates a 384-dimensional vector (matching common transformer output size)
109    /// 2. Accumulates weighted character frequencies based on position
110    /// 3. Normalizes to unit length for cosine similarity
111    ///
112    /// This is a **lightweight placeholder** that provides reasonable similarity
113    /// estimates for code without requiring ML model dependencies. It captures:
114    /// - Character distribution patterns
115    /// - Position-weighted frequency (earlier chars weighted more)
116    /// - Basic structural patterns through punctuation distribution
117    ///
118    /// For production use cases requiring high accuracy, consider integrating
119    /// actual transformer embeddings.
120    #[cfg(feature = "embeddings")]
121    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122        // Character-frequency based embedding (see doc comment for rationale)
123        let mut embedding = vec![0.0f32; 384];
124        for (i, c) in content.chars().enumerate() {
125            let idx = (c as usize) % 384;
126            // Position-weighted contribution: earlier characters contribute more
127            embedding[idx] += 1.0 / ((i + 1) as f32);
128        }
129        // L2 normalize for cosine similarity
130        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131        if norm > 0.0 {
132            for x in &mut embedding {
133                *x /= norm;
134            }
135        }
136        Ok(embedding)
137    }
138
139    /// Generate embeddings (stub when feature disabled)
140    #[cfg(not(feature = "embeddings"))]
141    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142        Ok(vec![0.0; 384])
143    }
144
145    /// Calculate similarity between two code snippets
146    #[cfg(feature = "embeddings")]
147    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148        let emb_a = self.embed(a)?;
149        let emb_b = self.embed(b)?;
150        Ok(cosine_similarity(&emb_a, &emb_b))
151    }
152
153    /// Calculate similarity (stub when feature disabled)
154    #[cfg(not(feature = "embeddings"))]
155    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156        Ok(0.0)
157    }
158}
159
160impl Default for SemanticAnalyzer {
161    fn default() -> Self {
162        Self::new()
163    }
164}
165
166// ============================================================================
167// Semantic Compressor (for reducing content while preserving meaning)
168// ============================================================================
169
170/// Configuration for semantic compression
171#[derive(Debug, Clone)]
172pub struct SemanticConfig {
173    /// Similarity threshold for clustering (0.0 - 1.0)
174    pub similarity_threshold: f32,
175    /// Minimum chunk size in characters
176    pub min_chunk_size: usize,
177    /// Maximum chunk size in characters
178    pub max_chunk_size: usize,
179    /// Budget ratio (0.0 - 1.0) - target size relative to original
180    pub budget_ratio: f32,
181}
182
183impl Default for SemanticConfig {
184    fn default() -> Self {
185        Self {
186            similarity_threshold: 0.7,
187            min_chunk_size: 100,
188            max_chunk_size: 2000,
189            budget_ratio: 0.5,
190        }
191    }
192}
193
194/// A chunk of code
195#[derive(Debug, Clone)]
196pub struct CodeChunk {
197    /// The original content
198    pub content: String,
199    /// Start offset in original content
200    pub start: usize,
201    /// End offset in original content
202    pub end: usize,
203    /// Embedding vector (when computed)
204    pub embedding: Option<Vec<f32>>,
205    /// Cluster assignment
206    pub cluster_id: Option<usize>,
207}
208
209/// Semantic compressor for code content
210///
211/// Uses embeddings-based clustering when the `embeddings` feature is enabled,
212/// otherwise falls back to heuristic-based compression.
213pub struct SemanticCompressor {
214    config: SemanticConfig,
215    /// Semantic analyzer for generating embeddings and computing similarity
216    analyzer: SemanticAnalyzer,
217}
218
219impl SemanticCompressor {
220    /// Create a new semantic compressor with default config
221    pub fn new() -> Self {
222        Self::with_config(SemanticConfig::default())
223    }
224
225    /// Create a new semantic compressor with custom config
226    pub fn with_config(config: SemanticConfig) -> Self {
227        Self { config, analyzer: SemanticAnalyzer::new() }
228    }
229
230    /// Get a reference to the internal semantic analyzer
231    ///
232    /// This allows access to the analyzer for similarity computations
233    /// or custom embedding operations.
234    pub fn analyzer(&self) -> &SemanticAnalyzer {
235        &self.analyzer
236    }
237
238    /// Compress content semantically
239    ///
240    /// When the `embeddings` feature is enabled, uses neural embeddings
241    /// to cluster similar code chunks and select representatives.
242    ///
243    /// Without the feature, falls back to heuristic-based compression.
244    pub fn compress(&self, content: &str) -> Result<String> {
245        // First, check for repetitive content (Bug #6 fix)
246        if let Some(compressed) = self.compress_repetitive(content) {
247            return Ok(compressed);
248        }
249
250        #[cfg(feature = "embeddings")]
251        {
252            return self.compress_with_embeddings(content);
253        }
254
255        #[cfg(not(feature = "embeddings"))]
256        {
257            self.compress_heuristic(content)
258        }
259    }
260
261    /// Detect and compress repetitive content (Bug #6 fix)
262    ///
263    /// Handles cases like "sentence ".repeat(500) by detecting the repeated pattern
264    /// and returning a compressed representation.
265    ///
266    /// This function is UTF-8 safe - it only slices at valid character boundaries.
267    fn compress_repetitive(&self, content: &str) -> Option<String> {
268        // Only process content above a minimum threshold
269        if content.len() < 200 {
270            return None;
271        }
272
273        // Try to find a repeating pattern
274        // Start with small patterns and work up
275        // We iterate byte positions but only consider those that are valid UTF-8 boundaries
276        for pattern_len in 1..=100.min(content.len() / 3) {
277            // Skip if this byte position is not a valid UTF-8 character boundary
278            if !content.is_char_boundary(pattern_len) {
279                continue;
280            }
281
282            let pattern = &content[..pattern_len];
283
284            // Skip patterns that are just whitespace
285            if pattern.chars().all(|c| c.is_whitespace()) {
286                continue;
287            }
288
289            // Count how many times this pattern repeats consecutively
290            let mut count = 0;
291            let mut pos = 0;
292            while pos + pattern_len <= content.len() {
293                // Ensure both slice boundaries are valid UTF-8
294                if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
295                    break;
296                }
297                if &content[pos..pos + pattern_len] == pattern {
298                    count += 1;
299                    pos += pattern_len;
300                } else {
301                    break;
302                }
303            }
304
305            // If pattern repeats enough times and covers most of the content
306            let coverage = (count * pattern_len) as f32 / content.len() as f32;
307            if count >= 3 && coverage >= 0.8 {
308                // Calculate how many instances to keep based on budget_ratio
309                let instances_to_show = (count as f32 * self.config.budget_ratio)
310                    .ceil()
311                    .clamp(1.0, 5.0) as usize;
312
313                let shown_content = pattern.repeat(instances_to_show);
314                // Safe: count * pattern_len is already at a valid boundary (start of next pattern or end)
315                let remainder_start = count * pattern_len;
316                let remainder = if remainder_start <= content.len()
317                    && content.is_char_boundary(remainder_start)
318                {
319                    &content[remainder_start..]
320                } else {
321                    ""
322                };
323
324                let result = if remainder.is_empty() {
325                    format!(
326                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
327                        shown_content.trim_end(),
328                        count,
329                        instances_to_show
330                    )
331                } else {
332                    format!(
333                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
334                        shown_content.trim_end(),
335                        count,
336                        instances_to_show,
337                        remainder.trim()
338                    )
339                };
340
341                return Some(result);
342            }
343        }
344
345        // Also detect line-based repetition (same line repeated many times)
346        let lines: Vec<&str> = content.lines().collect();
347        if lines.len() >= 3 {
348            let mut line_counts: std::collections::HashMap<&str, usize> =
349                std::collections::HashMap::new();
350            for line in &lines {
351                *line_counts.entry(*line).or_insert(0) += 1;
352            }
353
354            // Find the most repeated line
355            if let Some((repeated_line, count)) = line_counts
356                .iter()
357                .filter(|(line, _)| !line.trim().is_empty())
358                .max_by_key(|(_, count)| *count)
359            {
360                let repetition_ratio = *count as f32 / lines.len() as f32;
361                if *count >= 3 && repetition_ratio >= 0.5 {
362                    // Build compressed output preserving unique lines
363                    let mut result = String::new();
364                    let mut consecutive_count = 0;
365                    let mut last_was_repeated = false;
366
367                    for line in &lines {
368                        if *line == *repeated_line {
369                            consecutive_count += 1;
370                            if !last_was_repeated {
371                                if !result.is_empty() {
372                                    result.push('\n');
373                                }
374                                result.push_str(line);
375                            }
376                            last_was_repeated = true;
377                        } else {
378                            if last_was_repeated && consecutive_count > 1 {
379                                result.push_str(&format!(
380                                    "\n/* ... above line repeated {} times ... */",
381                                    consecutive_count
382                                ));
383                            }
384                            consecutive_count = 0;
385                            last_was_repeated = false;
386                            if !result.is_empty() {
387                                result.push('\n');
388                            }
389                            result.push_str(line);
390                        }
391                    }
392
393                    if last_was_repeated && consecutive_count > 1 {
394                        result.push_str(&format!(
395                            "\n/* ... above line repeated {} times ... */",
396                            consecutive_count
397                        ));
398                    }
399
400                    // Only return if we actually compressed significantly
401                    if result.len() < content.len() / 2 {
402                        return Some(result);
403                    }
404                }
405            }
406        }
407
408        None
409    }
410
411    /// Split content into semantic chunks (Bug #6 fix - handles content without \n\n)
412    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
413        let mut chunks = Vec::new();
414        let mut current_start = 0;
415
416        // First try: Split on double newlines (paragraph-like boundaries)
417        for (i, _) in content.match_indices("\n\n") {
418            if i > current_start && i - current_start >= self.config.min_chunk_size {
419                let chunk_content = &content[current_start..i];
420                if chunk_content.len() <= self.config.max_chunk_size {
421                    chunks.push(CodeChunk {
422                        content: chunk_content.to_owned(),
423                        start: current_start,
424                        end: i,
425                        embedding: None,
426                        cluster_id: None,
427                    });
428                }
429                current_start = i + 2;
430            }
431        }
432
433        // Handle remaining content
434        if current_start < content.len() {
435            let remaining = &content[current_start..];
436            if remaining.len() >= self.config.min_chunk_size {
437                chunks.push(CodeChunk {
438                    content: remaining.to_owned(),
439                    start: current_start,
440                    end: content.len(),
441                    embedding: None,
442                    cluster_id: None,
443                });
444            }
445        }
446
447        // Fallback: If no chunks found (no \n\n separators), try single newlines
448        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
449            current_start = 0;
450            for (i, _) in content.match_indices('\n') {
451                if i > current_start && i - current_start >= self.config.min_chunk_size {
452                    let chunk_content = &content[current_start..i];
453                    if chunk_content.len() <= self.config.max_chunk_size {
454                        chunks.push(CodeChunk {
455                            content: chunk_content.to_owned(),
456                            start: current_start,
457                            end: i,
458                            embedding: None,
459                            cluster_id: None,
460                        });
461                    }
462                    current_start = i + 1;
463                }
464            }
465            // Handle remaining after single newline split
466            if current_start < content.len() {
467                let remaining = &content[current_start..];
468                if remaining.len() >= self.config.min_chunk_size {
469                    chunks.push(CodeChunk {
470                        content: remaining.to_owned(),
471                        start: current_start,
472                        end: content.len(),
473                        embedding: None,
474                        cluster_id: None,
475                    });
476                }
477            }
478        }
479
480        // Second fallback: If still no chunks, split by sentence boundaries (. followed by space)
481        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
482            current_start = 0;
483            for (i, _) in content.match_indices(". ") {
484                if i > current_start && i - current_start >= self.config.min_chunk_size {
485                    let chunk_content = &content[current_start..=i]; // include the period
486                    if chunk_content.len() <= self.config.max_chunk_size {
487                        chunks.push(CodeChunk {
488                            content: chunk_content.to_owned(),
489                            start: current_start,
490                            end: i + 1,
491                            embedding: None,
492                            cluster_id: None,
493                        });
494                    }
495                    current_start = i + 2;
496                }
497            }
498            // Handle remaining
499            if current_start < content.len() {
500                let remaining = &content[current_start..];
501                if remaining.len() >= self.config.min_chunk_size {
502                    chunks.push(CodeChunk {
503                        content: remaining.to_owned(),
504                        start: current_start,
505                        end: content.len(),
506                        embedding: None,
507                        cluster_id: None,
508                    });
509                }
510            }
511        }
512
513        // Final fallback: If content is large but can't be split, force split by max_chunk_size
514        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
515            let mut pos = 0;
516            while pos < content.len() {
517                let end = (pos + self.config.max_chunk_size).min(content.len());
518                chunks.push(CodeChunk {
519                    content: content[pos..end].to_owned(),
520                    start: pos,
521                    end,
522                    embedding: None,
523                    cluster_id: None,
524                });
525                pos = end;
526            }
527        }
528
529        chunks
530    }
531
532    /// Compress using heuristic methods (fallback when embeddings unavailable)
533    ///
534    /// Bug #4 fix: Make budget_ratio more effective for all content types
535    /// Bug fix: Ensure budget_ratio always has an effect when < 1.0
536    fn compress_heuristic(&self, content: &str) -> Result<String> {
537        let chunks = self.split_into_chunks(content);
538
539        // When no chunks can be created, apply character-level truncation based on budget_ratio.
540        // This ensures budget_ratio always has an effect, even for small/unstructured content.
541        if chunks.is_empty() {
542            // Apply truncation if:
543            // 1. budget_ratio < 1.0 (user wants compression)
544            // 2. Content is at least 10 chars (very short content passes through)
545            // 3. The truncation would actually reduce the size
546            if self.config.budget_ratio < 1.0 && content.len() >= 10 {
547                let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
548                if target_len > 0 && target_len < content.len() {
549                    // Find a safe truncation point (word/line boundary)
550                    let truncate_at = find_safe_truncation_point(content, target_len);
551                    if truncate_at < content.len() && truncate_at > 0 {
552                        let truncated = &content[..truncate_at];
553                        return Ok(format!(
554                            "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
555                            truncated.trim_end(),
556                            self.config.budget_ratio * 100.0,
557                            truncate_at,
558                            content.len()
559                        ));
560                    }
561                }
562            }
563            return Ok(content.to_owned());
564        }
565
566        // Special case: If we only have one chunk and budget_ratio < 1.0,
567        // truncate within that chunk instead of keeping it entirely
568        if chunks.len() == 1 && self.config.budget_ratio < 1.0 {
569            let chunk_content = &chunks[0].content;
570            let target_len = (chunk_content.len() as f32 * self.config.budget_ratio) as usize;
571            if target_len > 0 && target_len < chunk_content.len() {
572                let truncate_at = find_safe_truncation_point(chunk_content, target_len);
573                if truncate_at < chunk_content.len() && truncate_at > 0 {
574                    let truncated = &chunk_content[..truncate_at];
575                    return Ok(format!(
576                        "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
577                        truncated.trim_end(),
578                        self.config.budget_ratio * 100.0,
579                        truncate_at,
580                        chunk_content.len()
581                    ));
582                }
583            }
584        }
585
586        // Keep every Nth chunk based on budget ratio
587        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
588        let step = chunks.len() / target_chunks.max(1);
589
590        let mut result = String::new();
591        let mut kept = 0;
592
593        for (i, chunk) in chunks.iter().enumerate() {
594            if i % step.max(1) == 0 && kept < target_chunks {
595                if !result.is_empty() {
596                    result.push_str("\n\n");
597                }
598                result.push_str(&chunk.content);
599                kept += 1;
600            }
601        }
602
603        // Add truncation marker if we removed content
604        if kept < chunks.len() {
605            result.push_str(&format!(
606                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
607                chunks.len() - kept,
608                (kept as f32 / chunks.len() as f32) * 100.0
609            ));
610        }
611
612        Ok(result)
613    }
614
615    /// Compress using neural embeddings
616    #[cfg(feature = "embeddings")]
617    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
618        let mut chunks = self.split_into_chunks(content);
619
620        if chunks.is_empty() {
621            return Ok(content.to_owned());
622        }
623
624        // Generate embeddings for each chunk
625        for chunk in &mut chunks {
626            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
627        }
628
629        // Cluster similar chunks
630        let clusters = self.cluster_chunks(&chunks)?;
631
632        // Select representative from each cluster
633        let mut result = String::new();
634        for cluster in clusters.values() {
635            if let Some(representative) = self.select_representative(cluster) {
636                if !result.is_empty() {
637                    result.push_str("\n\n");
638                }
639                result.push_str(&representative.content);
640            }
641        }
642
643        Ok(result)
644    }
645
646    /// Cluster chunks by embedding similarity
647    #[cfg(feature = "embeddings")]
648    fn cluster_chunks<'a>(
649        &self,
650        chunks: &'a [CodeChunk],
651    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
652        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
653        let mut next_cluster = 0;
654
655        for chunk in chunks {
656            let embedding = chunk
657                .embedding
658                .as_ref()
659                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
660
661            // Find existing cluster with similar embedding
662            let mut target_cluster = None;
663            for (&cluster_id, cluster_chunks) in &clusters {
664                if let Some(first) = cluster_chunks.first() {
665                    if let Some(ref first_emb) = first.embedding {
666                        let similarity = cosine_similarity(embedding, first_emb);
667                        if similarity >= self.config.similarity_threshold {
668                            target_cluster = Some(cluster_id);
669                            break;
670                        }
671                    }
672                }
673            }
674
675            if let Some(cluster_id) = target_cluster {
676                if let Some(cluster) = clusters.get_mut(&cluster_id) {
677                    cluster.push(chunk);
678                }
679            } else {
680                clusters.insert(next_cluster, vec![chunk]);
681                next_cluster += 1;
682            }
683        }
684
685        Ok(clusters)
686    }
687
688    /// Select the best representative from a cluster
689    #[cfg(feature = "embeddings")]
690    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
691        // Select the longest chunk as representative (most informative)
692        chunks.iter().max_by_key(|c| c.content.len()).copied()
693    }
694}
695
696impl Default for SemanticCompressor {
697    fn default() -> Self {
698        Self::new()
699    }
700}
701
702// ============================================================================
703// Honest Type Aliases
704// ============================================================================
705// The names below more accurately describe the implementation:
706// - "Semantic" implies neural/ML understanding, but we use heuristics
707// - These aliases are provided for clarity and recommended for new code
708
709/// Alias for `SemanticAnalyzer` - more honest name reflecting the actual implementation.
710///
711/// This analyzer uses character-frequency heuristics for similarity detection,
712/// NOT neural network embeddings. Use this alias when you want to be explicit
713/// about the implementation approach.
714pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;
715
716/// Alias for `SemanticCompressor` - more honest name reflecting the actual implementation.
717///
718/// This compressor uses chunk-based heuristics with optional character-frequency
719/// clustering, NOT neural semantic understanding. Use this alias when you want
720/// to be explicit about the implementation approach.
721pub type HeuristicCompressor = SemanticCompressor;
722
723/// Alias for `SemanticConfig` - more honest name.
724pub type HeuristicCompressionConfig = SemanticConfig;
725
726// ============================================================================
727// Utility Functions
728// ============================================================================
729
730/// Find a safe truncation point in content (word or line boundary)
731///
732/// Used by compress_heuristic to ensure we don't cut in the middle of a word
733/// or multi-byte UTF-8 character.
734fn find_safe_truncation_point(content: &str, target_len: usize) -> usize {
735    if target_len >= content.len() {
736        return content.len();
737    }
738
739    // First, ensure we're at a valid UTF-8 boundary
740    let mut truncate_at = target_len;
741    while truncate_at > 0 && !content.is_char_boundary(truncate_at) {
742        truncate_at -= 1;
743    }
744
745    // Try to find a line boundary (newline) near the target
746    if let Some(newline_pos) = content[..truncate_at].rfind('\n') {
747        if newline_pos > target_len / 2 {
748            // Found a newline that's not too far back
749            return newline_pos;
750        }
751    }
752
753    // Fall back to word boundary (space)
754    if let Some(space_pos) = content[..truncate_at].rfind(' ') {
755        if space_pos > target_len / 2 {
756            return space_pos;
757        }
758    }
759
760    // No good boundary found, use the UTF-8 safe position
761    truncate_at
762}
763
764/// Compute cosine similarity between two vectors
765///
766/// Returns a value between -1.0 and 1.0, where 1.0 indicates identical
767/// direction, 0.0 indicates orthogonal vectors, and -1.0 indicates
768/// opposite direction.
769///
770/// # Note
771/// This function is used by the embeddings feature for clustering and
772/// is also tested directly. The `#[cfg_attr]` suppresses warnings in
773/// builds without the embeddings feature.
774#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
775fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
776    if a.len() != b.len() || a.is_empty() {
777        return 0.0;
778    }
779
780    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
781    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
782    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
783
784    if norm_a == 0.0 || norm_b == 0.0 {
785        return 0.0;
786    }
787
788    dot / (norm_a * norm_b)
789}
790
791// ============================================================================
792// Tests
793// ============================================================================
794
795#[cfg(test)]
796mod tests {
797    use super::*;
798
799    #[test]
800    fn test_analyzer_creation() {
801        let analyzer = SemanticAnalyzer::new();
802        // Verify analyzer is created successfully
803        // Model path is None by default (accessed via model_path() when embeddings enabled)
804        #[cfg(feature = "embeddings")]
805        assert!(analyzer.model_path().is_none());
806        #[cfg(not(feature = "embeddings"))]
807        drop(analyzer); // Explicitly drop to satisfy lint
808    }
809
810    #[test]
811    fn test_analyzer_with_model() {
812        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
813        #[cfg(feature = "embeddings")]
814        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
815        #[cfg(not(feature = "embeddings"))]
816        drop(analyzer); // Explicitly drop to satisfy lint
817    }
818
819    #[test]
820    fn test_compressor_analyzer_access() {
821        let compressor = SemanticCompressor::new();
822        // Verify we can access the analyzer through the compressor
823        let _analyzer = compressor.analyzer();
824    }
825
826    #[test]
827    fn test_semantic_config_default() {
828        let config = SemanticConfig::default();
829        assert_eq!(config.similarity_threshold, 0.7);
830        assert_eq!(config.budget_ratio, 0.5);
831    }
832
833    #[test]
834    fn test_split_into_chunks() {
835        let compressor = SemanticCompressor::with_config(SemanticConfig {
836            min_chunk_size: 10,
837            max_chunk_size: 1000,
838            ..Default::default()
839        });
840
841        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
842        let chunks = compressor.split_into_chunks(content);
843        assert!(chunks.len() >= 2);
844    }
845
846    #[test]
847    fn test_heuristic_compression() {
848        let compressor = SemanticCompressor::with_config(SemanticConfig {
849            min_chunk_size: 5,
850            max_chunk_size: 100,
851            budget_ratio: 0.5,
852            ..Default::default()
853        });
854
855        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
856        let result = compressor.compress_heuristic(content).unwrap();
857        // Should complete without error
858        assert!(!result.is_empty() || content.is_empty());
859    }
860
861    #[test]
862    fn test_empty_content() {
863        let compressor = SemanticCompressor::new();
864        let result = compressor.compress("").unwrap();
865        assert_eq!(result, "");
866    }
867
868    #[test]
869    fn test_cosine_similarity_identical() {
870        let a = vec![1.0, 0.0, 0.0];
871        let b = vec![1.0, 0.0, 0.0];
872        let sim = cosine_similarity(&a, &b);
873        assert!((sim - 1.0).abs() < 0.001);
874    }
875
876    #[test]
877    fn test_cosine_similarity_orthogonal() {
878        let a = vec![1.0, 0.0, 0.0];
879        let c = vec![0.0, 1.0, 0.0];
880        let sim = cosine_similarity(&a, &c);
881        assert!(sim.abs() < 0.001);
882    }
883
884    #[test]
885    fn test_cosine_similarity_empty() {
886        let a: Vec<f32> = vec![];
887        let b: Vec<f32> = vec![];
888        assert_eq!(cosine_similarity(&a, &b), 0.0);
889    }
890
891    // Bug #6 tests - repetitive content compression
892    #[test]
893    fn test_repetitive_pattern_compression() {
894        let compressor = SemanticCompressor::new();
895        // Test "sentence ".repeat(500) - exactly the reported bug case
896        let content = "sentence ".repeat(500);
897        let result = compressor.compress(&content).unwrap();
898
899        // Result should be significantly smaller than original
900        assert!(
901            result.len() < content.len() / 2,
902            "Compressed size {} should be less than half of original {}",
903            result.len(),
904            content.len()
905        );
906
907        // Should contain the pattern and a compression marker
908        assert!(result.contains("sentence"));
909        assert!(
910            result.contains("repeated") || result.contains("pattern"),
911            "Should indicate compression occurred"
912        );
913    }
914
915    #[test]
916    fn test_repetitive_line_compression() {
917        let compressor = SemanticCompressor::new();
918        // Test repeated lines
919        let content = "same line\n".repeat(100);
920        let result = compressor.compress(&content).unwrap();
921
922        // Result should be significantly smaller
923        assert!(
924            result.len() < content.len() / 2,
925            "Compressed size {} should be less than half of original {}",
926            result.len(),
927            content.len()
928        );
929    }
930
931    #[test]
932    fn test_non_repetitive_content_unchanged() {
933        // Use budget_ratio=1.0 to preserve content (default is 0.5 which truncates)
934        let compressor = SemanticCompressor::with_config(SemanticConfig {
935            budget_ratio: 1.0,
936            ..Default::default()
937        });
938        // Non-repetitive content should not trigger repetition compression
939        let content = "This is some unique content that does not repeat.";
940        let result = compressor.compress(content).unwrap();
941
942        // Short non-repetitive content should be returned as-is with budget_ratio=1.0
943        assert_eq!(result, content);
944    }
945
946    #[test]
947    fn test_repetitive_with_variation() {
948        let compressor = SemanticCompressor::with_config(SemanticConfig {
949            budget_ratio: 0.3,
950            ..Default::default()
951        });
952
953        // Content with some repetition mixed with unique parts
954        let mut content = String::new();
955        for i in 0..50 {
956            content.push_str(&format!("item {} ", i % 5)); // Repeated pattern with variation
957        }
958
959        let result = compressor.compress(&content).unwrap();
960        // This may or may not compress depending on pattern detection
961        // Just verify it doesn't panic
962        assert!(!result.is_empty());
963    }
964
965    // UTF-8 boundary safety tests for compress_repetitive
966    #[test]
967    fn test_repetitive_unicode_chinese() {
968        let compressor = SemanticCompressor::new();
969        // Chinese characters are 3 bytes each
970        // Create repeating Chinese pattern
971        let content = "中文测试 ".repeat(100); // Each repeat is 13 bytes
972        let result = compressor.compress(&content).unwrap();
973
974        // Should not panic and should produce valid UTF-8
975        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
976
977        // Should compress or return unchanged (not panic)
978        assert!(!result.is_empty() || content.is_empty());
979    }
980
981    #[test]
982    fn test_repetitive_unicode_emoji() {
983        let compressor = SemanticCompressor::new();
984        // Emoji are 4 bytes each
985        let content = "🎉🎊🎁 ".repeat(80); // Each repeat is 14 bytes
986
987        let result = compressor.compress(&content).unwrap();
988        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
989        assert!(!result.is_empty() || content.is_empty());
990    }
991
992    #[test]
993    fn test_repetitive_unicode_mixed() {
994        let compressor = SemanticCompressor::new();
995        // Mix of 1, 2, 3, and 4 byte characters
996        let content = "a中🎉 ".repeat(60); // Each repeat is 11 bytes
997
998        let result = compressor.compress(&content).unwrap();
999        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1000        assert!(!result.is_empty() || content.is_empty());
1001    }
1002
1003    #[test]
1004    fn test_repetitive_unicode_cyrillic() {
1005        let compressor = SemanticCompressor::new();
1006        // Cyrillic characters are 2 bytes each
1007        let content = "Привет ".repeat(50);
1008
1009        let result = compressor.compress(&content).unwrap();
1010        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1011    }
1012
1013    #[test]
1014    fn test_non_repetitive_unicode_boundary() {
1015        let compressor = SemanticCompressor::new();
1016        // Content where pattern detection would try various byte lengths
1017        // that don't align with UTF-8 boundaries
1018        let content = "世界和平".repeat(60); // No spaces, pure multi-byte
1019
1020        let result = compressor.compress(&content).unwrap();
1021        // Should not panic even when pattern length iteration
1022        // hits non-UTF-8 boundaries
1023        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1024    }
1025
1026    #[test]
1027    fn test_repetitive_unicode_line_based() {
1028        let compressor = SemanticCompressor::new();
1029        // Test line-based repetition detection with Unicode
1030        let content = "中文行\n".repeat(100);
1031
1032        let result = compressor.compress(&content).unwrap();
1033        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1034    }
1035
1036    // ==========================================================================
1037    // Additional coverage tests
1038    // ==========================================================================
1039
1040    #[test]
1041    fn test_semantic_error_display() {
1042        let err1 = SemanticError::ModelLoadError("test error".to_string());
1043        assert!(err1.to_string().contains("Model loading failed"));
1044        assert!(err1.to_string().contains("test error"));
1045
1046        let err2 = SemanticError::EmbeddingError("embed fail".to_string());
1047        assert!(err2.to_string().contains("Embedding generation failed"));
1048
1049        let err3 = SemanticError::ClusteringError("cluster fail".to_string());
1050        assert!(err3.to_string().contains("Clustering failed"));
1051
1052        let err4 = SemanticError::FeatureNotEnabled;
1053        assert!(err4.to_string().contains("embeddings feature not enabled"));
1054    }
1055
1056    #[test]
1057    fn test_semantic_error_debug() {
1058        let err = SemanticError::ModelLoadError("debug test".to_string());
1059        let debug_str = format!("{:?}", err);
1060        assert!(debug_str.contains("ModelLoadError"));
1061    }
1062
1063    #[test]
1064    fn test_semantic_analyzer_default() {
1065        let analyzer = SemanticAnalyzer::default();
1066        // Should work same as new()
1067        let result = analyzer.embed("test");
1068        assert!(result.is_ok());
1069    }
1070
1071    #[test]
1072    fn test_semantic_analyzer_debug() {
1073        let analyzer = SemanticAnalyzer::new();
1074        let debug_str = format!("{:?}", analyzer);
1075        assert!(debug_str.contains("SemanticAnalyzer"));
1076    }
1077
1078    #[test]
1079    fn test_semantic_analyzer_embed_empty() {
1080        let analyzer = SemanticAnalyzer::new();
1081        let result = analyzer.embed("").unwrap();
1082        assert_eq!(result.len(), 384);
1083    }
1084
1085    #[test]
1086    fn test_semantic_analyzer_embed_produces_384_dims() {
1087        let analyzer = SemanticAnalyzer::new();
1088        let result = analyzer.embed("some code content").unwrap();
1089        assert_eq!(result.len(), 384);
1090    }
1091
1092    #[test]
1093    fn test_semantic_analyzer_similarity_same_content() {
1094        let analyzer = SemanticAnalyzer::new();
1095        let result = analyzer.similarity("hello world", "hello world").unwrap();
1096        // Same content should have high similarity (1.0 in embeddings mode, 0.0 in fallback)
1097        #[cfg(feature = "embeddings")]
1098        assert!((result - 1.0).abs() < 0.01);
1099        #[cfg(not(feature = "embeddings"))]
1100        assert_eq!(result, 0.0);
1101    }
1102
1103    #[test]
1104    fn test_semantic_analyzer_similarity_different_content() {
1105        let analyzer = SemanticAnalyzer::new();
1106        let result = analyzer.similarity("hello", "goodbye").unwrap();
1107        // Result should be valid (0.0 in fallback mode)
1108        #[cfg(not(feature = "embeddings"))]
1109        assert_eq!(result, 0.0);
1110        #[cfg(feature = "embeddings")]
1111        assert!(result >= -1.0 && result <= 1.0);
1112    }
1113
1114    #[test]
1115    fn test_semantic_config_custom() {
1116        let config = SemanticConfig {
1117            similarity_threshold: 0.9,
1118            min_chunk_size: 50,
1119            max_chunk_size: 5000,
1120            budget_ratio: 0.3,
1121        };
1122        assert_eq!(config.similarity_threshold, 0.9);
1123        assert_eq!(config.min_chunk_size, 50);
1124        assert_eq!(config.max_chunk_size, 5000);
1125        assert_eq!(config.budget_ratio, 0.3);
1126    }
1127
1128    #[test]
1129    fn test_semantic_config_clone() {
1130        let config = SemanticConfig::default();
1131        let cloned = config.clone();
1132        assert_eq!(cloned.similarity_threshold, config.similarity_threshold);
1133        assert_eq!(cloned.budget_ratio, config.budget_ratio);
1134    }
1135
1136    #[test]
1137    fn test_semantic_config_debug() {
1138        let config = SemanticConfig::default();
1139        let debug_str = format!("{:?}", config);
1140        assert!(debug_str.contains("SemanticConfig"));
1141        assert!(debug_str.contains("similarity_threshold"));
1142    }
1143
1144    #[test]
1145    fn test_code_chunk_debug() {
1146        let chunk = CodeChunk {
1147            content: "test content".to_string(),
1148            start: 0,
1149            end: 12,
1150            embedding: None,
1151            cluster_id: None,
1152        };
1153        let debug_str = format!("{:?}", chunk);
1154        assert!(debug_str.contains("CodeChunk"));
1155        assert!(debug_str.contains("test content"));
1156    }
1157
1158    #[test]
1159    fn test_code_chunk_clone() {
1160        let chunk = CodeChunk {
1161            content: "original".to_string(),
1162            start: 0,
1163            end: 8,
1164            embedding: Some(vec![0.1, 0.2, 0.3]),
1165            cluster_id: Some(5),
1166        };
1167        let cloned = chunk.clone();
1168        assert_eq!(cloned.content, "original");
1169        assert_eq!(cloned.start, 0);
1170        assert_eq!(cloned.end, 8);
1171        assert_eq!(cloned.embedding, Some(vec![0.1, 0.2, 0.3]));
1172        assert_eq!(cloned.cluster_id, Some(5));
1173    }
1174
1175    #[test]
1176    fn test_semantic_compressor_default() {
1177        let compressor = SemanticCompressor::default();
1178        let result = compressor.compress("test").unwrap();
1179        assert_eq!(result, "test");
1180    }
1181
1182    #[test]
1183    fn test_split_into_chunks_single_newline_fallback() {
1184        let compressor = SemanticCompressor::with_config(SemanticConfig {
1185            min_chunk_size: 5,
1186            max_chunk_size: 1000,
1187            ..Default::default()
1188        });
1189
1190        // Content with only single newlines (no \n\n)
1191        let content = "Line 1 with content\nLine 2 with content\nLine 3 with content";
1192        let chunks = compressor.split_into_chunks(content);
1193        // Should use single newline fallback
1194        assert!(!chunks.is_empty() || content.len() < 5);
1195    }
1196
1197    #[test]
1198    fn test_split_into_chunks_sentence_fallback() {
1199        let compressor = SemanticCompressor::with_config(SemanticConfig {
1200            min_chunk_size: 10,
1201            max_chunk_size: 1000,
1202            ..Default::default()
1203        });
1204
1205        // Content with sentences but no newlines
1206        let content = "First sentence here. Second sentence here. Third sentence here.";
1207        let chunks = compressor.split_into_chunks(content);
1208        // Should use sentence boundary fallback
1209        assert!(!chunks.is_empty() || content.len() < 10);
1210    }
1211
1212    #[test]
1213    fn test_split_into_chunks_force_split() {
1214        let compressor = SemanticCompressor::with_config(SemanticConfig {
1215            min_chunk_size: 100, // Higher than content length so normal chunking fails
1216            max_chunk_size: 20,  // Lower than content length to trigger force split
1217            ..Default::default()
1218        });
1219
1220        // Content without any splitting characters, longer than max_chunk_size
1221        // but shorter than min_chunk_size (so normal chunking produces empty result)
1222        let content = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
1223        let chunks = compressor.split_into_chunks(content);
1224        // Should force split by max_chunk_size when no other splitting works
1225        assert!(
1226            chunks.len() >= 2,
1227            "Expected at least 2 chunks from force split, got {}",
1228            chunks.len()
1229        );
1230    }
1231
1232    #[test]
1233    fn test_split_into_chunks_empty() {
1234        let compressor = SemanticCompressor::new();
1235        let chunks = compressor.split_into_chunks("");
1236        assert!(chunks.is_empty());
1237    }
1238
1239    #[test]
1240    fn test_split_into_chunks_below_min_size() {
1241        let compressor = SemanticCompressor::with_config(SemanticConfig {
1242            min_chunk_size: 100,
1243            max_chunk_size: 1000,
1244            ..Default::default()
1245        });
1246
1247        let content = "short";
1248        let chunks = compressor.split_into_chunks(content);
1249        // Content too short for min_chunk_size
1250        assert!(chunks.is_empty());
1251    }
1252
1253    #[test]
1254    fn test_compress_heuristic_empty_chunks() {
1255        let compressor = SemanticCompressor::with_config(SemanticConfig {
1256            min_chunk_size: 1000, // Force no chunks to be created
1257            budget_ratio: 1.0,    // Use 1.0 to preserve content unchanged
1258            ..Default::default()
1259        });
1260
1261        let content = "short content";
1262        let result = compressor.compress_heuristic(content).unwrap();
1263        // Should return original when no chunks created and budget_ratio=1.0
1264        assert_eq!(result, content);
1265    }
1266
1267    #[test]
1268    fn test_compress_heuristic_multiple_chunks() {
1269        let compressor = SemanticCompressor::with_config(SemanticConfig {
1270            min_chunk_size: 10,
1271            max_chunk_size: 100,
1272            budget_ratio: 0.3,
1273            ..Default::default()
1274        });
1275
1276        let content = "First chunk content here\n\nSecond chunk content here\n\nThird chunk content here\n\nFourth chunk content";
1277        let result = compressor.compress_heuristic(content).unwrap();
1278        // Should have compression marker if chunks were removed
1279        assert!(result.contains("chunk") || result.contains("compressed"));
1280    }
1281
1282    #[test]
1283    fn test_cosine_similarity_different_lengths() {
1284        let a = vec![1.0, 2.0, 3.0];
1285        let b = vec![1.0, 2.0];
1286        let sim = cosine_similarity(&a, &b);
1287        assert_eq!(sim, 0.0); // Different lengths should return 0
1288    }
1289
1290    #[test]
1291    fn test_cosine_similarity_zero_vectors() {
1292        let a = vec![0.0, 0.0, 0.0];
1293        let b = vec![1.0, 2.0, 3.0];
1294        let sim = cosine_similarity(&a, &b);
1295        assert_eq!(sim, 0.0); // Zero norm should return 0
1296    }
1297
1298    #[test]
1299    fn test_cosine_similarity_opposite() {
1300        let a = vec![1.0, 0.0, 0.0];
1301        let b = vec![-1.0, 0.0, 0.0];
1302        let sim = cosine_similarity(&a, &b);
1303        assert!((sim + 1.0).abs() < 0.001); // Opposite directions = -1.0
1304    }
1305
1306    #[test]
1307    fn test_cosine_similarity_normalized() {
1308        let a = vec![0.6, 0.8, 0.0];
1309        let b = vec![0.6, 0.8, 0.0];
1310        let sim = cosine_similarity(&a, &b);
1311        assert!((sim - 1.0).abs() < 0.001);
1312    }
1313
1314    #[test]
1315    fn test_compress_repetitive_short_content() {
1316        let compressor = SemanticCompressor::new();
1317        // Content below 200 chars should not trigger repetition compression
1318        let content = "short ".repeat(10); // 60 chars
1319        let result = compressor.compress_repetitive(&content);
1320        assert!(result.is_none());
1321    }
1322
1323    #[test]
1324    fn test_compress_repetitive_whitespace_only() {
1325        let compressor = SemanticCompressor::new();
1326        // Whitespace-only patterns should be skipped
1327        let content = "   ".repeat(100);
1328        let result = compressor.compress_repetitive(&content);
1329        // Should not compress whitespace-only patterns
1330        assert!(result.is_none());
1331    }
1332
1333    #[test]
1334    fn test_compress_repetitive_low_coverage() {
1335        let compressor = SemanticCompressor::new();
1336        // Pattern that doesn't cover 80% of content
1337        let mut content = "pattern ".repeat(5);
1338        content.push_str(&"x".repeat(200)); // Add non-repeating content
1339        let result = compressor.compress_repetitive(&content);
1340        // Low coverage should not trigger compression
1341        assert!(result.is_none());
1342    }
1343
1344    #[test]
1345    fn test_compress_repetitive_line_low_ratio() {
1346        let compressor = SemanticCompressor::new();
1347        // Lines where no single line repeats enough
1348        let content = (0..20)
1349            .map(|i| format!("unique line {}", i))
1350            .collect::<Vec<_>>()
1351            .join("\n");
1352        let result = compressor.compress_repetitive(&content);
1353        // No significant repetition
1354        assert!(result.is_none());
1355    }
1356
1357    #[test]
1358    fn test_compress_repetitive_mixed_with_unique() {
1359        let compressor = SemanticCompressor::new();
1360        // Repeated line mixed with unique lines
1361        let mut lines = vec![];
1362        for i in 0..50 {
1363            if i % 2 == 0 {
1364                lines.push("repeated line");
1365            } else {
1366                lines.push("unique line");
1367            }
1368        }
1369        let content = lines.join("\n");
1370        let result = compressor.compress(&content).unwrap();
1371        // Should handle mixed content
1372        assert!(!result.is_empty());
1373    }
1374
1375    #[test]
1376    fn test_compress_no_repetition_returns_none() {
1377        let compressor = SemanticCompressor::new();
1378        // Unique content that doesn't repeat
1379        let content = "The quick brown fox jumps over the lazy dog. ".repeat(5);
1380        // Each sentence is unique enough
1381        let result = compressor.compress_repetitive(&content);
1382        // Depends on pattern length detection - may or may not find pattern
1383        // Just verify no panic
1384        drop(result);
1385    }
1386
1387    #[test]
1388    fn test_type_aliases() {
1389        // Test that type aliases work correctly
1390        let _analyzer: CharacterFrequencyAnalyzer = SemanticAnalyzer::new();
1391        let _compressor: HeuristicCompressor = SemanticCompressor::new();
1392        let _config: HeuristicCompressionConfig = SemanticConfig::default();
1393    }
1394
1395    #[test]
1396    fn test_compress_preserves_content_structure() {
1397        let compressor = SemanticCompressor::with_config(SemanticConfig {
1398            min_chunk_size: 10,
1399            max_chunk_size: 500,
1400            budget_ratio: 1.0, // Keep everything
1401            ..Default::default()
1402        });
1403
1404        let content = "def foo():\n    pass\n\ndef bar():\n    pass";
1405        let result = compressor.compress(content).unwrap();
1406        // With budget_ratio 1.0, should keep most content
1407        assert!(result.contains("foo") || result.contains("bar"));
1408    }
1409
1410    #[test]
1411    fn test_split_chunks_respects_max_size() {
1412        let compressor = SemanticCompressor::with_config(SemanticConfig {
1413            min_chunk_size: 5,
1414            max_chunk_size: 50,
1415            ..Default::default()
1416        });
1417
1418        let content = "A very long chunk that exceeds the max size limit\n\nAnother chunk";
1419        let chunks = compressor.split_into_chunks(content);
1420
1421        for chunk in &chunks {
1422            assert!(chunk.content.len() <= 50, "Chunk size {} exceeds max 50", chunk.content.len());
1423        }
1424    }
1425
1426    #[test]
1427    fn test_compress_repetitive_with_remainder() {
1428        let compressor = SemanticCompressor::new();
1429        // Pattern that repeats but has a small remainder
1430        let mut content = "abc ".repeat(100);
1431        content.push_str("xyz"); // Add non-repeating remainder
1432
1433        let result = compressor.compress(&content).unwrap();
1434        // Should compress and handle remainder
1435        assert!(!result.is_empty());
1436    }
1437
1438    #[test]
1439    fn test_compressor_analyzer_method() {
1440        let compressor = SemanticCompressor::new();
1441        let analyzer = compressor.analyzer();
1442
1443        // Verify the analyzer works
1444        let embed_result = analyzer.embed("test code");
1445        assert!(embed_result.is_ok());
1446    }
1447
1448    #[test]
1449    fn test_code_chunk_with_embedding_and_cluster() {
1450        let chunk = CodeChunk {
1451            content: "fn main() {}".to_string(),
1452            start: 0,
1453            end: 12,
1454            embedding: Some(vec![0.5; 384]),
1455            cluster_id: Some(3),
1456        };
1457
1458        assert_eq!(chunk.content, "fn main() {}");
1459        assert_eq!(chunk.start, 0);
1460        assert_eq!(chunk.end, 12);
1461        assert!(chunk.embedding.is_some());
1462        assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384);
1463        assert_eq!(chunk.cluster_id, Some(3));
1464    }
1465
1466    #[test]
1467    fn test_compress_very_long_repetitive() {
1468        let compressor = SemanticCompressor::with_config(SemanticConfig {
1469            budget_ratio: 0.2, // Aggressive compression
1470            ..Default::default()
1471        });
1472
1473        // Very long repetitive content
1474        let content = "repeated_token ".repeat(1000);
1475        let result = compressor.compress(&content).unwrap();
1476
1477        // Should significantly compress
1478        assert!(result.len() < content.len() / 3);
1479        assert!(result.contains("repeated"));
1480    }
1481
1482    #[test]
1483    fn test_semantic_result_type_ok() {
1484        let result: Result<String> = Ok("success".to_string());
1485        assert!(result.is_ok());
1486        assert_eq!(result.unwrap(), "success");
1487    }
1488
1489    #[test]
1490    fn test_semantic_result_type_err() {
1491        let result: Result<String> = Err(SemanticError::FeatureNotEnabled);
1492        assert!(result.is_err());
1493    }
1494
1495    // Bug #4 fix tests - budget_ratio effectiveness
1496    #[test]
1497    fn test_find_safe_truncation_point_basic() {
1498        let content = "Hello world this is a test";
1499        let point = find_safe_truncation_point(content, 15);
1500        // Should find a word boundary
1501        assert!(content.is_char_boundary(point));
1502        assert!(point <= 15 || point == content.len());
1503    }
1504
1505    #[test]
1506    fn test_find_safe_truncation_point_newline() {
1507        let content = "Line one\nLine two\nLine three";
1508        let point = find_safe_truncation_point(content, 20);
1509        // Should prefer newline boundary
1510        assert!(content.is_char_boundary(point));
1511    }
1512
1513    #[test]
1514    fn test_find_safe_truncation_point_unicode() {
1515        let content = "Hello 世界 test";
1516        let point = find_safe_truncation_point(content, 10);
1517        // Should not cut in middle of UTF-8 character
1518        assert!(content.is_char_boundary(point));
1519    }
1520
1521    #[test]
1522    fn test_find_safe_truncation_point_beyond_length() {
1523        let content = "short";
1524        let point = find_safe_truncation_point(content, 100);
1525        assert_eq!(point, content.len());
1526    }
1527
1528    #[test]
1529    fn test_budget_ratio_affects_large_content() {
1530        // Test that budget_ratio affects compression of content with paragraph breaks
1531        // This tests the chunk-based compression path
1532        let content = (0..20)
1533            .map(|i| {
1534                format!("This is paragraph number {} with some content to fill it out nicely.", i)
1535            })
1536            .collect::<Vec<_>>()
1537            .join("\n\n");
1538
1539        // Test with different budget ratios
1540        let compressor_30 = SemanticCompressor::with_config(SemanticConfig {
1541            budget_ratio: 0.3,
1542            min_chunk_size: 20,
1543            max_chunk_size: 2000,
1544            ..Default::default()
1545        });
1546
1547        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
1548            budget_ratio: 0.8,
1549            min_chunk_size: 20,
1550            max_chunk_size: 2000,
1551            ..Default::default()
1552        });
1553
1554        let result_30 = compressor_30.compress(&content).unwrap();
1555        let result_80 = compressor_80.compress(&content).unwrap();
1556
1557        // Lower budget ratio should produce shorter output
1558        assert!(
1559            result_30.len() < result_80.len(),
1560            "30% budget ({}) should be smaller than 80% budget ({})",
1561            result_30.len(),
1562            result_80.len()
1563        );
1564
1565        // Both should indicate compression occurred
1566        assert!(
1567            result_30.contains("compressed") || result_30.len() < content.len(),
1568            "30% should show compression indicator"
1569        );
1570    }
1571
1572    #[test]
1573    fn test_budget_ratio_one_returns_original() {
1574        let content = "Some content without chunk boundaries";
1575
1576        let compressor = SemanticCompressor::with_config(SemanticConfig {
1577            budget_ratio: 1.0, // Keep everything
1578            ..Default::default()
1579        });
1580
1581        let result = compressor.compress(content).unwrap();
1582        // With budget_ratio 1.0, should return original content
1583        assert_eq!(result, content);
1584    }
1585
1586    // ==========================================================================
1587    // Bug #4 Fix Tests - budget_ratio effectiveness for small content
1588    // ==========================================================================
1589
1590    /// Test that budget_ratio affects content >= 10 chars even without chunk boundaries.
1591    /// This was the bug: small content wasn't being truncated because the threshold
1592    /// was set to min_chunk_size (100) instead of a lower value (10).
1593    #[test]
1594    fn test_budget_ratio_affects_small_content() {
1595        // Content that's over 10 chars but has no chunk boundaries
1596        // Previously this wouldn't be compressed because it was under min_chunk_size
1597        let content = "This is a short test string that should be affected by budget ratio.";
1598
1599        let compressor = SemanticCompressor::with_config(SemanticConfig {
1600            budget_ratio: 0.3, // Keep only 30%
1601            min_chunk_size: 100,
1602            max_chunk_size: 2000,
1603            ..Default::default()
1604        });
1605
1606        let result = compressor.compress(content).unwrap();
1607
1608        // With budget_ratio 0.3, content should be truncated
1609        assert!(
1610            result.len() < content.len() || result.contains("truncated"),
1611            "Small content with budget_ratio=0.3 should be compressed. Original: {}, Result: {}",
1612            content.len(),
1613            result.len()
1614        );
1615    }
1616
1617    /// Test that budget_ratio 1.0 preserves small content
1618    #[test]
1619    fn test_budget_ratio_one_preserves_small_content() {
1620        let content = "Short content that should remain unchanged with budget_ratio=1.0";
1621
1622        let compressor = SemanticCompressor::with_config(SemanticConfig {
1623            budget_ratio: 1.0,
1624            min_chunk_size: 100,
1625            max_chunk_size: 2000,
1626            ..Default::default()
1627        });
1628
1629        let result = compressor.compress(content).unwrap();
1630
1631        // With budget_ratio 1.0, should return original
1632        assert_eq!(result, content, "budget_ratio=1.0 should preserve content");
1633    }
1634
1635    /// Test that very short content (< 10 chars) passes through unchanged
1636    #[test]
1637    fn test_very_short_content_unchanged() {
1638        let content = "tiny";
1639
1640        let compressor = SemanticCompressor::with_config(SemanticConfig {
1641            budget_ratio: 0.1, // Even aggressive budget shouldn't affect very short content
1642            ..Default::default()
1643        });
1644
1645        let result = compressor.compress(content).unwrap();
1646
1647        // Very short content should pass through
1648        assert_eq!(result, content, "Very short content should be unchanged");
1649    }
1650
1651    /// Test that budget_ratio affects medium content without chunk boundaries
1652    #[test]
1653    fn test_budget_ratio_medium_no_chunks() {
1654        // Content that's long enough to compress but has no paragraph breaks
1655        let content = "This is a medium length test content that has no paragraph breaks and should trigger the budget ratio truncation path because there are no chunk boundaries.";
1656
1657        let compressor = SemanticCompressor::with_config(SemanticConfig {
1658            budget_ratio: 0.5,
1659            min_chunk_size: 200, // Higher than content length
1660            max_chunk_size: 2000,
1661            ..Default::default()
1662        });
1663
1664        let result = compressor.compress(content).unwrap();
1665
1666        // Should be compressed to ~50%
1667        assert!(
1668            result.len() < content.len(),
1669            "Medium content with budget_ratio=0.5 should be compressed. Original: {}, Result: {}",
1670            content.len(),
1671            result.len()
1672        );
1673    }
1674
1675    /// Test that truncation marker includes percentage and char counts
1676    #[test]
1677    fn test_truncation_marker_format() {
1678        let content = "A sufficiently long piece of content that will definitely be truncated when we set a low budget ratio.";
1679
1680        let compressor = SemanticCompressor::with_config(SemanticConfig {
1681            budget_ratio: 0.3,
1682            min_chunk_size: 200,
1683            max_chunk_size: 2000,
1684            ..Default::default()
1685        });
1686
1687        let result = compressor.compress(content).unwrap();
1688
1689        // Should contain truncation marker with useful info
1690        if result.contains("truncated") {
1691            assert!(result.contains("%"), "Truncation marker should include percentage");
1692            assert!(result.contains("chars"), "Truncation marker should include char count");
1693        }
1694    }
1695
1696    /// Test different budget ratios produce proportionally different outputs
1697    #[test]
1698    fn test_budget_ratio_proportional() {
1699        let content = "This content is long enough to test different budget ratio values and see that they produce outputs of proportionally different sizes as expected.";
1700
1701        let compressor_20 = SemanticCompressor::with_config(SemanticConfig {
1702            budget_ratio: 0.2,
1703            min_chunk_size: 200,
1704            ..Default::default()
1705        });
1706
1707        let compressor_50 = SemanticCompressor::with_config(SemanticConfig {
1708            budget_ratio: 0.5,
1709            min_chunk_size: 200,
1710            ..Default::default()
1711        });
1712
1713        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
1714            budget_ratio: 0.8,
1715            min_chunk_size: 200,
1716            ..Default::default()
1717        });
1718
1719        let result_20 = compressor_20.compress(content).unwrap();
1720        let result_50 = compressor_50.compress(content).unwrap();
1721        let result_80 = compressor_80.compress(content).unwrap();
1722
1723        // Lower ratio should produce shorter output
1724        assert!(
1725            result_20.len() <= result_50.len(),
1726            "20% ratio ({}) should be <= 50% ratio ({})",
1727            result_20.len(),
1728            result_50.len()
1729        );
1730        assert!(
1731            result_50.len() <= result_80.len(),
1732            "50% ratio ({}) should be <= 80% ratio ({})",
1733            result_50.len(),
1734            result_80.len()
1735        );
1736    }
1737}