// infiniloom_engine/semantic.rs
//! Semantic analysis and compression module
//!
//! This module provides semantic code understanding through embeddings,
//! enabling similarity search and intelligent code compression.
//!
//! # Feature: `embeddings`
//!
//! When the `embeddings` feature is enabled, this module provides:
//! - Embedding generation for code content (currently uses character-frequency heuristics)
//! - Cosine similarity computation between code snippets
//! - Clustering-based compression that groups similar code chunks
//!
//! ## Current Implementation Status
//!
//! **Important**: The current embeddings implementation uses a simple character-frequency
//! based algorithm, NOT neural network embeddings. This is a lightweight placeholder that
//! provides reasonable results for basic similarity detection without requiring external
//! model dependencies.
//!
//! Future versions may integrate actual transformer-based embeddings via:
//! - Candle (Rust-native ML framework)
//! - ONNX Runtime for pre-trained models
//! - External embedding services (OpenAI, Cohere, etc.)
//!
//! ## Without `embeddings` Feature
//!
//! Falls back to heuristic-based compression that:
//! - Splits content at paragraph boundaries
//! - Keeps every Nth chunk based on budget ratio
//! - No similarity computation (all operations return 0.0)

32use std::collections::HashMap;
33
/// Result type for semantic operations — `std::result::Result` specialized to [`SemanticError`].
pub type Result<T> = std::result::Result<T, SemanticError>;
36
/// Errors that can occur during semantic operations
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    /// The embedding model could not be loaded from the configured path.
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    /// Generating an embedding vector for a piece of content failed.
    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    /// Grouping chunks by similarity failed (e.g. a chunk was missing its embedding).
    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    /// An embeddings-only operation was requested but the crate was built
    /// without the `embeddings` feature.
    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}
52
53// ============================================================================
54// Semantic Analyzer (for similarity and embeddings)
55// ============================================================================
56
/// Semantic analyzer using code embeddings
///
/// When the `embeddings` feature is enabled, uses the configured model path
/// for neural network-based embeddings. Without the feature, provides
/// heuristic-based similarity estimates.
///
/// Exactly one of the two fields below exists in any given build; the split
/// keeps the struct layout identical across feature configurations.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Path to the embedding model (used when embeddings feature is enabled)
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Placeholder for non-embeddings build (maintains API compatibility)
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}
71
72impl SemanticAnalyzer {
73    /// Create a new semantic analyzer
74    pub fn new() -> Self {
75        Self {
76            #[cfg(feature = "embeddings")]
77            model_path: None,
78            #[cfg(not(feature = "embeddings"))]
79            _model_path: None,
80        }
81    }
82
83    /// Create a semantic analyzer with a custom model path
84    ///
85    /// The model path is used when the `embeddings` feature is enabled.
86    /// Without the feature, the path is stored but not used.
87    pub fn with_model(model_path: &str) -> Self {
88        Self {
89            #[cfg(feature = "embeddings")]
90            model_path: Some(model_path.to_owned()),
91            #[cfg(not(feature = "embeddings"))]
92            _model_path: Some(model_path.to_owned()),
93        }
94    }
95
96    /// Get the configured model path (if any)
97    #[cfg(feature = "embeddings")]
98    pub fn model_path(&self) -> Option<&str> {
99        self.model_path.as_deref()
100    }
101
102    /// Generate embeddings for code content
103    ///
104    /// # Current Implementation
105    ///
106    /// Uses a character-frequency based embedding algorithm that:
107    /// 1. Creates a 384-dimensional vector (matching common transformer output size)
108    /// 2. Accumulates weighted character frequencies based on position
109    /// 3. Normalizes to unit length for cosine similarity
110    ///
111    /// This is a **lightweight placeholder** that provides reasonable similarity
112    /// estimates for code without requiring ML model dependencies. It captures:
113    /// - Character distribution patterns
114    /// - Position-weighted frequency (earlier chars weighted more)
115    /// - Basic structural patterns through punctuation distribution
116    ///
117    /// For production use cases requiring high accuracy, consider integrating
118    /// actual transformer embeddings.
119    #[cfg(feature = "embeddings")]
120    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
121        // Character-frequency based embedding (see doc comment for rationale)
122        let mut embedding = vec![0.0f32; 384];
123        for (i, c) in content.chars().enumerate() {
124            let idx = (c as usize) % 384;
125            // Position-weighted contribution: earlier characters contribute more
126            embedding[idx] += 1.0 / ((i + 1) as f32);
127        }
128        // L2 normalize for cosine similarity
129        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
130        if norm > 0.0 {
131            for x in &mut embedding {
132                *x /= norm;
133            }
134        }
135        Ok(embedding)
136    }
137
138    /// Generate embeddings (stub when feature disabled)
139    #[cfg(not(feature = "embeddings"))]
140    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
141        Ok(vec![0.0; 384])
142    }
143
144    /// Calculate similarity between two code snippets
145    #[cfg(feature = "embeddings")]
146    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
147        let emb_a = self.embed(a)?;
148        let emb_b = self.embed(b)?;
149        Ok(cosine_similarity(&emb_a, &emb_b))
150    }
151
152    /// Calculate similarity (stub when feature disabled)
153    #[cfg(not(feature = "embeddings"))]
154    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
155        Ok(0.0)
156    }
157}
158
impl Default for SemanticAnalyzer {
    /// Equivalent to [`SemanticAnalyzer::new`]: no model path configured.
    fn default() -> Self {
        Self::new()
    }
}
164
165// ============================================================================
166// Semantic Compressor (for reducing content while preserving meaning)
167// ============================================================================
168
/// Configuration for semantic compression
///
/// Note: all "size" fields are compared against `str::len()`, i.e. they are
/// byte lengths of UTF-8 content, not character counts.
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Similarity threshold for clustering (0.0 - 1.0); chunks at or above
    /// this cosine similarity are grouped into the same cluster
    pub similarity_threshold: f32,
    /// Minimum chunk size in bytes
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes
    pub max_chunk_size: usize,
    /// Budget ratio (0.0 - 1.0) - target size relative to original
    pub budget_ratio: f32,
}
181
impl Default for SemanticConfig {
    /// Defaults: 0.7 similarity threshold, chunks between 100 and 2000 bytes,
    /// and a 50% size budget.
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}
192
/// A chunk of code
///
/// Produced by splitting a larger string; `start`/`end` index back into that
/// original string.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// The original content
    pub content: String,
    /// Start byte offset in original content
    pub start: usize,
    /// End byte offset in original content (exclusive)
    pub end: usize,
    /// Embedding vector (populated when the embeddings pipeline runs)
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment
    pub cluster_id: Option<usize>,
}
207
/// Semantic compressor for code content
///
/// Uses embeddings-based clustering when the `embeddings` feature is enabled,
/// otherwise falls back to heuristic-based compression.
pub struct SemanticCompressor {
    /// Tuning knobs (thresholds, chunk sizes, budget ratio)
    config: SemanticConfig,
    /// Semantic analyzer for generating embeddings and computing similarity
    analyzer: SemanticAnalyzer,
}
217
218impl SemanticCompressor {
219    /// Create a new semantic compressor with default config
220    pub fn new() -> Self {
221        Self::with_config(SemanticConfig::default())
222    }
223
224    /// Create a new semantic compressor with custom config
225    pub fn with_config(config: SemanticConfig) -> Self {
226        Self { config, analyzer: SemanticAnalyzer::new() }
227    }
228
229    /// Get a reference to the internal semantic analyzer
230    ///
231    /// This allows access to the analyzer for similarity computations
232    /// or custom embedding operations.
233    pub fn analyzer(&self) -> &SemanticAnalyzer {
234        &self.analyzer
235    }
236
237    /// Compress content semantically
238    ///
239    /// When the `embeddings` feature is enabled, uses neural embeddings
240    /// to cluster similar code chunks and select representatives.
241    ///
242    /// Without the feature, falls back to heuristic-based compression.
243    pub fn compress(&self, content: &str) -> Result<String> {
244        // First, check for repetitive content (Bug #6 fix)
245        if let Some(compressed) = self.compress_repetitive(content) {
246            return Ok(compressed);
247        }
248
249        #[cfg(feature = "embeddings")]
250        {
251            self.compress_with_embeddings(content)
252        }
253
254        #[cfg(not(feature = "embeddings"))]
255        {
256            self.compress_heuristic(content)
257        }
258    }
259
260    /// Detect and compress repetitive content (Bug #6 fix)
261    ///
262    /// Handles cases like "sentence ".repeat(500) by detecting the repeated pattern
263    /// and returning a compressed representation.
264    ///
265    /// This function is UTF-8 safe - it only slices at valid character boundaries.
266    fn compress_repetitive(&self, content: &str) -> Option<String> {
267        // Only process content above a minimum threshold
268        if content.len() < 200 {
269            return None;
270        }
271
272        // Try to find a repeating pattern
273        // Start with small patterns and work up
274        // We iterate byte positions but only consider those that are valid UTF-8 boundaries
275        for pattern_len in 1..=100.min(content.len() / 3) {
276            // Skip if this byte position is not a valid UTF-8 character boundary
277            if !content.is_char_boundary(pattern_len) {
278                continue;
279            }
280
281            let pattern = &content[..pattern_len];
282
283            // Skip patterns that are just whitespace
284            if pattern.chars().all(|c| c.is_whitespace()) {
285                continue;
286            }
287
288            // Count how many times this pattern repeats consecutively
289            let mut count = 0;
290            let mut pos = 0;
291            while pos + pattern_len <= content.len() {
292                // Ensure both slice boundaries are valid UTF-8
293                if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
294                    break;
295                }
296                if &content[pos..pos + pattern_len] == pattern {
297                    count += 1;
298                    pos += pattern_len;
299                } else {
300                    break;
301                }
302            }
303
304            // If pattern repeats enough times and covers most of the content
305            let coverage = (count * pattern_len) as f32 / content.len() as f32;
306            if count >= 3 && coverage >= 0.8 {
307                // Calculate how many instances to keep based on budget_ratio
308                let instances_to_show = (count as f32 * self.config.budget_ratio)
309                    .ceil()
310                    .clamp(1.0, 5.0) as usize;
311
312                let shown_content = pattern.repeat(instances_to_show);
313                // Safe: count * pattern_len is already at a valid boundary (start of next pattern or end)
314                let remainder_start = count * pattern_len;
315                let remainder = if remainder_start <= content.len()
316                    && content.is_char_boundary(remainder_start)
317                {
318                    &content[remainder_start..]
319                } else {
320                    ""
321                };
322
323                let result = if remainder.is_empty() {
324                    format!(
325                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
326                        shown_content.trim_end(),
327                        count,
328                        instances_to_show
329                    )
330                } else {
331                    format!(
332                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
333                        shown_content.trim_end(),
334                        count,
335                        instances_to_show,
336                        remainder.trim()
337                    )
338                };
339
340                return Some(result);
341            }
342        }
343
344        // Also detect line-based repetition (same line repeated many times)
345        let lines: Vec<&str> = content.lines().collect();
346        if lines.len() >= 3 {
347            let mut line_counts: HashMap<&str, usize> = HashMap::new();
348            for line in &lines {
349                *line_counts.entry(*line).or_insert(0) += 1;
350            }
351
352            // Find the most repeated line
353            if let Some((repeated_line, count)) = line_counts
354                .iter()
355                .filter(|(line, _)| !line.trim().is_empty())
356                .max_by_key(|(_, count)| *count)
357            {
358                let repetition_ratio = *count as f32 / lines.len() as f32;
359                if *count >= 3 && repetition_ratio >= 0.5 {
360                    // Build compressed output preserving unique lines
361                    let mut result = String::new();
362                    let mut consecutive_count = 0;
363                    let mut last_was_repeated = false;
364
365                    for line in &lines {
366                        if *line == *repeated_line {
367                            consecutive_count += 1;
368                            if !last_was_repeated {
369                                if !result.is_empty() {
370                                    result.push('\n');
371                                }
372                                result.push_str(line);
373                            }
374                            last_was_repeated = true;
375                        } else {
376                            if last_was_repeated && consecutive_count > 1 {
377                                result.push_str(&format!(
378                                    "\n/* ... above line repeated {} times ... */",
379                                    consecutive_count
380                                ));
381                            }
382                            consecutive_count = 0;
383                            last_was_repeated = false;
384                            if !result.is_empty() {
385                                result.push('\n');
386                            }
387                            result.push_str(line);
388                        }
389                    }
390
391                    if last_was_repeated && consecutive_count > 1 {
392                        result.push_str(&format!(
393                            "\n/* ... above line repeated {} times ... */",
394                            consecutive_count
395                        ));
396                    }
397
398                    // Only return if we actually compressed significantly
399                    if result.len() < content.len() / 2 {
400                        return Some(result);
401                    }
402                }
403            }
404        }
405
406        None
407    }
408
409    /// Split content into semantic chunks (Bug #6 fix - handles content without \n\n)
410    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
411        let mut chunks = Vec::new();
412        let mut current_start = 0;
413
414        // First try: Split on double newlines (paragraph-like boundaries)
415        for (i, _) in content.match_indices("\n\n") {
416            if i > current_start && i - current_start >= self.config.min_chunk_size {
417                let chunk_content = &content[current_start..i];
418                if chunk_content.len() <= self.config.max_chunk_size {
419                    chunks.push(CodeChunk {
420                        content: chunk_content.to_owned(),
421                        start: current_start,
422                        end: i,
423                        embedding: None,
424                        cluster_id: None,
425                    });
426                }
427                current_start = i + 2;
428            }
429        }
430
431        // Handle remaining content
432        if current_start < content.len() {
433            let remaining = &content[current_start..];
434            if remaining.len() >= self.config.min_chunk_size {
435                chunks.push(CodeChunk {
436                    content: remaining.to_owned(),
437                    start: current_start,
438                    end: content.len(),
439                    embedding: None,
440                    cluster_id: None,
441                });
442            }
443        }
444
445        // Fallback: If no chunks found (no \n\n separators), try single newlines
446        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
447            current_start = 0;
448            for (i, _) in content.match_indices('\n') {
449                if i > current_start && i - current_start >= self.config.min_chunk_size {
450                    let chunk_content = &content[current_start..i];
451                    if chunk_content.len() <= self.config.max_chunk_size {
452                        chunks.push(CodeChunk {
453                            content: chunk_content.to_owned(),
454                            start: current_start,
455                            end: i,
456                            embedding: None,
457                            cluster_id: None,
458                        });
459                    }
460                    current_start = i + 1;
461                }
462            }
463            // Handle remaining after single newline split
464            if current_start < content.len() {
465                let remaining = &content[current_start..];
466                if remaining.len() >= self.config.min_chunk_size {
467                    chunks.push(CodeChunk {
468                        content: remaining.to_owned(),
469                        start: current_start,
470                        end: content.len(),
471                        embedding: None,
472                        cluster_id: None,
473                    });
474                }
475            }
476        }
477
478        // Second fallback: If still no chunks, split by sentence boundaries (. followed by space)
479        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
480            current_start = 0;
481            for (i, _) in content.match_indices(". ") {
482                if i > current_start && i - current_start >= self.config.min_chunk_size {
483                    let chunk_content = &content[current_start..=i]; // include the period
484                    if chunk_content.len() <= self.config.max_chunk_size {
485                        chunks.push(CodeChunk {
486                            content: chunk_content.to_owned(),
487                            start: current_start,
488                            end: i + 1,
489                            embedding: None,
490                            cluster_id: None,
491                        });
492                    }
493                    current_start = i + 2;
494                }
495            }
496            // Handle remaining
497            if current_start < content.len() {
498                let remaining = &content[current_start..];
499                if remaining.len() >= self.config.min_chunk_size {
500                    chunks.push(CodeChunk {
501                        content: remaining.to_owned(),
502                        start: current_start,
503                        end: content.len(),
504                        embedding: None,
505                        cluster_id: None,
506                    });
507                }
508            }
509        }
510
511        // Final fallback: If content is large but can't be split, force split by max_chunk_size
512        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
513            let mut pos = 0;
514            while pos < content.len() {
515                let end = (pos + self.config.max_chunk_size).min(content.len());
516                chunks.push(CodeChunk {
517                    content: content[pos..end].to_owned(),
518                    start: pos,
519                    end,
520                    embedding: None,
521                    cluster_id: None,
522                });
523                pos = end;
524            }
525        }
526
527        chunks
528    }
529
530    /// Compress using heuristic methods (fallback when embeddings unavailable)
531    ///
532    /// Bug #4 fix: Make budget_ratio more effective for all content types
533    /// Bug fix: Ensure budget_ratio always has an effect when < 1.0
534    fn compress_heuristic(&self, content: &str) -> Result<String> {
535        let chunks = self.split_into_chunks(content);
536
537        // When no chunks can be created, apply character-level truncation based on budget_ratio.
538        // This ensures budget_ratio always has an effect, even for small/unstructured content.
539        if chunks.is_empty() {
540            // Apply truncation if:
541            // 1. budget_ratio < 1.0 (user wants compression)
542            // 2. Content is at least 10 chars (very short content passes through)
543            // 3. The truncation would actually reduce the size
544            if self.config.budget_ratio < 1.0 && content.len() >= 10 {
545                let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
546                if target_len > 0 && target_len < content.len() {
547                    // Find a safe truncation point (word/line boundary)
548                    let truncate_at = find_safe_truncation_point(content, target_len);
549                    if truncate_at < content.len() && truncate_at > 0 {
550                        let truncated = &content[..truncate_at];
551                        return Ok(format!(
552                            "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
553                            truncated.trim_end(),
554                            self.config.budget_ratio * 100.0,
555                            truncate_at,
556                            content.len()
557                        ));
558                    }
559                }
560            }
561            return Ok(content.to_owned());
562        }
563
564        // Special case: If we only have one chunk and budget_ratio < 1.0,
565        // truncate within that chunk instead of keeping it entirely
566        if chunks.len() == 1 && self.config.budget_ratio < 1.0 {
567            let chunk_content = &chunks[0].content;
568            let target_len = (chunk_content.len() as f32 * self.config.budget_ratio) as usize;
569            if target_len > 0 && target_len < chunk_content.len() {
570                let truncate_at = find_safe_truncation_point(chunk_content, target_len);
571                if truncate_at < chunk_content.len() && truncate_at > 0 {
572                    let truncated = &chunk_content[..truncate_at];
573                    return Ok(format!(
574                        "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
575                        truncated.trim_end(),
576                        self.config.budget_ratio * 100.0,
577                        truncate_at,
578                        chunk_content.len()
579                    ));
580                }
581            }
582        }
583
584        // Keep every Nth chunk based on budget ratio
585        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
586        let step = chunks.len() / target_chunks.max(1);
587
588        let mut result = String::new();
589        let mut kept = 0;
590
591        for (i, chunk) in chunks.iter().enumerate() {
592            if i % step.max(1) == 0 && kept < target_chunks {
593                if !result.is_empty() {
594                    result.push_str("\n\n");
595                }
596                result.push_str(&chunk.content);
597                kept += 1;
598            }
599        }
600
601        // Add truncation marker if we removed content
602        if kept < chunks.len() {
603            result.push_str(&format!(
604                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
605                chunks.len() - kept,
606                (kept as f32 / chunks.len() as f32) * 100.0
607            ));
608        }
609
610        Ok(result)
611    }
612
613    /// Compress using neural embeddings
614    #[cfg(feature = "embeddings")]
615    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
616        let mut chunks = self.split_into_chunks(content);
617
618        // When no chunks can be created, apply character-level truncation based on budget_ratio.
619        // This ensures budget_ratio always has an effect, even for small/unstructured content.
620        if chunks.is_empty() {
621            // Apply truncation if:
622            // 1. budget_ratio < 1.0 (user wants compression)
623            // 2. Content is at least 10 chars (very short content passes through)
624            // 3. The truncation would actually reduce the size
625            if self.config.budget_ratio < 1.0 && content.len() >= 10 {
626                let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
627                if target_len > 0 && target_len < content.len() {
628                    // Find a safe truncation point (word/line boundary)
629                    let truncate_at = find_safe_truncation_point(content, target_len);
630                    if truncate_at < content.len() && truncate_at > 0 {
631                        let truncated = &content[..truncate_at];
632                        return Ok(format!(
633                            "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
634                            truncated.trim_end(),
635                            self.config.budget_ratio * 100.0,
636                            truncate_at,
637                            content.len()
638                        ));
639                    }
640                }
641            }
642            return Ok(content.to_owned());
643        }
644
645        // Generate embeddings for each chunk
646        for chunk in &mut chunks {
647            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
648        }
649
650        // Cluster similar chunks
651        let clusters = self.cluster_chunks(&chunks)?;
652
653        // If everything clusters together (only 1 cluster), fall back to sampling approach
654        // since clustering doesn't provide meaningful compression in this case
655        if clusters.len() == 1 {
656            // Keep every Nth chunk based on budget ratio (same as heuristic approach)
657            let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
658            let step = chunks.len() / target_chunks.max(1);
659
660            let mut result = String::new();
661            let mut kept = 0;
662
663            for (i, chunk) in chunks.iter().enumerate() {
664                if i % step.max(1) == 0 && kept < target_chunks {
665                    if !result.is_empty() {
666                        result.push_str("\n\n");
667                    }
668                    result.push_str(&chunk.content);
669                    kept += 1;
670                }
671            }
672
673            // Add truncation marker if we removed content
674            if kept < chunks.len() {
675                result.push_str(&format!(
676                    "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
677                    chunks.len() - kept,
678                    (kept as f32 / chunks.len() as f32) * 100.0
679                ));
680            }
681
682            return Ok(result);
683        }
684
685        // Multiple clusters: select representative from each cluster
686        let mut result = String::new();
687        for cluster in clusters.values() {
688            if let Some(representative) = self.select_representative(cluster) {
689                if !result.is_empty() {
690                    result.push_str("\n\n");
691                }
692                result.push_str(&representative.content);
693            }
694        }
695
696        Ok(result)
697    }
698
699    /// Cluster chunks by embedding similarity
700    #[cfg(feature = "embeddings")]
701    fn cluster_chunks<'a>(
702        &self,
703        chunks: &'a [CodeChunk],
704    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
705        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
706        let mut next_cluster = 0;
707
708        for chunk in chunks {
709            let embedding = chunk
710                .embedding
711                .as_ref()
712                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
713
714            // Find existing cluster with similar embedding
715            let mut target_cluster = None;
716            for (&cluster_id, cluster_chunks) in &clusters {
717                if let Some(first) = cluster_chunks.first() {
718                    if let Some(ref first_emb) = first.embedding {
719                        let similarity = cosine_similarity(embedding, first_emb);
720                        if similarity >= self.config.similarity_threshold {
721                            target_cluster = Some(cluster_id);
722                            break;
723                        }
724                    }
725                }
726            }
727
728            if let Some(cluster_id) = target_cluster {
729                if let Some(cluster) = clusters.get_mut(&cluster_id) {
730                    cluster.push(chunk);
731                }
732            } else {
733                clusters.insert(next_cluster, vec![chunk]);
734                next_cluster += 1;
735            }
736        }
737
738        Ok(clusters)
739    }
740
741    /// Select the best representative from a cluster
742    #[cfg(feature = "embeddings")]
743    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
744        // Select the longest chunk as representative (most informative)
745        chunks.iter().max_by_key(|c| c.content.len()).copied()
746    }
747}
748
impl Default for SemanticCompressor {
    /// Equivalent to [`SemanticCompressor::new`] (default configuration).
    fn default() -> Self {
        Self::new()
    }
}
754
// ============================================================================
// Honest Type Aliases
// ============================================================================
// The names below more accurately describe the implementation:
// - "Semantic" implies neural/ML understanding, but we use heuristics
// - These aliases are provided for clarity and recommended for new code

/// Alias for `SemanticAnalyzer` - more honest name reflecting the actual implementation.
///
/// This analyzer uses character-frequency heuristics for similarity detection,
/// NOT neural network embeddings. Use this alias when you want to be explicit
/// about the implementation approach.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alias for `SemanticCompressor` - more honest name reflecting the actual implementation.
///
/// This compressor uses chunk-based heuristics with optional character-frequency
/// clustering, NOT neural semantic understanding. Use this alias when you want
/// to be explicit about the implementation approach.
pub type HeuristicCompressor = SemanticCompressor;

/// Alias for `SemanticConfig` - more honest name.
pub type HeuristicCompressionConfig = SemanticConfig;
778
779// ============================================================================
780// Utility Functions
781// ============================================================================
782
/// Find a safe truncation point in content (word or line boundary).
///
/// Used by `compress_heuristic` to ensure we don't cut in the middle of a
/// word or a multi-byte UTF-8 character.
///
/// Returns a byte index `<= target_len` that is always a valid char
/// boundary, preferring (in order) a newline then a space, provided that
/// boundary lies past `target_len / 2`. If `target_len` covers the whole
/// string, returns `content.len()`.
fn find_safe_truncation_point(content: &str, target_len: usize) -> usize {
    if target_len >= content.len() {
        return content.len();
    }

    // Largest index <= target_len that sits on a UTF-8 char boundary.
    // Index 0 is always a boundary, so the search cannot fail.
    let safe_end = (0..=target_len)
        .rev()
        .find(|&i| content.is_char_boundary(i))
        .unwrap_or(0);
    let prefix = &content[..safe_end];

    // Prefer breaking at a newline, unless it sits too far behind the target.
    match prefix.rfind('\n') {
        Some(pos) if pos > target_len / 2 => return pos,
        _ => {}
    }

    // Otherwise break at the last space that is reasonably close.
    match prefix.rfind(' ') {
        Some(pos) if pos > target_len / 2 => return pos,
        _ => {}
    }

    // No good boundary found; fall back to the UTF-8-safe position.
    safe_end
}
816
/// Compute cosine similarity between two vectors
///
/// Returns a value between -1.0 and 1.0, where 1.0 indicates identical
/// direction, 0.0 indicates orthogonal vectors, and -1.0 indicates
/// opposite direction. Mismatched lengths, empty inputs, and zero-norm
/// vectors all yield 0.0.
///
/// # Note
/// This function is used by the embeddings feature for clustering and
/// is also tested directly. The `#[cfg_attr]` suppresses warnings in
/// builds without the embeddings feature.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Accumulate the dot product and both squared norms in a single pass.
    let mut dot = 0.0_f32;
    let mut sq_a = 0.0_f32;
    let mut sq_b = 0.0_f32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }

    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}
843
844// ============================================================================
845// Tests
846// ============================================================================
847
848#[cfg(test)]
849mod tests {
850    use super::*;
851
852    #[test]
853    fn test_analyzer_creation() {
854        let analyzer = SemanticAnalyzer::new();
855        // Verify analyzer is created successfully
856        // Model path is None by default (accessed via model_path() when embeddings enabled)
857        #[cfg(feature = "embeddings")]
858        assert!(analyzer.model_path().is_none());
859        #[cfg(not(feature = "embeddings"))]
860        drop(analyzer); // Explicitly drop to satisfy lint
861    }
862
863    #[test]
864    fn test_analyzer_with_model() {
865        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
866        #[cfg(feature = "embeddings")]
867        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
868        #[cfg(not(feature = "embeddings"))]
869        drop(analyzer); // Explicitly drop to satisfy lint
870    }
871
872    #[test]
873    fn test_compressor_analyzer_access() {
874        let compressor = SemanticCompressor::new();
875        // Verify we can access the analyzer through the compressor
876        let _analyzer = compressor.analyzer();
877    }
878
879    #[test]
880    fn test_semantic_config_default() {
881        let config = SemanticConfig::default();
882        assert_eq!(config.similarity_threshold, 0.7);
883        assert_eq!(config.budget_ratio, 0.5);
884    }
885
886    #[test]
887    fn test_split_into_chunks() {
888        let compressor = SemanticCompressor::with_config(SemanticConfig {
889            min_chunk_size: 10,
890            max_chunk_size: 1000,
891            ..Default::default()
892        });
893
894        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
895        let chunks = compressor.split_into_chunks(content);
896        assert!(chunks.len() >= 2);
897    }
898
899    #[test]
900    fn test_heuristic_compression() {
901        let compressor = SemanticCompressor::with_config(SemanticConfig {
902            min_chunk_size: 5,
903            max_chunk_size: 100,
904            budget_ratio: 0.5,
905            ..Default::default()
906        });
907
908        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
909        let result = compressor.compress_heuristic(content).unwrap();
910        // Should complete without error
911        assert!(!result.is_empty() || content.is_empty());
912    }
913
914    #[test]
915    fn test_empty_content() {
916        let compressor = SemanticCompressor::new();
917        let result = compressor.compress("").unwrap();
918        assert_eq!(result, "");
919    }
920
921    #[test]
922    fn test_cosine_similarity_identical() {
923        let a = vec![1.0, 0.0, 0.0];
924        let b = vec![1.0, 0.0, 0.0];
925        let sim = cosine_similarity(&a, &b);
926        assert!((sim - 1.0).abs() < 0.001);
927    }
928
929    #[test]
930    fn test_cosine_similarity_orthogonal() {
931        let a = vec![1.0, 0.0, 0.0];
932        let c = vec![0.0, 1.0, 0.0];
933        let sim = cosine_similarity(&a, &c);
934        assert!(sim.abs() < 0.001);
935    }
936
937    #[test]
938    fn test_cosine_similarity_empty() {
939        let a: Vec<f32> = vec![];
940        let b: Vec<f32> = vec![];
941        assert_eq!(cosine_similarity(&a, &b), 0.0);
942    }
943
944    // Bug #6 tests - repetitive content compression
945    #[test]
946    fn test_repetitive_pattern_compression() {
947        let compressor = SemanticCompressor::new();
948        // Test "sentence ".repeat(500) - exactly the reported bug case
949        let content = "sentence ".repeat(500);
950        let result = compressor.compress(&content).unwrap();
951
952        // Result should be significantly smaller than original
953        assert!(
954            result.len() < content.len() / 2,
955            "Compressed size {} should be less than half of original {}",
956            result.len(),
957            content.len()
958        );
959
960        // Should contain the pattern and a compression marker
961        assert!(result.contains("sentence"));
962        assert!(
963            result.contains("repeated") || result.contains("pattern"),
964            "Should indicate compression occurred"
965        );
966    }
967
968    #[test]
969    fn test_repetitive_line_compression() {
970        let compressor = SemanticCompressor::new();
971        // Test repeated lines
972        let content = "same line\n".repeat(100);
973        let result = compressor.compress(&content).unwrap();
974
975        // Result should be significantly smaller
976        assert!(
977            result.len() < content.len() / 2,
978            "Compressed size {} should be less than half of original {}",
979            result.len(),
980            content.len()
981        );
982    }
983
984    #[test]
985    fn test_non_repetitive_content_unchanged() {
986        // Use budget_ratio=1.0 to preserve content (default is 0.5 which truncates)
987        let compressor = SemanticCompressor::with_config(SemanticConfig {
988            budget_ratio: 1.0,
989            ..Default::default()
990        });
991        // Non-repetitive content should not trigger repetition compression
992        let content = "This is some unique content that does not repeat.";
993        let result = compressor.compress(content).unwrap();
994
995        // Short non-repetitive content should be returned as-is with budget_ratio=1.0
996        assert_eq!(result, content);
997    }
998
999    #[test]
1000    fn test_repetitive_with_variation() {
1001        let compressor = SemanticCompressor::with_config(SemanticConfig {
1002            budget_ratio: 0.3,
1003            ..Default::default()
1004        });
1005
1006        // Content with some repetition mixed with unique parts
1007        let mut content = String::new();
1008        for i in 0..50 {
1009            content.push_str(&format!("item {} ", i % 5)); // Repeated pattern with variation
1010        }
1011
1012        let result = compressor.compress(&content).unwrap();
1013        // This may or may not compress depending on pattern detection
1014        // Just verify it doesn't panic
1015        assert!(!result.is_empty());
1016    }
1017
1018    // UTF-8 boundary safety tests for compress_repetitive
1019    #[test]
1020    fn test_repetitive_unicode_chinese() {
1021        let compressor = SemanticCompressor::new();
1022        // Chinese characters are 3 bytes each
1023        // Create repeating Chinese pattern
1024        let content = "中文测试 ".repeat(100); // Each repeat is 13 bytes
1025        let result = compressor.compress(&content).unwrap();
1026
1027        // Should not panic and should produce valid UTF-8
1028        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1029
1030        // Should compress or return unchanged (not panic)
1031        assert!(!result.is_empty() || content.is_empty());
1032    }
1033
    #[test]
    fn test_repetitive_unicode_emoji() {
        let compressor = SemanticCompressor::new();
        // Emoji are 4 bytes each in UTF-8
        let content = "🎉🎊🎁 ".repeat(80); // Each repeat is 13 bytes (3 × 4-byte emoji + 1 space)

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }
1044
    #[test]
    fn test_repetitive_unicode_mixed() {
        let compressor = SemanticCompressor::new();
        // Mix of 1-, 3-, and 4-byte UTF-8 characters
        let content = "a中🎉 ".repeat(60); // Each repeat is 9 bytes (1 + 3 + 4 + 1-byte space)

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }
1055
1056    #[test]
1057    fn test_repetitive_unicode_cyrillic() {
1058        let compressor = SemanticCompressor::new();
1059        // Cyrillic characters are 2 bytes each
1060        let content = "Привет ".repeat(50);
1061
1062        let result = compressor.compress(&content).unwrap();
1063        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1064    }
1065
1066    #[test]
1067    fn test_non_repetitive_unicode_boundary() {
1068        let compressor = SemanticCompressor::new();
1069        // Content where pattern detection would try various byte lengths
1070        // that don't align with UTF-8 boundaries
1071        let content = "世界和平".repeat(60); // No spaces, pure multi-byte
1072
1073        let result = compressor.compress(&content).unwrap();
1074        // Should not panic even when pattern length iteration
1075        // hits non-UTF-8 boundaries
1076        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1077    }
1078
1079    #[test]
1080    fn test_repetitive_unicode_line_based() {
1081        let compressor = SemanticCompressor::new();
1082        // Test line-based repetition detection with Unicode
1083        let content = "中文行\n".repeat(100);
1084
1085        let result = compressor.compress(&content).unwrap();
1086        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
1087    }
1088
1089    // ==========================================================================
1090    // Additional coverage tests
1091    // ==========================================================================
1092
1093    #[test]
1094    fn test_semantic_error_display() {
1095        let err1 = SemanticError::ModelLoadError("test error".to_owned());
1096        assert!(err1.to_string().contains("Model loading failed"));
1097        assert!(err1.to_string().contains("test error"));
1098
1099        let err2 = SemanticError::EmbeddingError("embed fail".to_owned());
1100        assert!(err2.to_string().contains("Embedding generation failed"));
1101
1102        let err3 = SemanticError::ClusteringError("cluster fail".to_owned());
1103        assert!(err3.to_string().contains("Clustering failed"));
1104
1105        let err4 = SemanticError::FeatureNotEnabled;
1106        assert!(err4.to_string().contains("embeddings feature not enabled"));
1107    }
1108
1109    #[test]
1110    fn test_semantic_error_debug() {
1111        let err = SemanticError::ModelLoadError("debug test".to_owned());
1112        let debug_str = format!("{:?}", err);
1113        assert!(debug_str.contains("ModelLoadError"));
1114    }
1115
1116    #[test]
1117    fn test_semantic_analyzer_default() {
1118        let analyzer = SemanticAnalyzer::default();
1119        // Should work same as new()
1120        let result = analyzer.embed("test");
1121        assert!(result.is_ok());
1122    }
1123
1124    #[test]
1125    fn test_semantic_analyzer_debug() {
1126        let analyzer = SemanticAnalyzer::new();
1127        let debug_str = format!("{:?}", analyzer);
1128        assert!(debug_str.contains("SemanticAnalyzer"));
1129    }
1130
1131    #[test]
1132    fn test_semantic_analyzer_embed_empty() {
1133        let analyzer = SemanticAnalyzer::new();
1134        let result = analyzer.embed("").unwrap();
1135        assert_eq!(result.len(), 384);
1136    }
1137
1138    #[test]
1139    fn test_semantic_analyzer_embed_produces_384_dims() {
1140        let analyzer = SemanticAnalyzer::new();
1141        let result = analyzer.embed("some code content").unwrap();
1142        assert_eq!(result.len(), 384);
1143    }
1144
1145    #[test]
1146    fn test_semantic_analyzer_similarity_same_content() {
1147        let analyzer = SemanticAnalyzer::new();
1148        let result = analyzer.similarity("hello world", "hello world").unwrap();
1149        // Same content should have high similarity (1.0 in embeddings mode, 0.0 in fallback)
1150        #[cfg(feature = "embeddings")]
1151        assert!((result - 1.0).abs() < 0.01);
1152        #[cfg(not(feature = "embeddings"))]
1153        assert_eq!(result, 0.0);
1154    }
1155
1156    #[test]
1157    fn test_semantic_analyzer_similarity_different_content() {
1158        let analyzer = SemanticAnalyzer::new();
1159        let result = analyzer.similarity("hello", "goodbye").unwrap();
1160        // Result should be valid (0.0 in fallback mode)
1161        #[cfg(not(feature = "embeddings"))]
1162        assert_eq!(result, 0.0);
1163        #[cfg(feature = "embeddings")]
1164        assert!((-1.0..=1.0).contains(&result));
1165    }
1166
1167    #[test]
1168    fn test_semantic_config_custom() {
1169        let config = SemanticConfig {
1170            similarity_threshold: 0.9,
1171            min_chunk_size: 50,
1172            max_chunk_size: 5000,
1173            budget_ratio: 0.3,
1174        };
1175        assert_eq!(config.similarity_threshold, 0.9);
1176        assert_eq!(config.min_chunk_size, 50);
1177        assert_eq!(config.max_chunk_size, 5000);
1178        assert_eq!(config.budget_ratio, 0.3);
1179    }
1180
1181    #[test]
1182    fn test_semantic_config_clone() {
1183        let config = SemanticConfig::default();
1184        let cloned = config.clone();
1185        assert_eq!(cloned.similarity_threshold, config.similarity_threshold);
1186        assert_eq!(cloned.budget_ratio, config.budget_ratio);
1187    }
1188
1189    #[test]
1190    fn test_semantic_config_debug() {
1191        let config = SemanticConfig::default();
1192        let debug_str = format!("{:?}", config);
1193        assert!(debug_str.contains("SemanticConfig"));
1194        assert!(debug_str.contains("similarity_threshold"));
1195    }
1196
1197    #[test]
1198    fn test_code_chunk_debug() {
1199        let chunk = CodeChunk {
1200            content: "test content".to_owned(),
1201            start: 0,
1202            end: 12,
1203            embedding: None,
1204            cluster_id: None,
1205        };
1206        let debug_str = format!("{:?}", chunk);
1207        assert!(debug_str.contains("CodeChunk"));
1208        assert!(debug_str.contains("test content"));
1209    }
1210
1211    #[test]
1212    fn test_code_chunk_clone() {
1213        let chunk = CodeChunk {
1214            content: "original".to_owned(),
1215            start: 0,
1216            end: 8,
1217            embedding: Some(vec![0.1, 0.2, 0.3]),
1218            cluster_id: Some(5),
1219        };
1220        let cloned = chunk;
1221        assert_eq!(cloned.content, "original");
1222        assert_eq!(cloned.start, 0);
1223        assert_eq!(cloned.end, 8);
1224        assert_eq!(cloned.embedding, Some(vec![0.1, 0.2, 0.3]));
1225        assert_eq!(cloned.cluster_id, Some(5));
1226    }
1227
1228    #[test]
1229    fn test_semantic_compressor_default() {
1230        let compressor = SemanticCompressor::default();
1231        let result = compressor.compress("test").unwrap();
1232        assert_eq!(result, "test");
1233    }
1234
1235    #[test]
1236    fn test_split_into_chunks_single_newline_fallback() {
1237        let compressor = SemanticCompressor::with_config(SemanticConfig {
1238            min_chunk_size: 5,
1239            max_chunk_size: 1000,
1240            ..Default::default()
1241        });
1242
1243        // Content with only single newlines (no \n\n)
1244        let content = "Line 1 with content\nLine 2 with content\nLine 3 with content";
1245        let chunks = compressor.split_into_chunks(content);
1246        // Should use single newline fallback
1247        assert!(!chunks.is_empty() || content.len() < 5);
1248    }
1249
1250    #[test]
1251    fn test_split_into_chunks_sentence_fallback() {
1252        let compressor = SemanticCompressor::with_config(SemanticConfig {
1253            min_chunk_size: 10,
1254            max_chunk_size: 1000,
1255            ..Default::default()
1256        });
1257
1258        // Content with sentences but no newlines
1259        let content = "First sentence here. Second sentence here. Third sentence here.";
1260        let chunks = compressor.split_into_chunks(content);
1261        // Should use sentence boundary fallback
1262        assert!(!chunks.is_empty() || content.len() < 10);
1263    }
1264
1265    #[test]
1266    fn test_split_into_chunks_force_split() {
1267        let compressor = SemanticCompressor::with_config(SemanticConfig {
1268            min_chunk_size: 100, // Higher than content length so normal chunking fails
1269            max_chunk_size: 20,  // Lower than content length to trigger force split
1270            ..Default::default()
1271        });
1272
1273        // Content without any splitting characters, longer than max_chunk_size
1274        // but shorter than min_chunk_size (so normal chunking produces empty result)
1275        let content = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
1276        let chunks = compressor.split_into_chunks(content);
1277        // Should force split by max_chunk_size when no other splitting works
1278        assert!(
1279            chunks.len() >= 2,
1280            "Expected at least 2 chunks from force split, got {}",
1281            chunks.len()
1282        );
1283    }
1284
1285    #[test]
1286    fn test_split_into_chunks_empty() {
1287        let compressor = SemanticCompressor::new();
1288        let chunks = compressor.split_into_chunks("");
1289        assert!(chunks.is_empty());
1290    }
1291
1292    #[test]
1293    fn test_split_into_chunks_below_min_size() {
1294        let compressor = SemanticCompressor::with_config(SemanticConfig {
1295            min_chunk_size: 100,
1296            max_chunk_size: 1000,
1297            ..Default::default()
1298        });
1299
1300        let content = "short";
1301        let chunks = compressor.split_into_chunks(content);
1302        // Content too short for min_chunk_size
1303        assert!(chunks.is_empty());
1304    }
1305
1306    #[test]
1307    fn test_compress_heuristic_empty_chunks() {
1308        let compressor = SemanticCompressor::with_config(SemanticConfig {
1309            min_chunk_size: 1000, // Force no chunks to be created
1310            budget_ratio: 1.0,    // Use 1.0 to preserve content unchanged
1311            ..Default::default()
1312        });
1313
1314        let content = "short content";
1315        let result = compressor.compress_heuristic(content).unwrap();
1316        // Should return original when no chunks created and budget_ratio=1.0
1317        assert_eq!(result, content);
1318    }
1319
1320    #[test]
1321    fn test_compress_heuristic_multiple_chunks() {
1322        let compressor = SemanticCompressor::with_config(SemanticConfig {
1323            min_chunk_size: 10,
1324            max_chunk_size: 100,
1325            budget_ratio: 0.3,
1326            ..Default::default()
1327        });
1328
1329        let content = "First chunk content here\n\nSecond chunk content here\n\nThird chunk content here\n\nFourth chunk content";
1330        let result = compressor.compress_heuristic(content).unwrap();
1331        // Should have compression marker if chunks were removed
1332        assert!(result.contains("chunk") || result.contains("compressed"));
1333    }
1334
1335    #[test]
1336    fn test_cosine_similarity_different_lengths() {
1337        let a = vec![1.0, 2.0, 3.0];
1338        let b = vec![1.0, 2.0];
1339        let sim = cosine_similarity(&a, &b);
1340        assert_eq!(sim, 0.0); // Different lengths should return 0
1341    }
1342
1343    #[test]
1344    fn test_cosine_similarity_zero_vectors() {
1345        let a = vec![0.0, 0.0, 0.0];
1346        let b = vec![1.0, 2.0, 3.0];
1347        let sim = cosine_similarity(&a, &b);
1348        assert_eq!(sim, 0.0); // Zero norm should return 0
1349    }
1350
1351    #[test]
1352    fn test_cosine_similarity_opposite() {
1353        let a = vec![1.0, 0.0, 0.0];
1354        let b = vec![-1.0, 0.0, 0.0];
1355        let sim = cosine_similarity(&a, &b);
1356        assert!((sim + 1.0).abs() < 0.001); // Opposite directions = -1.0
1357    }
1358
1359    #[test]
1360    fn test_cosine_similarity_normalized() {
1361        let a = vec![0.6, 0.8, 0.0];
1362        let b = vec![0.6, 0.8, 0.0];
1363        let sim = cosine_similarity(&a, &b);
1364        assert!((sim - 1.0).abs() < 0.001);
1365    }
1366
1367    #[test]
1368    fn test_compress_repetitive_short_content() {
1369        let compressor = SemanticCompressor::new();
1370        // Content below 200 chars should not trigger repetition compression
1371        let content = "short ".repeat(10); // 60 chars
1372        let result = compressor.compress_repetitive(&content);
1373        assert!(result.is_none());
1374    }
1375
1376    #[test]
1377    fn test_compress_repetitive_whitespace_only() {
1378        let compressor = SemanticCompressor::new();
1379        // Whitespace-only patterns should be skipped
1380        let content = "   ".repeat(100);
1381        let result = compressor.compress_repetitive(&content);
1382        // Should not compress whitespace-only patterns
1383        assert!(result.is_none());
1384    }
1385
1386    #[test]
1387    fn test_compress_repetitive_low_coverage() {
1388        let compressor = SemanticCompressor::new();
1389        // Pattern that doesn't cover 80% of content
1390        let mut content = "pattern ".repeat(5);
1391        content.push_str(&"x".repeat(200)); // Add non-repeating content
1392        let result = compressor.compress_repetitive(&content);
1393        // Low coverage should not trigger compression
1394        assert!(result.is_none());
1395    }
1396
1397    #[test]
1398    fn test_compress_repetitive_line_low_ratio() {
1399        let compressor = SemanticCompressor::new();
1400        // Lines where no single line repeats enough
1401        let content = (0..20)
1402            .map(|i| format!("unique line {}", i))
1403            .collect::<Vec<_>>()
1404            .join("\n");
1405        let result = compressor.compress_repetitive(&content);
1406        // No significant repetition
1407        assert!(result.is_none());
1408    }
1409
1410    #[test]
1411    fn test_compress_repetitive_mixed_with_unique() {
1412        let compressor = SemanticCompressor::new();
1413        // Repeated line mixed with unique lines
1414        let mut lines = vec![];
1415        for i in 0..50 {
1416            if i % 2 == 0 {
1417                lines.push("repeated line");
1418            } else {
1419                lines.push("unique line");
1420            }
1421        }
1422        let content = lines.join("\n");
1423        let result = compressor.compress(&content).unwrap();
1424        // Should handle mixed content
1425        assert!(!result.is_empty());
1426    }
1427
1428    #[test]
1429    fn test_compress_no_repetition_returns_none() {
1430        let compressor = SemanticCompressor::new();
1431        // Unique content that doesn't repeat
1432        let content = "The quick brown fox jumps over the lazy dog. ".repeat(5);
1433        // Each sentence is unique enough
1434        let result = compressor.compress_repetitive(&content);
1435        // Depends on pattern length detection - may or may not find pattern
1436        // Just verify no panic
1437        drop(result);
1438    }
1439
1440    #[test]
1441    fn test_type_aliases() {
1442        // Test that type aliases work correctly
1443        let _analyzer: CharacterFrequencyAnalyzer = SemanticAnalyzer::new();
1444        let _compressor: HeuristicCompressor = SemanticCompressor::new();
1445        let _config: HeuristicCompressionConfig = SemanticConfig::default();
1446    }
1447
1448    #[test]
1449    fn test_compress_preserves_content_structure() {
1450        let compressor = SemanticCompressor::with_config(SemanticConfig {
1451            min_chunk_size: 10,
1452            max_chunk_size: 500,
1453            budget_ratio: 1.0, // Keep everything
1454            ..Default::default()
1455        });
1456
1457        let content = "def foo():\n    pass\n\ndef bar():\n    pass";
1458        let result = compressor.compress(content).unwrap();
1459        // With budget_ratio 1.0, should keep most content
1460        assert!(result.contains("foo") || result.contains("bar"));
1461    }
1462
1463    #[test]
1464    fn test_split_chunks_respects_max_size() {
1465        let compressor = SemanticCompressor::with_config(SemanticConfig {
1466            min_chunk_size: 5,
1467            max_chunk_size: 50,
1468            ..Default::default()
1469        });
1470
1471        let content = "A very long chunk that exceeds the max size limit\n\nAnother chunk";
1472        let chunks = compressor.split_into_chunks(content);
1473
1474        for chunk in &chunks {
1475            assert!(chunk.content.len() <= 50, "Chunk size {} exceeds max 50", chunk.content.len());
1476        }
1477    }
1478
1479    #[test]
1480    fn test_compress_repetitive_with_remainder() {
1481        let compressor = SemanticCompressor::new();
1482        // Pattern that repeats but has a small remainder
1483        let mut content = "abc ".repeat(100);
1484        content.push_str("xyz"); // Add non-repeating remainder
1485
1486        let result = compressor.compress(&content).unwrap();
1487        // Should compress and handle remainder
1488        assert!(!result.is_empty());
1489    }
1490
1491    #[test]
1492    fn test_compressor_analyzer_method() {
1493        let compressor = SemanticCompressor::new();
1494        let analyzer = compressor.analyzer();
1495
1496        // Verify the analyzer works
1497        let embed_result = analyzer.embed("test code");
1498        assert!(embed_result.is_ok());
1499    }
1500
1501    #[test]
1502    fn test_code_chunk_with_embedding_and_cluster() {
1503        let chunk = CodeChunk {
1504            content: "fn main() {}".to_owned(),
1505            start: 0,
1506            end: 12,
1507            embedding: Some(vec![0.5; 384]),
1508            cluster_id: Some(3),
1509        };
1510
1511        assert_eq!(chunk.content, "fn main() {}");
1512        assert_eq!(chunk.start, 0);
1513        assert_eq!(chunk.end, 12);
1514        assert!(chunk.embedding.is_some());
1515        assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384);
1516        assert_eq!(chunk.cluster_id, Some(3));
1517    }
1518
1519    #[test]
1520    fn test_compress_very_long_repetitive() {
1521        let compressor = SemanticCompressor::with_config(SemanticConfig {
1522            budget_ratio: 0.2, // Aggressive compression
1523            ..Default::default()
1524        });
1525
1526        // Very long repetitive content
1527        let content = "repeated_token ".repeat(1000);
1528        let result = compressor.compress(&content).unwrap();
1529
1530        // Should significantly compress
1531        assert!(result.len() < content.len() / 3);
1532        assert!(result.contains("repeated"));
1533    }
1534
1535    #[test]
1536    fn test_semantic_result_type_ok() {
1537        let result: Result<String> = Ok("success".to_owned());
1538        assert!(result.is_ok());
1539        assert_eq!(result.unwrap(), "success");
1540    }
1541
1542    #[test]
1543    fn test_semantic_result_type_err() {
1544        let result: Result<String> = Err(SemanticError::FeatureNotEnabled);
1545        assert!(result.is_err());
1546    }
1547
1548    // Bug #4 fix tests - budget_ratio effectiveness
1549    #[test]
1550    fn test_find_safe_truncation_point_basic() {
1551        let content = "Hello world this is a test";
1552        let point = find_safe_truncation_point(content, 15);
1553        // Should find a word boundary
1554        assert!(content.is_char_boundary(point));
1555        assert!(point <= 15 || point == content.len());
1556    }
1557
1558    #[test]
1559    fn test_find_safe_truncation_point_newline() {
1560        let content = "Line one\nLine two\nLine three";
1561        let point = find_safe_truncation_point(content, 20);
1562        // Should prefer newline boundary
1563        assert!(content.is_char_boundary(point));
1564    }
1565
1566    #[test]
1567    fn test_find_safe_truncation_point_unicode() {
1568        let content = "Hello 世界 test";
1569        let point = find_safe_truncation_point(content, 10);
1570        // Should not cut in middle of UTF-8 character
1571        assert!(content.is_char_boundary(point));
1572    }
1573
1574    #[test]
1575    fn test_find_safe_truncation_point_beyond_length() {
1576        let content = "short";
1577        let point = find_safe_truncation_point(content, 100);
1578        assert_eq!(point, content.len());
1579    }
1580
1581    #[test]
1582    fn test_budget_ratio_affects_large_content() {
1583        // Test that budget_ratio affects compression of content with paragraph breaks
1584        // This tests the chunk-based compression path
1585        let content = (0..20)
1586            .map(|i| {
1587                format!("This is paragraph number {} with some content to fill it out nicely.", i)
1588            })
1589            .collect::<Vec<_>>()
1590            .join("\n\n");
1591
1592        // Test with different budget ratios
1593        let compressor_30 = SemanticCompressor::with_config(SemanticConfig {
1594            budget_ratio: 0.3,
1595            min_chunk_size: 20,
1596            max_chunk_size: 2000,
1597            ..Default::default()
1598        });
1599
1600        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
1601            budget_ratio: 0.8,
1602            min_chunk_size: 20,
1603            max_chunk_size: 2000,
1604            ..Default::default()
1605        });
1606
1607        let result_30 = compressor_30.compress(&content).unwrap();
1608        let result_80 = compressor_80.compress(&content).unwrap();
1609
1610        // Lower budget ratio should produce shorter output
1611        assert!(
1612            result_30.len() < result_80.len(),
1613            "30% budget ({}) should be smaller than 80% budget ({})",
1614            result_30.len(),
1615            result_80.len()
1616        );
1617
1618        // Both should indicate compression occurred
1619        assert!(
1620            result_30.contains("compressed") || result_30.len() < content.len(),
1621            "30% should show compression indicator"
1622        );
1623    }
1624
1625    #[test]
1626    fn test_budget_ratio_one_returns_original() {
1627        let content = "Some content without chunk boundaries";
1628
1629        let compressor = SemanticCompressor::with_config(SemanticConfig {
1630            budget_ratio: 1.0, // Keep everything
1631            ..Default::default()
1632        });
1633
1634        let result = compressor.compress(content).unwrap();
1635        // With budget_ratio 1.0, should return original content
1636        assert_eq!(result, content);
1637    }
1638
1639    // ==========================================================================
1640    // Bug #4 Fix Tests - budget_ratio effectiveness for small content
1641    // ==========================================================================
1642
1643    /// Test that budget_ratio affects content >= 10 chars even without chunk boundaries.
1644    /// This was the bug: small content wasn't being truncated because the threshold
1645    /// was set to min_chunk_size (100) instead of a lower value (10).
1646    #[test]
1647    fn test_budget_ratio_affects_small_content() {
1648        // Content that's over 10 chars but has no chunk boundaries
1649        // Previously this wouldn't be compressed because it was under min_chunk_size
1650        let content = "This is a short test string that should be affected by budget ratio.";
1651
1652        let compressor = SemanticCompressor::with_config(SemanticConfig {
1653            budget_ratio: 0.3, // Keep only 30%
1654            min_chunk_size: 100,
1655            max_chunk_size: 2000,
1656            ..Default::default()
1657        });
1658
1659        let result = compressor.compress(content).unwrap();
1660
1661        // With budget_ratio 0.3, content should be truncated
1662        assert!(
1663            result.len() < content.len() || result.contains("truncated"),
1664            "Small content with budget_ratio=0.3 should be compressed. Original: {}, Result: {}",
1665            content.len(),
1666            result.len()
1667        );
1668    }
1669
1670    /// Test that budget_ratio 1.0 preserves small content
1671    #[test]
1672    fn test_budget_ratio_one_preserves_small_content() {
1673        let content = "Short content that should remain unchanged with budget_ratio=1.0";
1674
1675        let compressor = SemanticCompressor::with_config(SemanticConfig {
1676            budget_ratio: 1.0,
1677            min_chunk_size: 100,
1678            max_chunk_size: 2000,
1679            ..Default::default()
1680        });
1681
1682        let result = compressor.compress(content).unwrap();
1683
1684        // With budget_ratio 1.0, should return original
1685        assert_eq!(result, content, "budget_ratio=1.0 should preserve content");
1686    }
1687
1688    /// Test that very short content (< 10 chars) passes through unchanged
1689    #[test]
1690    fn test_very_short_content_unchanged() {
1691        let content = "tiny";
1692
1693        let compressor = SemanticCompressor::with_config(SemanticConfig {
1694            budget_ratio: 0.1, // Even aggressive budget shouldn't affect very short content
1695            ..Default::default()
1696        });
1697
1698        let result = compressor.compress(content).unwrap();
1699
1700        // Very short content should pass through
1701        assert_eq!(result, content, "Very short content should be unchanged");
1702    }
1703
1704    /// Test that budget_ratio affects medium content without chunk boundaries
1705    #[test]
1706    fn test_budget_ratio_medium_no_chunks() {
1707        // Content that's long enough to compress but has no paragraph breaks
1708        let content = "This is a medium length test content that has no paragraph breaks and should trigger the budget ratio truncation path because there are no chunk boundaries.";
1709
1710        let compressor = SemanticCompressor::with_config(SemanticConfig {
1711            budget_ratio: 0.5,
1712            min_chunk_size: 200, // Higher than content length
1713            max_chunk_size: 2000,
1714            ..Default::default()
1715        });
1716
1717        let result = compressor.compress(content).unwrap();
1718
1719        // Should be compressed to ~50%
1720        assert!(
1721            result.len() < content.len(),
1722            "Medium content with budget_ratio=0.5 should be compressed. Original: {}, Result: {}",
1723            content.len(),
1724            result.len()
1725        );
1726    }
1727
1728    /// Test that truncation marker includes percentage and char counts
1729    #[test]
1730    fn test_truncation_marker_format() {
1731        let content = "A sufficiently long piece of content that will definitely be truncated when we set a low budget ratio.";
1732
1733        let compressor = SemanticCompressor::with_config(SemanticConfig {
1734            budget_ratio: 0.3,
1735            min_chunk_size: 200,
1736            max_chunk_size: 2000,
1737            ..Default::default()
1738        });
1739
1740        let result = compressor.compress(content).unwrap();
1741
1742        // Should contain truncation marker with useful info
1743        if result.contains("truncated") {
1744            assert!(result.contains('%'), "Truncation marker should include percentage");
1745            assert!(result.contains("chars"), "Truncation marker should include char count");
1746        }
1747    }
1748
1749    /// Test different budget ratios produce proportionally different outputs
1750    #[test]
1751    fn test_budget_ratio_proportional() {
1752        let content = "This content is long enough to test different budget ratio values and see that they produce outputs of proportionally different sizes as expected.";
1753
1754        let compressor_20 = SemanticCompressor::with_config(SemanticConfig {
1755            budget_ratio: 0.2,
1756            min_chunk_size: 200,
1757            ..Default::default()
1758        });
1759
1760        let compressor_50 = SemanticCompressor::with_config(SemanticConfig {
1761            budget_ratio: 0.5,
1762            min_chunk_size: 200,
1763            ..Default::default()
1764        });
1765
1766        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
1767            budget_ratio: 0.8,
1768            min_chunk_size: 200,
1769            ..Default::default()
1770        });
1771
1772        let result_20 = compressor_20.compress(content).unwrap();
1773        let result_50 = compressor_50.compress(content).unwrap();
1774        let result_80 = compressor_80.compress(content).unwrap();
1775
1776        // Lower ratio should produce shorter output
1777        assert!(
1778            result_20.len() <= result_50.len(),
1779            "20% ratio ({}) should be <= 50% ratio ({})",
1780            result_20.len(),
1781            result_50.len()
1782        );
1783        assert!(
1784            result_50.len() <= result_80.len(),
1785            "50% ratio ({}) should be <= 80% ratio ({})",
1786            result_50.len(),
1787            result_80.len()
1788        );
1789    }
1790}