1#[cfg(feature = "embeddings")]
33use std::collections::HashMap;
34
35pub type Result<T> = std::result::Result<T, SemanticError>;
37
38#[derive(Debug, thiserror::Error)]
40pub enum SemanticError {
41 #[error("Model loading failed: {0}")]
42 ModelLoadError(String),
43
44 #[error("Embedding generation failed: {0}")]
45 EmbeddingError(String),
46
47 #[error("Clustering failed: {0}")]
48 ClusteringError(String),
49
50 #[error("Feature not available: embeddings feature not enabled")]
51 FeatureNotEnabled,
52}
53
/// Produces fixed-size embedding vectors for text and compares them.
///
/// The optional model path is stored under a different field name depending on
/// whether the `embeddings` feature is enabled; without the feature the field
/// is never read, hence the leading underscore.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Model location, as passed to `with_model`.
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Model location, kept only so construction works without the feature.
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}
72
73impl SemanticAnalyzer {
74 pub fn new() -> Self {
76 Self {
77 #[cfg(feature = "embeddings")]
78 model_path: None,
79 #[cfg(not(feature = "embeddings"))]
80 _model_path: None,
81 }
82 }
83
84 pub fn with_model(model_path: &str) -> Self {
89 Self {
90 #[cfg(feature = "embeddings")]
91 model_path: Some(model_path.to_owned()),
92 #[cfg(not(feature = "embeddings"))]
93 _model_path: Some(model_path.to_owned()),
94 }
95 }
96
97 #[cfg(feature = "embeddings")]
99 pub fn model_path(&self) -> Option<&str> {
100 self.model_path.as_deref()
101 }
102
103 #[cfg(feature = "embeddings")]
121 pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122 let mut embedding = vec![0.0f32; 384];
124 for (i, c) in content.chars().enumerate() {
125 let idx = (c as usize) % 384;
126 embedding[idx] += 1.0 / ((i + 1) as f32);
128 }
129 let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131 if norm > 0.0 {
132 for x in &mut embedding {
133 *x /= norm;
134 }
135 }
136 Ok(embedding)
137 }
138
139 #[cfg(not(feature = "embeddings"))]
141 pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142 Ok(vec![0.0; 384])
143 }
144
145 #[cfg(feature = "embeddings")]
147 pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148 let emb_a = self.embed(a)?;
149 let emb_b = self.embed(b)?;
150 Ok(cosine_similarity(&emb_a, &emb_b))
151 }
152
153 #[cfg(not(feature = "embeddings"))]
155 pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156 Ok(0.0)
157 }
158}
159
160impl Default for SemanticAnalyzer {
161 fn default() -> Self {
162 Self::new()
163 }
164}
165
/// Tuning knobs for [`SemanticCompressor`].
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Minimum cosine similarity for a chunk to join an existing cluster.
    pub similarity_threshold: f32,
    /// Smallest chunk, in bytes, that the splitter will emit.
    pub min_chunk_size: usize,
    /// Largest chunk, in bytes, that the separator-based splitter will emit.
    pub max_chunk_size: usize,
    /// Fraction of the input (chunks, repetitions or bytes) to keep.
    pub budget_ratio: f32,
}
182
183impl Default for SemanticConfig {
184 fn default() -> Self {
185 Self {
186 similarity_threshold: 0.7,
187 min_chunk_size: 100,
188 max_chunk_size: 2000,
189 budget_ratio: 0.5,
190 }
191 }
192}
193
/// A slice of the input produced by the splitter.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Owned copy of the chunk text.
    pub content: String,
    /// Byte offset in the source text where the chunk begins.
    pub start: usize,
    /// Byte offset in the source text just past the chunk.
    pub end: usize,
    /// Embedding of `content`, once one has been computed.
    pub embedding: Option<Vec<f32>>,
    /// Cluster identifier, if one has been assigned.
    pub cluster_id: Option<usize>,
}
208
209pub struct SemanticCompressor {
214 config: SemanticConfig,
215 analyzer: SemanticAnalyzer,
217}
218
219impl SemanticCompressor {
220 pub fn new() -> Self {
222 Self::with_config(SemanticConfig::default())
223 }
224
225 pub fn with_config(config: SemanticConfig) -> Self {
227 Self { config, analyzer: SemanticAnalyzer::new() }
228 }
229
230 pub fn analyzer(&self) -> &SemanticAnalyzer {
235 &self.analyzer
236 }
237
238 pub fn compress(&self, content: &str) -> Result<String> {
245 if let Some(compressed) = self.compress_repetitive(content) {
247 return Ok(compressed);
248 }
249
250 #[cfg(feature = "embeddings")]
251 {
252 return self.compress_with_embeddings(content);
253 }
254
255 #[cfg(not(feature = "embeddings"))]
256 {
257 self.compress_heuristic(content)
258 }
259 }
260
261 fn compress_repetitive(&self, content: &str) -> Option<String> {
268 if content.len() < 200 {
270 return None;
271 }
272
273 for pattern_len in 1..=100.min(content.len() / 3) {
277 if !content.is_char_boundary(pattern_len) {
279 continue;
280 }
281
282 let pattern = &content[..pattern_len];
283
284 if pattern.chars().all(|c| c.is_whitespace()) {
286 continue;
287 }
288
289 let mut count = 0;
291 let mut pos = 0;
292 while pos + pattern_len <= content.len() {
293 if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
295 break;
296 }
297 if &content[pos..pos + pattern_len] == pattern {
298 count += 1;
299 pos += pattern_len;
300 } else {
301 break;
302 }
303 }
304
305 let coverage = (count * pattern_len) as f32 / content.len() as f32;
307 if count >= 3 && coverage >= 0.8 {
308 let instances_to_show = (count as f32 * self.config.budget_ratio)
310 .ceil()
311 .clamp(1.0, 5.0) as usize;
312
313 let shown_content = pattern.repeat(instances_to_show);
314 let remainder_start = count * pattern_len;
316 let remainder = if remainder_start <= content.len()
317 && content.is_char_boundary(remainder_start)
318 {
319 &content[remainder_start..]
320 } else {
321 ""
322 };
323
324 let result = if remainder.is_empty() {
325 format!(
326 "{}\n/* ... pattern repeated {} times (showing {}) ... */",
327 shown_content.trim_end(),
328 count,
329 instances_to_show
330 )
331 } else {
332 format!(
333 "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
334 shown_content.trim_end(),
335 count,
336 instances_to_show,
337 remainder.trim()
338 )
339 };
340
341 return Some(result);
342 }
343 }
344
345 let lines: Vec<&str> = content.lines().collect();
347 if lines.len() >= 3 {
348 let mut line_counts: std::collections::HashMap<&str, usize> =
349 std::collections::HashMap::new();
350 for line in &lines {
351 *line_counts.entry(*line).or_insert(0) += 1;
352 }
353
354 if let Some((repeated_line, count)) = line_counts
356 .iter()
357 .filter(|(line, _)| !line.trim().is_empty())
358 .max_by_key(|(_, count)| *count)
359 {
360 let repetition_ratio = *count as f32 / lines.len() as f32;
361 if *count >= 3 && repetition_ratio >= 0.5 {
362 let mut result = String::new();
364 let mut consecutive_count = 0;
365 let mut last_was_repeated = false;
366
367 for line in &lines {
368 if *line == *repeated_line {
369 consecutive_count += 1;
370 if !last_was_repeated {
371 if !result.is_empty() {
372 result.push('\n');
373 }
374 result.push_str(line);
375 }
376 last_was_repeated = true;
377 } else {
378 if last_was_repeated && consecutive_count > 1 {
379 result.push_str(&format!(
380 "\n/* ... above line repeated {} times ... */",
381 consecutive_count
382 ));
383 }
384 consecutive_count = 0;
385 last_was_repeated = false;
386 if !result.is_empty() {
387 result.push('\n');
388 }
389 result.push_str(line);
390 }
391 }
392
393 if last_was_repeated && consecutive_count > 1 {
394 result.push_str(&format!(
395 "\n/* ... above line repeated {} times ... */",
396 consecutive_count
397 ));
398 }
399
400 if result.len() < content.len() / 2 {
402 return Some(result);
403 }
404 }
405 }
406 }
407
408 None
409 }
410
411 fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
413 let mut chunks = Vec::new();
414 let mut current_start = 0;
415
416 for (i, _) in content.match_indices("\n\n") {
418 if i > current_start && i - current_start >= self.config.min_chunk_size {
419 let chunk_content = &content[current_start..i];
420 if chunk_content.len() <= self.config.max_chunk_size {
421 chunks.push(CodeChunk {
422 content: chunk_content.to_owned(),
423 start: current_start,
424 end: i,
425 embedding: None,
426 cluster_id: None,
427 });
428 }
429 current_start = i + 2;
430 }
431 }
432
433 if current_start < content.len() {
435 let remaining = &content[current_start..];
436 if remaining.len() >= self.config.min_chunk_size {
437 chunks.push(CodeChunk {
438 content: remaining.to_owned(),
439 start: current_start,
440 end: content.len(),
441 embedding: None,
442 cluster_id: None,
443 });
444 }
445 }
446
447 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
449 current_start = 0;
450 for (i, _) in content.match_indices('\n') {
451 if i > current_start && i - current_start >= self.config.min_chunk_size {
452 let chunk_content = &content[current_start..i];
453 if chunk_content.len() <= self.config.max_chunk_size {
454 chunks.push(CodeChunk {
455 content: chunk_content.to_owned(),
456 start: current_start,
457 end: i,
458 embedding: None,
459 cluster_id: None,
460 });
461 }
462 current_start = i + 1;
463 }
464 }
465 if current_start < content.len() {
467 let remaining = &content[current_start..];
468 if remaining.len() >= self.config.min_chunk_size {
469 chunks.push(CodeChunk {
470 content: remaining.to_owned(),
471 start: current_start,
472 end: content.len(),
473 embedding: None,
474 cluster_id: None,
475 });
476 }
477 }
478 }
479
480 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
482 current_start = 0;
483 for (i, _) in content.match_indices(". ") {
484 if i > current_start && i - current_start >= self.config.min_chunk_size {
485 let chunk_content = &content[current_start..=i]; if chunk_content.len() <= self.config.max_chunk_size {
487 chunks.push(CodeChunk {
488 content: chunk_content.to_owned(),
489 start: current_start,
490 end: i + 1,
491 embedding: None,
492 cluster_id: None,
493 });
494 }
495 current_start = i + 2;
496 }
497 }
498 if current_start < content.len() {
500 let remaining = &content[current_start..];
501 if remaining.len() >= self.config.min_chunk_size {
502 chunks.push(CodeChunk {
503 content: remaining.to_owned(),
504 start: current_start,
505 end: content.len(),
506 embedding: None,
507 cluster_id: None,
508 });
509 }
510 }
511 }
512
513 if chunks.is_empty() && content.len() > self.config.max_chunk_size {
515 let mut pos = 0;
516 while pos < content.len() {
517 let end = (pos + self.config.max_chunk_size).min(content.len());
518 chunks.push(CodeChunk {
519 content: content[pos..end].to_owned(),
520 start: pos,
521 end,
522 embedding: None,
523 cluster_id: None,
524 });
525 pos = end;
526 }
527 }
528
529 chunks
530 }
531
532 fn compress_heuristic(&self, content: &str) -> Result<String> {
537 let chunks = self.split_into_chunks(content);
538
539 if chunks.is_empty() {
542 if self.config.budget_ratio < 1.0 && content.len() >= 10 {
547 let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
548 if target_len > 0 && target_len < content.len() {
549 let truncate_at = find_safe_truncation_point(content, target_len);
551 if truncate_at < content.len() && truncate_at > 0 {
552 let truncated = &content[..truncate_at];
553 return Ok(format!(
554 "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
555 truncated.trim_end(),
556 self.config.budget_ratio * 100.0,
557 truncate_at,
558 content.len()
559 ));
560 }
561 }
562 }
563 return Ok(content.to_owned());
564 }
565
566 if chunks.len() == 1 && self.config.budget_ratio < 1.0 {
569 let chunk_content = &chunks[0].content;
570 let target_len = (chunk_content.len() as f32 * self.config.budget_ratio) as usize;
571 if target_len > 0 && target_len < chunk_content.len() {
572 let truncate_at = find_safe_truncation_point(chunk_content, target_len);
573 if truncate_at < chunk_content.len() && truncate_at > 0 {
574 let truncated = &chunk_content[..truncate_at];
575 return Ok(format!(
576 "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
577 truncated.trim_end(),
578 self.config.budget_ratio * 100.0,
579 truncate_at,
580 chunk_content.len()
581 ));
582 }
583 }
584 }
585
586 let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
588 let step = chunks.len() / target_chunks.max(1);
589
590 let mut result = String::new();
591 let mut kept = 0;
592
593 for (i, chunk) in chunks.iter().enumerate() {
594 if i % step.max(1) == 0 && kept < target_chunks {
595 if !result.is_empty() {
596 result.push_str("\n\n");
597 }
598 result.push_str(&chunk.content);
599 kept += 1;
600 }
601 }
602
603 if kept < chunks.len() {
605 result.push_str(&format!(
606 "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
607 chunks.len() - kept,
608 (kept as f32 / chunks.len() as f32) * 100.0
609 ));
610 }
611
612 Ok(result)
613 }
614
615 #[cfg(feature = "embeddings")]
617 fn compress_with_embeddings(&self, content: &str) -> Result<String> {
618 let mut chunks = self.split_into_chunks(content);
619
620 if chunks.is_empty() {
621 return Ok(content.to_owned());
622 }
623
624 for chunk in &mut chunks {
626 chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
627 }
628
629 let clusters = self.cluster_chunks(&chunks)?;
631
632 let mut result = String::new();
634 for cluster in clusters.values() {
635 if let Some(representative) = self.select_representative(cluster) {
636 if !result.is_empty() {
637 result.push_str("\n\n");
638 }
639 result.push_str(&representative.content);
640 }
641 }
642
643 Ok(result)
644 }
645
646 #[cfg(feature = "embeddings")]
648 fn cluster_chunks<'a>(
649 &self,
650 chunks: &'a [CodeChunk],
651 ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
652 let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
653 let mut next_cluster = 0;
654
655 for chunk in chunks {
656 let embedding = chunk
657 .embedding
658 .as_ref()
659 .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
660
661 let mut target_cluster = None;
663 for (&cluster_id, cluster_chunks) in &clusters {
664 if let Some(first) = cluster_chunks.first() {
665 if let Some(ref first_emb) = first.embedding {
666 let similarity = cosine_similarity(embedding, first_emb);
667 if similarity >= self.config.similarity_threshold {
668 target_cluster = Some(cluster_id);
669 break;
670 }
671 }
672 }
673 }
674
675 if let Some(cluster_id) = target_cluster {
676 if let Some(cluster) = clusters.get_mut(&cluster_id) {
677 cluster.push(chunk);
678 }
679 } else {
680 clusters.insert(next_cluster, vec![chunk]);
681 next_cluster += 1;
682 }
683 }
684
685 Ok(clusters)
686 }
687
688 #[cfg(feature = "embeddings")]
690 fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
691 chunks.iter().max_by_key(|c| c.content.len()).copied()
693 }
694}
695
696impl Default for SemanticCompressor {
697 fn default() -> Self {
698 Self::new()
699 }
700}
701
702pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;
715
716pub type HeuristicCompressor = SemanticCompressor;
722
723pub type HeuristicCompressionConfig = SemanticConfig;
725
/// Chooses a byte offset at or before `target_len` where `content` can be cut.
///
/// The offset is first moved back to a character boundary. The last newline
/// before it is preferred, then the last space, but only if that position lies
/// beyond `target_len / 2`; otherwise the boundary itself is returned. When
/// `target_len` reaches past the end, the full length is returned.
fn find_safe_truncation_point(content: &str, target_len: usize) -> usize {
    if target_len >= content.len() {
        return content.len();
    }

    // Offset 0 is always a boundary, so the search cannot come up empty.
    let boundary = (0..=target_len)
        .rev()
        .find(|&offset| content.is_char_boundary(offset))
        .unwrap_or(0);

    let head = &content[..boundary];
    let earliest_acceptable = target_len / 2;

    ['\n', ' ']
        .iter()
        .filter_map(|&separator| head.rfind(separator))
        .find(|&position| position > earliest_acceptable)
        .unwrap_or(boundary)
}
763
/// Cosine of the angle between `a` and `b`.
///
/// Returns `0.0` when the slices are empty, differ in length, or either has
/// zero magnitude.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (magnitude_a, magnitude_b) = (magnitude(a), magnitude(b));
    if magnitude_a == 0.0 || magnitude_b == 0.0 {
        return 0.0;
    }

    let dot_product: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot_product / (magnitude_a * magnitude_b)
}
790
791#[cfg(test)]
796mod tests {
797 use super::*;
798
/// A freshly built analyzer has no model path.
#[test]
fn test_analyzer_creation() {
    let fresh = SemanticAnalyzer::new();
    #[cfg(feature = "embeddings")]
    assert!(fresh.model_path().is_none());
    #[cfg(not(feature = "embeddings"))]
    drop(fresh);
}
809
/// `with_model` stores the path it was given.
#[test]
fn test_analyzer_with_model() {
    let configured = SemanticAnalyzer::with_model("/path/to/model");
    #[cfg(feature = "embeddings")]
    assert_eq!(configured.model_path(), Some("/path/to/model"));
    #[cfg(not(feature = "embeddings"))]
    drop(configured);
}
818
/// The compressor exposes its analyzer by reference.
#[test]
fn test_compressor_analyzer_access() {
    let subject = SemanticCompressor::new();
    let _borrowed = subject.analyzer();
}
825
/// Default threshold and budget values.
#[test]
fn test_semantic_config_default() {
    let SemanticConfig {
        similarity_threshold,
        budget_ratio,
        ..
    } = SemanticConfig::default();
    assert_eq!(similarity_threshold, 0.7);
    assert_eq!(budget_ratio, 0.5);
}
832
/// Blank-line separated text yields at least two chunks.
#[test]
fn test_split_into_chunks() {
    let settings = SemanticConfig {
        min_chunk_size: 10,
        max_chunk_size: 1000,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let pieces =
        subject.split_into_chunks("First chunk here\n\nSecond chunk here\n\nThird chunk");
    assert!(pieces.len() >= 2);
}
845
/// Heuristic compression of non-empty input produces non-empty output.
#[test]
fn test_heuristic_compression() {
    let settings = SemanticConfig {
        min_chunk_size: 5,
        max_chunk_size: 100,
        budget_ratio: 0.5,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let input = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
    let output = subject.compress_heuristic(input).unwrap();
    assert!(!output.is_empty() || input.is_empty());
}
860
/// Empty input compresses to empty output.
#[test]
fn test_empty_content() {
    let output = SemanticCompressor::new().compress("").unwrap();
    assert_eq!(output, "");
}
867
/// Identical vectors have similarity 1.
#[test]
fn test_cosine_similarity_identical() {
    let left = vec![1.0, 0.0, 0.0];
    let right = vec![1.0, 0.0, 0.0];
    let score = cosine_similarity(&left, &right);
    assert!((score - 1.0).abs() < 0.001);
}
875
/// Orthogonal vectors have similarity 0.
#[test]
fn test_cosine_similarity_orthogonal() {
    let x_axis = vec![1.0, 0.0, 0.0];
    let y_axis = vec![0.0, 1.0, 0.0];
    let score = cosine_similarity(&x_axis, &y_axis);
    assert!(score.abs() < 0.001);
}
883
/// Empty vectors are defined to have similarity 0.
#[test]
fn test_cosine_similarity_empty() {
    let left: Vec<f32> = Vec::new();
    let right: Vec<f32> = Vec::new();
    assert_eq!(cosine_similarity(&left, &right), 0.0);
}
890
/// A word repeated 500 times shrinks to under half and is marked as compressed.
#[test]
fn test_repetitive_pattern_compression() {
    let input = "sentence ".repeat(500);
    let output = SemanticCompressor::new().compress(&input).unwrap();

    assert!(
        output.len() < input.len() / 2,
        "Compressed size {} should be less than half of original {}",
        output.len(),
        input.len()
    );
    assert!(output.contains("sentence"));
    assert!(
        output.contains("repeated") || output.contains("pattern"),
        "Should indicate compression occurred"
    );
}
914
/// A line repeated 100 times shrinks to under half.
#[test]
fn test_repetitive_line_compression() {
    let input = "same line\n".repeat(100);
    let output = SemanticCompressor::new().compress(&input).unwrap();

    assert!(
        output.len() < input.len() / 2,
        "Compressed size {} should be less than half of original {}",
        output.len(),
        input.len()
    );
}
930
/// With a full budget, short unique text passes through unchanged.
#[test]
fn test_non_repetitive_content_unchanged() {
    let settings = SemanticConfig {
        budget_ratio: 1.0,
        ..Default::default()
    };
    let input = "This is some unique content that does not repeat.";

    let output = SemanticCompressor::with_config(settings)
        .compress(input)
        .unwrap();
    assert_eq!(output, input);
}
945
/// A cycling five-item pattern still yields non-empty output.
#[test]
fn test_repetitive_with_variation() {
    let settings = SemanticConfig {
        budget_ratio: 0.3,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let input: String = (0..50).map(|i| format!("item {} ", i % 5)).collect();

    let output = subject.compress(&input).unwrap();
    assert!(!output.is_empty());
}
964
/// Repeated Chinese text compresses without breaking UTF-8.
#[test]
fn test_repetitive_unicode_chinese() {
    let input = "中文测试 ".repeat(100);
    let output = SemanticCompressor::new().compress(&input).unwrap();

    assert!(std::str::from_utf8(output.as_bytes()).is_ok());
    assert!(!output.is_empty() || input.is_empty());
}
980
/// Repeated emoji compress without breaking UTF-8.
#[test]
fn test_repetitive_unicode_emoji() {
    let input = "🎉🎊🎁 ".repeat(80);
    let output = SemanticCompressor::new().compress(&input).unwrap();

    assert!(std::str::from_utf8(output.as_bytes()).is_ok());
    assert!(!output.is_empty() || input.is_empty());
}
991
/// Mixed-width characters compress without breaking UTF-8.
#[test]
fn test_repetitive_unicode_mixed() {
    let input = "a中🎉 ".repeat(60);
    let output = SemanticCompressor::new().compress(&input).unwrap();

    assert!(std::str::from_utf8(output.as_bytes()).is_ok());
    assert!(!output.is_empty() || input.is_empty());
}
1002
/// Repeated Cyrillic text compresses to valid UTF-8.
#[test]
fn test_repetitive_unicode_cyrillic() {
    let input = "Привет ".repeat(50);
    let output = SemanticCompressor::new().compress(&input).unwrap();
    assert!(std::str::from_utf8(output.as_bytes()).is_ok());
}
1012
/// Multi-byte text without separators compresses to valid UTF-8.
#[test]
fn test_non_repetitive_unicode_boundary() {
    let input = "世界和平".repeat(60);
    let output = SemanticCompressor::new().compress(&input).unwrap();
    assert!(std::str::from_utf8(output.as_bytes()).is_ok());
}
1025
/// Repeated multi-byte lines compress to valid UTF-8.
#[test]
fn test_repetitive_unicode_line_based() {
    let input = "中文行\n".repeat(100);
    let output = SemanticCompressor::new().compress(&input).unwrap();
    assert!(std::str::from_utf8(output.as_bytes()).is_ok());
}
1035
/// Each error variant renders its message prefix (and payload where present).
#[test]
fn test_semantic_error_display() {
    let model_msg = SemanticError::ModelLoadError(String::from("test error")).to_string();
    assert!(model_msg.contains("Model loading failed"));
    assert!(model_msg.contains("test error"));

    let embed_msg = SemanticError::EmbeddingError(String::from("embed fail")).to_string();
    assert!(embed_msg.contains("Embedding generation failed"));

    let cluster_msg = SemanticError::ClusteringError(String::from("cluster fail")).to_string();
    assert!(cluster_msg.contains("Clustering failed"));

    let feature_msg = SemanticError::FeatureNotEnabled.to_string();
    assert!(feature_msg.contains("embeddings feature not enabled"));
}
1055
/// Debug output names the variant.
#[test]
fn test_semantic_error_debug() {
    let failure = SemanticError::ModelLoadError(String::from("debug test"));
    let rendered = format!("{:?}", failure);
    assert!(rendered.contains("ModelLoadError"));
}
1062
/// A default analyzer can embed text.
#[test]
fn test_semantic_analyzer_default() {
    let outcome = SemanticAnalyzer::default().embed("test");
    assert!(outcome.is_ok());
}
1070
/// Debug output names the analyzer type.
#[test]
fn test_semantic_analyzer_debug() {
    let rendered = format!("{:?}", SemanticAnalyzer::new());
    assert!(rendered.contains("SemanticAnalyzer"));
}
1077
/// Embedding empty text still yields 384 dimensions.
#[test]
fn test_semantic_analyzer_embed_empty() {
    let vector = SemanticAnalyzer::new().embed("").unwrap();
    assert_eq!(vector.len(), 384);
}
1084
/// Embedding ordinary text yields 384 dimensions.
#[test]
fn test_semantic_analyzer_embed_produces_384_dims() {
    let vector = SemanticAnalyzer::new().embed("some code content").unwrap();
    assert_eq!(vector.len(), 384);
}
1091
/// Identical text scores ~1 with embeddings enabled and 0 without.
#[test]
fn test_semantic_analyzer_similarity_same_content() {
    let score = SemanticAnalyzer::new()
        .similarity("hello world", "hello world")
        .unwrap();
    #[cfg(feature = "embeddings")]
    assert!((score - 1.0).abs() < 0.01);
    #[cfg(not(feature = "embeddings"))]
    assert_eq!(score, 0.0);
}
1102
/// Different text scores within [-1, 1] with embeddings enabled and 0 without.
#[test]
fn test_semantic_analyzer_similarity_different_content() {
    let score = SemanticAnalyzer::new()
        .similarity("hello", "goodbye")
        .unwrap();
    #[cfg(not(feature = "embeddings"))]
    assert_eq!(score, 0.0);
    #[cfg(feature = "embeddings")]
    assert!((-1.0..=1.0).contains(&score));
}
1113
/// Explicitly set fields are stored as given.
#[test]
fn test_semantic_config_custom() {
    let custom = SemanticConfig {
        similarity_threshold: 0.9,
        min_chunk_size: 50,
        max_chunk_size: 5000,
        budget_ratio: 0.3,
    };
    assert_eq!(custom.similarity_threshold, 0.9);
    assert_eq!(custom.min_chunk_size, 50);
    assert_eq!(custom.max_chunk_size, 5000);
    assert_eq!(custom.budget_ratio, 0.3);
}
1127
/// A cloned config carries the same values.
#[test]
fn test_semantic_config_clone() {
    let original = SemanticConfig::default();
    let copy = original.clone();
    assert_eq!(copy.similarity_threshold, original.similarity_threshold);
    assert_eq!(copy.budget_ratio, original.budget_ratio);
}
1135
/// Debug output names the type and its fields.
#[test]
fn test_semantic_config_debug() {
    let rendered = format!("{:?}", SemanticConfig::default());
    assert!(rendered.contains("SemanticConfig"));
    assert!(rendered.contains("similarity_threshold"));
}
1143
/// Debug output names the type and shows the content.
#[test]
fn test_code_chunk_debug() {
    let sample = CodeChunk {
        content: String::from("test content"),
        start: 0,
        end: 12,
        embedding: None,
        cluster_id: None,
    };
    let rendered = format!("{:?}", sample);
    assert!(rendered.contains("CodeChunk"));
    assert!(rendered.contains("test content"));
}
1157
/// Cloning a chunk copies every field.
#[test]
fn test_code_chunk_clone() {
    let original = CodeChunk {
        content: String::from("original"),
        start: 0,
        end: 8,
        embedding: Some(vec![0.1, 0.2, 0.3]),
        cluster_id: Some(5),
    };
    let copy = original.clone();
    assert_eq!(copy.content, "original");
    assert_eq!(copy.start, 0);
    assert_eq!(copy.end, 8);
    assert_eq!(copy.embedding, Some(vec![0.1, 0.2, 0.3]));
    assert_eq!(copy.cluster_id, Some(5));
}
1174
/// A default compressor leaves very short text untouched.
#[test]
fn test_semantic_compressor_default() {
    let output = SemanticCompressor::default().compress("test").unwrap();
    assert_eq!(output, "test");
}
1181
/// Text with only single newlines still yields chunks.
#[test]
fn test_split_into_chunks_single_newline_fallback() {
    let settings = SemanticConfig {
        min_chunk_size: 5,
        max_chunk_size: 1000,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let input = "Line 1 with content\nLine 2 with content\nLine 3 with content";
    let pieces = subject.split_into_chunks(input);
    assert!(!pieces.is_empty() || input.len() < 5);
}
1196
/// Text with only sentence breaks still yields chunks.
#[test]
fn test_split_into_chunks_sentence_fallback() {
    let settings = SemanticConfig {
        min_chunk_size: 10,
        max_chunk_size: 1000,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let input = "First sentence here. Second sentence here. Third sentence here.";
    let pieces = subject.split_into_chunks(input);
    assert!(!pieces.is_empty() || input.len() < 10);
}
1211
/// Separator-free text longer than the maximum is cut into several chunks.
#[test]
fn test_split_into_chunks_force_split() {
    let settings = SemanticConfig {
        min_chunk_size: 100,
        max_chunk_size: 20,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let pieces =
        subject.split_into_chunks("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ");
    assert!(
        pieces.len() >= 2,
        "Expected at least 2 chunks from force split, got {}",
        pieces.len()
    );
}
1231
/// Empty input yields no chunks.
#[test]
fn test_split_into_chunks_empty() {
    let pieces = SemanticCompressor::new().split_into_chunks("");
    assert!(pieces.is_empty());
}
1238
/// Input shorter than the minimum chunk size yields no chunks.
#[test]
fn test_split_into_chunks_below_min_size() {
    let settings = SemanticConfig {
        min_chunk_size: 100,
        max_chunk_size: 1000,
        ..Default::default()
    };
    let pieces = SemanticCompressor::with_config(settings).split_into_chunks("short");
    assert!(pieces.is_empty());
}
1252
/// With no chunks and a full budget the input is returned unchanged.
#[test]
fn test_compress_heuristic_empty_chunks() {
    let settings = SemanticConfig {
        min_chunk_size: 1000,
        budget_ratio: 1.0,
        ..Default::default()
    };
    let input = "short content";

    let output = SemanticCompressor::with_config(settings)
        .compress_heuristic(input)
        .unwrap();
    assert_eq!(output, input);
}
1266
/// Several chunks under a 30% budget still produce recognisable output.
#[test]
fn test_compress_heuristic_multiple_chunks() {
    let settings = SemanticConfig {
        min_chunk_size: 10,
        max_chunk_size: 100,
        budget_ratio: 0.3,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let input = "First chunk content here\n\nSecond chunk content here\n\nThird chunk content here\n\nFourth chunk content";
    let output = subject.compress_heuristic(input).unwrap();
    assert!(output.contains("chunk") || output.contains("compressed"));
}
1281
/// Vectors of different lengths score 0.
#[test]
fn test_cosine_similarity_different_lengths() {
    let longer = vec![1.0, 2.0, 3.0];
    let shorter = vec![1.0, 2.0];
    let score = cosine_similarity(&longer, &shorter);
    assert_eq!(score, 0.0);
}
1289
/// A zero vector scores 0 against anything.
#[test]
fn test_cosine_similarity_zero_vectors() {
    let zeros = vec![0.0, 0.0, 0.0];
    let other = vec![1.0, 2.0, 3.0];
    let score = cosine_similarity(&zeros, &other);
    assert_eq!(score, 0.0);
}
1297
/// Opposite vectors score -1.
#[test]
fn test_cosine_similarity_opposite() {
    let forward = vec![1.0, 0.0, 0.0];
    let backward = vec![-1.0, 0.0, 0.0];
    let score = cosine_similarity(&forward, &backward);
    assert!((score + 1.0).abs() < 0.001);
}
1305
/// Identical unit-length vectors score 1.
#[test]
fn test_cosine_similarity_normalized() {
    let left = vec![0.6, 0.8, 0.0];
    let right = vec![0.6, 0.8, 0.0];
    let score = cosine_similarity(&left, &right);
    assert!((score - 1.0).abs() < 0.001);
}
1313
/// Input under 200 bytes is never treated as repetitive.
#[test]
fn test_compress_repetitive_short_content() {
    let input = "short ".repeat(10);
    let outcome = SemanticCompressor::new().compress_repetitive(&input);
    assert!(outcome.is_none());
}
1322
/// Whitespace-only input is not compressed as a repetition.
#[test]
fn test_compress_repetitive_whitespace_only() {
    let input = " ".repeat(100);
    let outcome = SemanticCompressor::new().compress_repetitive(&input);
    assert!(outcome.is_none());
}
1332
/// A short repeated prefix followed by a long tail is not compressed.
#[test]
fn test_compress_repetitive_low_coverage() {
    let input = format!("{}{}", "pattern ".repeat(5), "x".repeat(200));
    let outcome = SemanticCompressor::new().compress_repetitive(&input);
    assert!(outcome.is_none());
}
1343
/// Twenty distinct lines are not compressed as a repetition.
#[test]
fn test_compress_repetitive_line_low_ratio() {
    let distinct: Vec<String> = (0..20).map(|i| format!("unique line {}", i)).collect();
    let input = distinct.join("\n");

    let outcome = SemanticCompressor::new().compress_repetitive(&input);
    assert!(outcome.is_none());
}
1356
/// Alternating lines still compress to something non-empty.
#[test]
fn test_compress_repetitive_mixed_with_unique() {
    let alternating: Vec<&str> = (0..50)
        .map(|i| {
            if i % 2 == 0 {
                "repeated line"
            } else {
                "unique line"
            }
        })
        .collect();
    let input = alternating.join("\n");

    let output = SemanticCompressor::new().compress(&input).unwrap();
    assert!(!output.is_empty());
}
1374
/// Smoke test: the repetition pass runs on a five-times repeated sentence
/// without panicking; the result itself is not inspected.
#[test]
fn test_compress_no_repetition_returns_none() {
    let input = "The quick brown fox jumps over the lazy dog. ".repeat(5);
    let outcome = SemanticCompressor::new().compress_repetitive(&input);
    drop(outcome);
}
1386
/// The alias names resolve to the primary types.
#[test]
fn test_type_aliases() {
    let _aliased_analyzer: CharacterFrequencyAnalyzer = SemanticAnalyzer::new();
    let _aliased_compressor: HeuristicCompressor = SemanticCompressor::new();
    let _aliased_config: HeuristicCompressionConfig = SemanticConfig::default();
}
1394
/// With a full budget at least one of the two definitions survives.
#[test]
fn test_compress_preserves_content_structure() {
    let settings = SemanticConfig {
        min_chunk_size: 10,
        max_chunk_size: 500,
        budget_ratio: 1.0,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let output = subject
        .compress("def foo():\n pass\n\ndef bar():\n pass")
        .unwrap();
    assert!(output.contains("foo") || output.contains("bar"));
}
1409
/// No emitted chunk is longer than the configured maximum.
#[test]
fn test_split_chunks_respects_max_size() {
    let settings = SemanticConfig {
        min_chunk_size: 5,
        max_chunk_size: 50,
        ..Default::default()
    };
    let subject = SemanticCompressor::with_config(settings);

    let input = "A very long chunk that exceeds the max size limit\n\nAnother chunk";
    for piece in subject.split_into_chunks(input).iter() {
        assert!(piece.content.len() <= 50, "Chunk size {} exceeds max 50", piece.content.len());
    }
}
1425
/// Compresses one hundred copies of "abc " followed by a trailing "xyz"
/// and checks that the output is not empty.
#[test]
fn test_compress_repetitive_with_remainder() {
    let compressor = SemanticCompressor::new();

    // Repeated body plus a tail that does not fit the repeating pattern.
    let content = format!("{}xyz", "abc ".repeat(100));

    let compressed = compressor.compress(&content).unwrap();
    assert!(!compressed.is_empty());
}
1437
/// The analyzer handed out by `SemanticCompressor::analyzer` must be able
/// to embed a short string without returning an error.
#[test]
fn test_compressor_analyzer_method() {
    let compressor = SemanticCompressor::new();
    assert!(compressor.analyzer().embed("test code").is_ok());
}
1447
/// Builds a `CodeChunk` with every field populated and reads each field
/// back, including the length of the 384-element embedding.
#[test]
fn test_code_chunk_with_embedding_and_cluster() {
    let source = "fn main() {}";
    let chunk = CodeChunk {
        content: source.to_owned(),
        start: 0,
        end: 12,
        embedding: Some(vec![0.5; 384]),
        cluster_id: Some(3),
    };

    assert_eq!(chunk.content, source);
    assert_eq!(chunk.start, 0);
    assert_eq!(chunk.end, 12);
    // A missing embedding maps to `None`, which also fails this comparison.
    assert_eq!(chunk.embedding.as_ref().map(Vec::len), Some(384));
    assert_eq!(chunk.cluster_id, Some(3));
}
1465
/// A thousand copies of one token, compressed with `budget_ratio` 0.2, must
/// shrink to under a third of the input length and still contain "repeated".
#[test]
fn test_compress_very_long_repetitive() {
    let config = SemanticConfig { budget_ratio: 0.2, ..Default::default() };
    let compressor = SemanticCompressor::with_config(config);

    let input = "repeated_token ".repeat(1000);
    let output = compressor.compress(&input).unwrap();

    let ceiling = input.len() / 3;
    assert!(output.len() < ceiling);
    assert!(output.contains("repeated"));
}
1481
/// The module-level `Result` alias can hold an `Ok` value, and that value
/// can be read back.
#[test]
fn test_semantic_result_type_ok() {
    let outcome: Result<String> = Ok(String::from("success"));
    assert!(outcome.is_ok());
    assert_eq!(outcome.ok().as_deref(), Some("success"));
}
1488
/// The module-level `Result` alias can hold a `SemanticError` in its `Err`
/// variant.
#[test]
fn test_semantic_result_type_err() {
    let outcome: Result<String> = Err(SemanticError::FeatureNotEnabled);
    assert!(matches!(outcome, Err(SemanticError::FeatureNotEnabled)));
}
1494
/// For plain ASCII input and a target of 15, the returned offset must be a
/// char boundary and either not exceed the target or equal the full length.
#[test]
fn test_find_safe_truncation_point_basic() {
    let text = "Hello world this is a test";
    let cut = find_safe_truncation_point(text, 15);

    assert!(text.is_char_boundary(cut));

    let within_target = cut <= 15;
    let at_end = cut == text.len();
    assert!(within_target || at_end);
}
1504
/// For multi-line input and a target of 20, the returned offset must fall
/// on a char boundary of the input.
#[test]
fn test_find_safe_truncation_point_newline() {
    let text = "Line one\nLine two\nLine three";
    let cut = find_safe_truncation_point(text, 20);
    assert!(text.is_char_boundary(cut));
}
1512
/// Byte offset 10 lies inside the second three-byte CJK character of the
/// input, so the returned offset must be moved to a valid char boundary.
#[test]
fn test_find_safe_truncation_point_unicode() {
    let text = "Hello 世界 test";
    let cut = find_safe_truncation_point(text, 10);
    assert!(text.is_char_boundary(cut));
}
1520
/// A target larger than the input must yield the full input length.
#[test]
fn test_find_safe_truncation_point_beyond_length() {
    let text = "short";
    let cut = find_safe_truncation_point(text, 100);
    assert_eq!(cut, text.len());
}
1527
/// Compresses the same twenty-paragraph input with `budget_ratio` 0.3 and
/// 0.8. The 0.3 output must be strictly shorter than the 0.8 output, and
/// must either contain "compressed" or be shorter than the input.
#[test]
fn test_budget_ratio_affects_large_content() {
    let paragraphs: Vec<String> = (0..20)
        .map(|i| {
            format!("This is paragraph number {} with some content to fill it out nicely.", i)
        })
        .collect();
    let content = paragraphs.join("\n\n");

    // Both compressors share every setting except the budget ratio.
    let compressor_for = |budget_ratio| {
        SemanticCompressor::with_config(SemanticConfig {
            budget_ratio,
            min_chunk_size: 20,
            max_chunk_size: 2000,
            ..Default::default()
        })
    };
    let compressor_30 = compressor_for(0.3);
    let compressor_80 = compressor_for(0.8);

    let result_30 = compressor_30.compress(&content).unwrap();
    let result_80 = compressor_80.compress(&content).unwrap();

    let (len_30, len_80) = (result_30.len(), result_80.len());
    assert!(
        len_30 < len_80,
        "30% budget ({}) should be smaller than 80% budget ({})",
        len_30,
        len_80
    );

    let has_marker = result_30.contains("compressed");
    let is_shorter = len_30 < content.len();
    assert!(has_marker || is_shorter, "30% should show compression indicator");
}
1571
/// With `budget_ratio` 1.0 and otherwise default settings, a single-line
/// input must come back unchanged.
#[test]
fn test_budget_ratio_one_returns_original() {
    let config = SemanticConfig { budget_ratio: 1.0, ..Default::default() };
    let compressor = SemanticCompressor::with_config(config);

    let content = "Some content without chunk boundaries";
    let output = compressor.compress(content).unwrap();

    assert_eq!(output, content);
}
1585
/// An input shorter than `min_chunk_size` (100), compressed with
/// `budget_ratio` 0.3, must either shrink or carry a "truncated" marker.
#[test]
fn test_budget_ratio_affects_small_content() {
    let config = SemanticConfig {
        budget_ratio: 0.3,
        min_chunk_size: 100,
        max_chunk_size: 2000,
        ..Default::default()
    };
    let compressor = SemanticCompressor::with_config(config);

    let content = "This is a short test string that should be affected by budget ratio.";
    let output = compressor.compress(content).unwrap();

    let is_shorter = output.len() < content.len();
    let is_marked = output.contains("truncated");
    assert!(
        is_shorter || is_marked,
        "Small content with budget_ratio=0.3 should be compressed. Original: {}, Result: {}",
        content.len(),
        output.len()
    );
}
1616
/// An input shorter than `min_chunk_size` (100), compressed with
/// `budget_ratio` 1.0, must come back unchanged.
#[test]
fn test_budget_ratio_one_preserves_small_content() {
    let config = SemanticConfig {
        budget_ratio: 1.0,
        min_chunk_size: 100,
        max_chunk_size: 2000,
        ..Default::default()
    };
    let compressor = SemanticCompressor::with_config(config);

    let content = "Short content that should remain unchanged with budget_ratio=1.0";
    let output = compressor.compress(content).unwrap();

    assert_eq!(output, content, "budget_ratio=1.0 should preserve content");
}
1634
/// A four-character input must come back unchanged even when
/// `budget_ratio` is as low as 0.1.
#[test]
fn test_very_short_content_unchanged() {
    let config = SemanticConfig { budget_ratio: 0.1, ..Default::default() };
    let compressor = SemanticCompressor::with_config(config);

    let content = "tiny";
    let output = compressor.compress(content).unwrap();

    assert_eq!(output, content, "Very short content should be unchanged");
}
1650
/// A single-paragraph input shorter than `min_chunk_size` (200),
/// compressed with `budget_ratio` 0.5, must come back strictly shorter.
#[test]
fn test_budget_ratio_medium_no_chunks() {
    let config = SemanticConfig {
        budget_ratio: 0.5,
        min_chunk_size: 200,
        max_chunk_size: 2000,
        ..Default::default()
    };
    let compressor = SemanticCompressor::with_config(config);

    let content = "This is a medium length test content that has no paragraph breaks and should trigger the budget ratio truncation path because there are no chunk boundaries.";
    let output = compressor.compress(content).unwrap();

    let (original_len, output_len) = (content.len(), output.len());
    assert!(
        output_len < original_len,
        "Medium content with budget_ratio=0.5 should be compressed. Original: {}, Result: {}",
        original_len,
        output_len
    );
}
1674
/// If the compressed output contains the word "truncated", it must also
/// contain a "%" sign and the word "chars". Output without that word is
/// accepted as-is.
#[test]
fn test_truncation_marker_format() {
    let config = SemanticConfig {
        budget_ratio: 0.3,
        min_chunk_size: 200,
        max_chunk_size: 2000,
        ..Default::default()
    };
    let compressor = SemanticCompressor::with_config(config);

    let content = "A sufficiently long piece of content that will definitely be truncated when we set a low budget ratio.";
    let output = compressor.compress(content).unwrap();

    // Without a truncation marker there is nothing to verify.
    if !output.contains("truncated") {
        return;
    }

    assert!(output.contains("%"), "Truncation marker should include percentage");
    assert!(output.contains("chars"), "Truncation marker should include char count");
}
1695
/// Compresses one input with `budget_ratio` 0.2, 0.5 and 0.8 (all with
/// `min_chunk_size` 200) and checks that the output length never decreases
/// as the ratio grows.
#[test]
fn test_budget_ratio_proportional() {
    let content = "This content is long enough to test different budget ratio values and see that they produce outputs of proportionally different sizes as expected.";

    // The three compressors differ only in their budget ratio.
    let compressor_for = |budget_ratio| {
        SemanticCompressor::with_config(SemanticConfig {
            budget_ratio,
            min_chunk_size: 200,
            ..Default::default()
        })
    };
    let compressor_20 = compressor_for(0.2);
    let compressor_50 = compressor_for(0.5);
    let compressor_80 = compressor_for(0.8);

    let len_20 = compressor_20.compress(content).unwrap().len();
    let len_50 = compressor_50.compress(content).unwrap().len();
    let len_80 = compressor_80.compress(content).unwrap().len();

    assert!(len_20 <= len_50, "20% ratio ({}) should be <= 50% ratio ({})", len_20, len_50);
    assert!(len_50 <= len_80, "50% ratio ({}) should be <= 80% ratio ({})", len_50, len_80);
}
1737}