use std::collections::HashMap;

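/// Convenience result type used throughout the semantic compression module.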
pub type Result<T> = std::result::Result<T, SemanticError>;

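/// Errors that can occur during semantic analysis and compression.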
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}

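/// Analyzer that turns text into fixed-size embedding vectors.
///
/// With the `embeddings` feature enabled it produces a simple, deterministic
/// character-frequency embedding; without it, all operations return neutral
/// values (zero vectors, zero similarity).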
#[derive(Debug)]
pub struct SemanticAnalyzer {
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}

impl SemanticAnalyzer {
    pub fn new() -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: None,
            #[cfg(not(feature = "embeddings"))]
            _model_path: None,
        }
    }

    pub fn with_model(model_path: &str) -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: Some(model_path.to_owned()),
            #[cfg(not(feature = "embeddings"))]
            _model_path: Some(model_path.to_owned()),
        }
    }

    #[cfg(feature = "embeddings")]
    pub fn model_path(&self) -> Option<&str> {
        self.model_path.as_deref()
    }

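    /// Generates a 384-dimensional embedding for `content`.
    ///
    /// This is a deterministic, position-weighted character-frequency
    /// histogram normalized to unit length, not a learned model embedding.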
    #[cfg(feature = "embeddings")]
    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
        let mut embedding = vec![0.0f32; 384];
        for (i, c) in content.chars().enumerate() {
            let idx = (c as usize) % 384;
            embedding[idx] += 1.0 / ((i + 1) as f32);
        }
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in &mut embedding {
                *x /= norm;
            }
        }
        Ok(embedding)
    }

    #[cfg(not(feature = "embeddings"))]
    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
        Ok(vec![0.0; 384])
    }

    #[cfg(feature = "embeddings")]
    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
        let emb_a = self.embed(a)?;
        let emb_b = self.embed(b)?;
        Ok(cosine_similarity(&emb_a, &emb_b))
    }

    #[cfg(not(feature = "embeddings"))]
    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
        Ok(0.0)
    }
}

impl Default for SemanticAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

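/// Tuning parameters for [`SemanticCompressor`].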
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Minimum cosine similarity for two chunks to share a cluster.
    pub similarity_threshold: f32,
    /// Smallest chunk (in bytes) emitted when splitting content.
    pub min_chunk_size: usize,
    /// Largest chunk (in bytes) kept when splitting content.
    pub max_chunk_size: usize,
    /// Fraction of the original content to keep when compressing.
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}

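/// A contiguous slice of the input, tracked with its byte offsets and any
/// analysis metadata attached during compression.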
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// The chunk's text.
    pub content: String,
    /// Byte offset where the chunk starts in the original input.
    pub start: usize,
    /// Byte offset where the chunk ends in the original input.
    pub end: usize,
    /// Embedding vector for the chunk, if one has been computed.
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment for the chunk, if one has been made.
    pub cluster_id: Option<usize>,
}

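/// Compresses text by collapsing obvious repetition and then selecting
/// representative chunks, either heuristically or (with the `embeddings`
/// feature) by clustering semantically similar chunks.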
pub struct SemanticCompressor {
    config: SemanticConfig,
    analyzer: SemanticAnalyzer,
}

impl SemanticCompressor {
    /// Creates a compressor with the default [`SemanticConfig`].
    pub fn new() -> Self {
        Self::with_config(SemanticConfig::default())
    }

    /// Creates a compressor with a custom configuration.
    pub fn with_config(config: SemanticConfig) -> Self {
        Self { config, analyzer: SemanticAnalyzer::new() }
    }

    /// Returns the analyzer used for embedding and similarity computations.
    pub fn analyzer(&self) -> &SemanticAnalyzer {
        &self.analyzer
    }

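    /// Compresses `content`, first collapsing repeated patterns or lines and
    /// otherwise falling back to chunk selection (embedding-based clustering
    /// when the `embeddings` feature is enabled, heuristic sampling otherwise).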
    pub fn compress(&self, content: &str) -> Result<String> {
        if let Some(compressed) = self.compress_repetitive(content) {
            return Ok(compressed);
        }

        #[cfg(feature = "embeddings")]
        {
            self.compress_with_embeddings(content)
        }

        #[cfg(not(feature = "embeddings"))]
        {
            self.compress_heuristic(content)
        }
    }

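    /// Detects highly repetitive content (a repeated leading pattern or a
    /// dominant repeated line) and collapses it with an explanatory marker.
    /// Returns `None` when the content is not repetitive enough to compress.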
    fn compress_repetitive(&self, content: &str) -> Option<String> {
        if content.len() < 200 {
            return None;
        }

        // Phase 1: look for a short pattern repeated from the start of the content.
        for pattern_len in 1..=100.min(content.len() / 3) {
            if !content.is_char_boundary(pattern_len) {
                continue;
            }

            let pattern = &content[..pattern_len];

            if pattern.chars().all(|c| c.is_whitespace()) {
                continue;
            }

            // Count how many times the pattern repeats consecutively from the start.
            let mut count = 0;
            let mut pos = 0;
            while pos + pattern_len <= content.len() {
                if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
                    break;
                }
                if &content[pos..pos + pattern_len] == pattern {
                    count += 1;
                    pos += pattern_len;
                } else {
                    break;
                }
            }

            let coverage = (count * pattern_len) as f32 / content.len() as f32;
            if count >= 3 && coverage >= 0.8 {
                let instances_to_show = (count as f32 * self.config.budget_ratio)
                    .ceil()
                    .clamp(1.0, 5.0) as usize;

                let shown_content = pattern.repeat(instances_to_show);
                let remainder_start = count * pattern_len;
                let remainder = if remainder_start <= content.len()
                    && content.is_char_boundary(remainder_start)
                {
                    &content[remainder_start..]
                } else {
                    ""
                };

                let result = if remainder.is_empty() {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
                        shown_content.trim_end(),
                        count,
                        instances_to_show
                    )
                } else {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
                        shown_content.trim_end(),
                        count,
                        instances_to_show,
                        remainder.trim()
                    )
                };

                return Some(result);
            }
        }

        // Phase 2: look for a single non-empty line that dominates the content.
        let lines: Vec<&str> = content.lines().collect();
        if lines.len() >= 3 {
            let mut line_counts: HashMap<&str, usize> = HashMap::new();
            for line in &lines {
                *line_counts.entry(*line).or_insert(0) += 1;
            }

            if let Some((repeated_line, count)) = line_counts
                .iter()
                .filter(|(line, _)| !line.trim().is_empty())
                .max_by_key(|(_, count)| *count)
            {
                let repetition_ratio = *count as f32 / lines.len() as f32;
                if *count >= 3 && repetition_ratio >= 0.5 {
                    // Collapse consecutive runs of the repeated line into a single
                    // instance followed by a marker.
                    let mut result = String::new();
                    let mut consecutive_count = 0;
                    let mut last_was_repeated = false;

                    for line in &lines {
                        if *line == *repeated_line {
                            consecutive_count += 1;
                            if !last_was_repeated {
                                if !result.is_empty() {
                                    result.push('\n');
                                }
                                result.push_str(line);
                            }
                            last_was_repeated = true;
                        } else {
                            if last_was_repeated && consecutive_count > 1 {
                                result.push_str(&format!(
                                    "\n/* ... above line repeated {} times ... */",
                                    consecutive_count
                                ));
                            }
                            consecutive_count = 0;
                            last_was_repeated = false;
                            if !result.is_empty() {
                                result.push('\n');
                            }
                            result.push_str(line);
                        }
                    }

                    if last_was_repeated && consecutive_count > 1 {
                        result.push_str(&format!(
                            "\n/* ... above line repeated {} times ... */",
                            consecutive_count
                        ));
                    }

                    // Only use the line-collapsed form if it is substantially smaller.
                    if result.len() < content.len() / 2 {
                        return Some(result);
                    }
                }
            }
        }

        None
    }

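    /// Splits content into chunks, trying paragraph breaks first, then single
    /// newlines, then sentence boundaries, and finally fixed-size slices when
    /// nothing else produces chunks within the configured size limits.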
    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
        let mut chunks = Vec::new();
        let mut current_start = 0;

        // First pass: split on paragraph breaks (blank lines).
        for (i, _) in content.match_indices("\n\n") {
            if i > current_start && i - current_start >= self.config.min_chunk_size {
                let chunk_content = &content[current_start..i];
                if chunk_content.len() <= self.config.max_chunk_size {
                    chunks.push(CodeChunk {
                        content: chunk_content.to_owned(),
                        start: current_start,
                        end: i,
                        embedding: None,
                        cluster_id: None,
                    });
                }
                current_start = i + 2;
            }
        }

        if current_start < content.len() {
            let remaining = &content[current_start..];
            if remaining.len() >= self.config.min_chunk_size {
                chunks.push(CodeChunk {
                    content: remaining.to_owned(),
                    start: current_start,
                    end: content.len(),
                    embedding: None,
                    cluster_id: None,
                });
            }
        }

        // Fallback: split on single newlines.
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices('\n') {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 1;
                }
            }
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Fallback: split on sentence boundaries.
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices(". ") {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..=i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i + 1,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 2;
                }
            }
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Last resort: force fixed-size slices for oversized, unsplittable content.
        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
            let mut pos = 0;
            while pos < content.len() {
                let end = (pos + self.config.max_chunk_size).min(content.len());
                chunks.push(CodeChunk {
                    content: content[pos..end].to_owned(),
                    start: pos,
                    end,
                    embedding: None,
                    cluster_id: None,
                });
                pos = end;
            }
        }

        chunks
    }

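    /// Compresses content without embeddings by keeping an evenly spaced
    /// subset of chunks sized to `budget_ratio`, or truncating at a safe
    /// boundary when the content cannot be chunked.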
    fn compress_heuristic(&self, content: &str) -> Result<String> {
        let chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            // No chunk boundaries: fall back to simple truncation at a safe point.
            if self.config.budget_ratio < 1.0 && content.len() >= 10 {
                let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
                if target_len > 0 && target_len < content.len() {
                    let truncate_at = find_safe_truncation_point(content, target_len);
                    if truncate_at < content.len() && truncate_at > 0 {
                        let truncated = &content[..truncate_at];
                        return Ok(format!(
                            "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
                            truncated.trim_end(),
                            self.config.budget_ratio * 100.0,
                            truncate_at,
                            content.len()
                        ));
                    }
                }
            }
            return Ok(content.to_owned());
        }

        if chunks.len() == 1 && self.config.budget_ratio < 1.0 {
            // A single chunk is truncated rather than sampled.
            let chunk_content = &chunks[0].content;
            let target_len = (chunk_content.len() as f32 * self.config.budget_ratio) as usize;
            if target_len > 0 && target_len < chunk_content.len() {
                let truncate_at = find_safe_truncation_point(chunk_content, target_len);
                if truncate_at < chunk_content.len() && truncate_at > 0 {
                    let truncated = &chunk_content[..truncate_at];
                    return Ok(format!(
                        "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
                        truncated.trim_end(),
                        self.config.budget_ratio * 100.0,
                        truncate_at,
                        chunk_content.len()
                    ));
                }
            }
        }

        // Keep an evenly spaced subset of chunks that fits the budget.
        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
        let step = chunks.len() / target_chunks.max(1);

        let mut result = String::new();
        let mut kept = 0;

        for (i, chunk) in chunks.iter().enumerate() {
            if i % step.max(1) == 0 && kept < target_chunks {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&chunk.content);
                kept += 1;
            }
        }

        if kept < chunks.len() {
            result.push_str(&format!(
                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
                chunks.len() - kept,
                (kept as f32 / chunks.len() as f32) * 100.0
            ));
        }

        Ok(result)
    }

    /// Compresses content by clustering chunk embeddings and keeping one
    /// representative chunk per cluster.
    #[cfg(feature = "embeddings")]
    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
        let mut chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        for chunk in &mut chunks {
            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
        }

        let clusters = self.cluster_chunks(&chunks)?;

        let mut result = String::new();
        for cluster in clusters.values() {
            if let Some(representative) = self.select_representative(cluster) {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&representative.content);
            }
        }

        Ok(result)
    }

    /// Greedily assigns each chunk to the first cluster whose seed chunk is at
    /// least `similarity_threshold` similar, creating a new cluster otherwise.
    #[cfg(feature = "embeddings")]
    fn cluster_chunks<'a>(
        &self,
        chunks: &'a [CodeChunk],
    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
        let mut next_cluster = 0;

        for chunk in chunks {
            let embedding = chunk
                .embedding
                .as_ref()
                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;

            let mut target_cluster = None;
            for (&cluster_id, cluster_chunks) in &clusters {
                if let Some(first) = cluster_chunks.first() {
                    if let Some(ref first_emb) = first.embedding {
                        let similarity = cosine_similarity(embedding, first_emb);
                        if similarity >= self.config.similarity_threshold {
                            target_cluster = Some(cluster_id);
                            break;
                        }
                    }
                }
            }

            if let Some(cluster_id) = target_cluster {
                if let Some(cluster) = clusters.get_mut(&cluster_id) {
                    cluster.push(chunk);
                }
            } else {
                clusters.insert(next_cluster, vec![chunk]);
                next_cluster += 1;
            }
        }

        Ok(clusters)
    }

    /// Picks the longest chunk in a cluster as its representative.
    #[cfg(feature = "embeddings")]
    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
        chunks.iter().max_by_key(|c| c.content.len()).copied()
    }
}

impl Default for SemanticCompressor {
    fn default() -> Self {
        Self::new()
    }
}

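/// Alias for [`SemanticAnalyzer`].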
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alias for [`SemanticCompressor`].
pub type HeuristicCompressor = SemanticCompressor;

/// Alias for [`SemanticConfig`].
pub type HeuristicCompressionConfig = SemanticConfig;

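/// Finds a truncation point at or below `target_len` that falls on a char
/// boundary, preferring a newline or space so the cut looks natural.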
fn find_safe_truncation_point(content: &str, target_len: usize) -> usize {
    if target_len >= content.len() {
        return content.len();
    }

    // Back off to the nearest char boundary at or below the target.
    let mut truncate_at = target_len;
    while truncate_at > 0 && !content.is_char_boundary(truncate_at) {
        truncate_at -= 1;
    }

    // Prefer breaking at a newline, then at a space, as long as that does not
    // discard more than half of the requested length.
    if let Some(newline_pos) = content[..truncate_at].rfind('\n') {
        if newline_pos > target_len / 2 {
            return newline_pos;
        }
    }

    if let Some(space_pos) = content[..truncate_at].rfind(' ') {
        if space_pos > target_len / 2 {
            return space_pos;
        }
    }

    truncate_at
}

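/// Computes the cosine similarity of two vectors, returning 0.0 for empty,
/// mismatched-length, or zero-norm inputs.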
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 1.0,
            ..Default::default()
        });
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content);
    }

    #[test]
    fn test_repetitive_with_variation() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5));
        }

        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_chinese() {
        let compressor = SemanticCompressor::new();
        let content = "中文测试 ".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(std::str::from_utf8(result.as_bytes()).is_ok());

        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_emoji() {
        let compressor = SemanticCompressor::new();
        let content = "🎉🎊🎁 ".repeat(80);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_mixed() {
        let compressor = SemanticCompressor::new();
        let content = "a中🎉 ".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_cyrillic() {
        let compressor = SemanticCompressor::new();
        let content = "Привет ".repeat(50);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_non_repetitive_unicode_boundary() {
        let compressor = SemanticCompressor::new();
        let content = "世界和平".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_repetitive_unicode_line_based() {
        let compressor = SemanticCompressor::new();
        let content = "中文行\n".repeat(100);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_semantic_error_display() {
        let err1 = SemanticError::ModelLoadError("test error".to_owned());
        assert!(err1.to_string().contains("Model loading failed"));
        assert!(err1.to_string().contains("test error"));

        let err2 = SemanticError::EmbeddingError("embed fail".to_owned());
        assert!(err2.to_string().contains("Embedding generation failed"));

        let err3 = SemanticError::ClusteringError("cluster fail".to_owned());
        assert!(err3.to_string().contains("Clustering failed"));

        let err4 = SemanticError::FeatureNotEnabled;
        assert!(err4.to_string().contains("embeddings feature not enabled"));
    }

    #[test]
    fn test_semantic_error_debug() {
        let err = SemanticError::ModelLoadError("debug test".to_owned());
        let debug_str = format!("{:?}", err);
        assert!(debug_str.contains("ModelLoadError"));
    }

    #[test]
    fn test_semantic_analyzer_default() {
        let analyzer = SemanticAnalyzer::default();
        let result = analyzer.embed("test");
        assert!(result.is_ok());
    }

    #[test]
    fn test_semantic_analyzer_debug() {
        let analyzer = SemanticAnalyzer::new();
        let debug_str = format!("{:?}", analyzer);
        assert!(debug_str.contains("SemanticAnalyzer"));
    }

    #[test]
    fn test_semantic_analyzer_embed_empty() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_embed_produces_384_dims() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("some code content").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_similarity_same_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello world", "hello world").unwrap();
        #[cfg(feature = "embeddings")]
        assert!((result - 1.0).abs() < 0.01);
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
    }

    #[test]
    fn test_semantic_analyzer_similarity_different_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello", "goodbye").unwrap();
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
        #[cfg(feature = "embeddings")]
        assert!((-1.0..=1.0).contains(&result));
    }

    #[test]
    fn test_semantic_config_custom() {
        let config = SemanticConfig {
            similarity_threshold: 0.9,
            min_chunk_size: 50,
            max_chunk_size: 5000,
            budget_ratio: 0.3,
        };
        assert_eq!(config.similarity_threshold, 0.9);
        assert_eq!(config.min_chunk_size, 50);
        assert_eq!(config.max_chunk_size, 5000);
        assert_eq!(config.budget_ratio, 0.3);
    }

    #[test]
    fn test_semantic_config_clone() {
        let config = SemanticConfig::default();
        let cloned = config.clone();
        assert_eq!(cloned.similarity_threshold, config.similarity_threshold);
        assert_eq!(cloned.budget_ratio, config.budget_ratio);
    }

    #[test]
    fn test_semantic_config_debug() {
        let config = SemanticConfig::default();
        let debug_str = format!("{:?}", config);
        assert!(debug_str.contains("SemanticConfig"));
        assert!(debug_str.contains("similarity_threshold"));
    }

    #[test]
    fn test_code_chunk_debug() {
        let chunk = CodeChunk {
            content: "test content".to_owned(),
            start: 0,
            end: 12,
            embedding: None,
            cluster_id: None,
        };
        let debug_str = format!("{:?}", chunk);
        assert!(debug_str.contains("CodeChunk"));
        assert!(debug_str.contains("test content"));
    }

    #[test]
    fn test_code_chunk_clone() {
        let chunk = CodeChunk {
            content: "original".to_owned(),
            start: 0,
            end: 8,
            embedding: Some(vec![0.1, 0.2, 0.3]),
            cluster_id: Some(5),
        };
        let cloned = chunk.clone();
        assert_eq!(cloned.content, "original");
        assert_eq!(cloned.start, 0);
        assert_eq!(cloned.end, 8);
        assert_eq!(cloned.embedding, Some(vec![0.1, 0.2, 0.3]));
        assert_eq!(cloned.cluster_id, Some(5));
    }

    #[test]
    fn test_semantic_compressor_default() {
        let compressor = SemanticCompressor::default();
        let result = compressor.compress("test").unwrap();
        assert_eq!(result, "test");
    }

    #[test]
    fn test_split_into_chunks_single_newline_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "Line 1 with content\nLine 2 with content\nLine 3 with content";
        let chunks = compressor.split_into_chunks(content);
        assert!(!chunks.is_empty() || content.len() < 5);
    }

    #[test]
    fn test_split_into_chunks_sentence_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First sentence here. Second sentence here. Third sentence here.";
        let chunks = compressor.split_into_chunks(content);
        assert!(!chunks.is_empty() || content.len() < 10);
    }

    #[test]
    fn test_split_into_chunks_force_split() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100,
            max_chunk_size: 20,
            ..Default::default()
        });

        let content = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        let chunks = compressor.split_into_chunks(content);
        assert!(
            chunks.len() >= 2,
            "Expected at least 2 chunks from force split, got {}",
            chunks.len()
        );
    }

    #[test]
    fn test_split_into_chunks_empty() {
        let compressor = SemanticCompressor::new();
        let chunks = compressor.split_into_chunks("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_split_into_chunks_below_min_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "short";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_compress_heuristic_empty_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 1000,
            budget_ratio: 1.0,
            ..Default::default()
        });

        let content = "short content";
        let result = compressor.compress_heuristic(content).unwrap();
        assert_eq!(result, content);
    }

    #[test]
    fn test_compress_heuristic_multiple_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 100,
            budget_ratio: 0.3,
            ..Default::default()
        });

        let content = "First chunk content here\n\nSecond chunk content here\n\nThird chunk content here\n\nFourth chunk content";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(result.contains("chunk") || result.contains("compressed"));
    }

    #[test]
    fn test_cosine_similarity_different_lengths() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn test_cosine_similarity_zero_vectors() {
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![1.0, 2.0, 3.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![-1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim + 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_normalized() {
        let a = vec![0.6, 0.8, 0.0];
        let b = vec![0.6, 0.8, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_compress_repetitive_short_content() {
        let compressor = SemanticCompressor::new();
        let content = "short ".repeat(10);
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_whitespace_only() {
        let compressor = SemanticCompressor::new();
        let content = " ".repeat(100);
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_low_coverage() {
        let compressor = SemanticCompressor::new();
        let mut content = "pattern ".repeat(5);
        content.push_str(&"x".repeat(200));
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_line_low_ratio() {
        let compressor = SemanticCompressor::new();
        let content = (0..20)
            .map(|i| format!("unique line {}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_mixed_with_unique() {
        let compressor = SemanticCompressor::new();
        let mut lines = vec![];
        for i in 0..50 {
            if i % 2 == 0 {
                lines.push("repeated line");
            } else {
                lines.push("unique line");
            }
        }
        let content = lines.join("\n");
        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_compress_no_repetition_returns_none() {
        let compressor = SemanticCompressor::new();
        let content = "The quick brown fox jumps over the lazy dog. ".repeat(5);
        let result = compressor.compress_repetitive(&content);
        // No assertion on the value: the call only needs to complete without panicking.
        drop(result);
    }

    #[test]
    fn test_type_aliases() {
        let _analyzer: CharacterFrequencyAnalyzer = SemanticAnalyzer::new();
        let _compressor: HeuristicCompressor = SemanticCompressor::new();
        let _config: HeuristicCompressionConfig = SemanticConfig::default();
    }

    #[test]
    fn test_compress_preserves_content_structure() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 500,
            budget_ratio: 1.0,
            ..Default::default()
        });

        let content = "def foo():\n pass\n\ndef bar():\n pass";
        let result = compressor.compress(content).unwrap();
        assert!(result.contains("foo") || result.contains("bar"));
    }

    #[test]
    fn test_split_chunks_respects_max_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 50,
            ..Default::default()
        });

        let content = "A very long chunk that exceeds the max size limit\n\nAnother chunk";
        let chunks = compressor.split_into_chunks(content);

        for chunk in &chunks {
            assert!(chunk.content.len() <= 50, "Chunk size {} exceeds max 50", chunk.content.len());
        }
    }

    #[test]
    fn test_compress_repetitive_with_remainder() {
        let compressor = SemanticCompressor::new();
        let mut content = "abc ".repeat(100);
        content.push_str("xyz");
        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_compressor_analyzer_method() {
        let compressor = SemanticCompressor::new();
        let analyzer = compressor.analyzer();

        let embed_result = analyzer.embed("test code");
        assert!(embed_result.is_ok());
    }

    #[test]
    fn test_code_chunk_with_embedding_and_cluster() {
        let chunk = CodeChunk {
            content: "fn main() {}".to_owned(),
            start: 0,
            end: 12,
            embedding: Some(vec![0.5; 384]),
            cluster_id: Some(3),
        };

        assert_eq!(chunk.content, "fn main() {}");
        assert_eq!(chunk.start, 0);
        assert_eq!(chunk.end, 12);
        assert!(chunk.embedding.is_some());
        assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384);
        assert_eq!(chunk.cluster_id, Some(3));
    }

    #[test]
    fn test_compress_very_long_repetitive() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.2,
            ..Default::default()
        });

        let content = "repeated_token ".repeat(1000);
        let result = compressor.compress(&content).unwrap();

        assert!(result.len() < content.len() / 3);
        assert!(result.contains("repeated"));
    }

    #[test]
    fn test_semantic_result_type_ok() {
        let result: Result<String> = Ok("success".to_owned());
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "success");
    }

    #[test]
    fn test_semantic_result_type_err() {
        let result: Result<String> = Err(SemanticError::FeatureNotEnabled);
        assert!(result.is_err());
    }

    #[test]
    fn test_find_safe_truncation_point_basic() {
        let content = "Hello world this is a test";
        let point = find_safe_truncation_point(content, 15);
        assert!(content.is_char_boundary(point));
        assert!(point <= 15 || point == content.len());
    }

    #[test]
    fn test_find_safe_truncation_point_newline() {
        let content = "Line one\nLine two\nLine three";
        let point = find_safe_truncation_point(content, 20);
        assert!(content.is_char_boundary(point));
    }

    #[test]
    fn test_find_safe_truncation_point_unicode() {
        let content = "Hello 世界 test";
        let point = find_safe_truncation_point(content, 10);
        assert!(content.is_char_boundary(point));
    }

    #[test]
    fn test_find_safe_truncation_point_beyond_length() {
        let content = "short";
        let point = find_safe_truncation_point(content, 100);
        assert_eq!(point, content.len());
    }

    #[test]
    fn test_budget_ratio_affects_large_content() {
        let content = (0..20)
            .map(|i| {
                format!("This is paragraph number {} with some content to fill it out nicely.", i)
            })
            .collect::<Vec<_>>()
            .join("\n\n");

        let compressor_30 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            min_chunk_size: 20,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.8,
            min_chunk_size: 20,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let result_30 = compressor_30.compress(&content).unwrap();
        let result_80 = compressor_80.compress(&content).unwrap();

        assert!(
            result_30.len() < result_80.len(),
            "30% budget ({}) should be smaller than 80% budget ({})",
            result_30.len(),
            result_80.len()
        );

        assert!(
            result_30.contains("compressed") || result_30.len() < content.len(),
            "30% should show compression indicator"
        );
    }

    #[test]
    fn test_budget_ratio_one_returns_original() {
        let content = "Some content without chunk boundaries";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 1.0,
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();
        assert_eq!(result, content);
    }

    #[test]
    fn test_budget_ratio_affects_small_content() {
        let content = "This is a short test string that should be affected by budget ratio.";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();

        assert!(
            result.len() < content.len() || result.contains("truncated"),
            "Small content with budget_ratio=0.3 should be compressed. Original: {}, Result: {}",
            content.len(),
            result.len()
        );
    }

    #[test]
    fn test_budget_ratio_one_preserves_small_content() {
        let content = "Short content that should remain unchanged with budget_ratio=1.0";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 1.0,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content, "budget_ratio=1.0 should preserve content");
    }

    #[test]
    fn test_very_short_content_unchanged() {
        let content = "tiny";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.1,
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content, "Very short content should be unchanged");
    }

    #[test]
    fn test_budget_ratio_medium_no_chunks() {
        let content = "This is a medium length test content that has no paragraph breaks and should trigger the budget ratio truncation path because there are no chunk boundaries.";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.5,
            min_chunk_size: 200,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();

        assert!(
            result.len() < content.len(),
            "Medium content with budget_ratio=0.5 should be compressed. Original: {}, Result: {}",
            content.len(),
            result.len()
        );
    }

    #[test]
    fn test_truncation_marker_format() {
        let content = "A sufficiently long piece of content that will definitely be truncated when we set a low budget ratio.";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            min_chunk_size: 200,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();

        if result.contains("truncated") {
            assert!(result.contains('%'), "Truncation marker should include percentage");
            assert!(result.contains("chars"), "Truncation marker should include char count");
        }
    }

    #[test]
    fn test_budget_ratio_proportional() {
        let content = "This content is long enough to test different budget ratio values and see that they produce outputs of proportionally different sizes as expected.";

        let compressor_20 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.2,
            min_chunk_size: 200,
            ..Default::default()
        });

        let compressor_50 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.5,
            min_chunk_size: 200,
            ..Default::default()
        });

        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.8,
            min_chunk_size: 200,
            ..Default::default()
        });

        let result_20 = compressor_20.compress(content).unwrap();
        let result_50 = compressor_50.compress(content).unwrap();
        let result_80 = compressor_80.compress(content).unwrap();

        assert!(
            result_20.len() <= result_50.len(),
            "20% ratio ({}) should be <= 50% ratio ({})",
            result_20.len(),
            result_50.len()
        );
        assert!(
            result_50.len() <= result_80.len(),
            "50% ratio ({}) should be <= 80% ratio ({})",
            result_50.len(),
            result_80.len()
        );
    }
}