1use std::collections::HashMap;
33
/// Module-wide result alias; all fallible APIs here return [`SemanticError`].
pub type Result<T> = std::result::Result<T, SemanticError>;
36
/// Errors produced by semantic analysis and compression.
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    /// The embedding model could not be loaded.
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    /// Generating an embedding vector failed.
    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    /// Grouping chunks into similarity clusters failed
    /// (e.g. a chunk was missing its embedding).
    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    /// An embeddings-only API was used without the `embeddings` feature.
    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}
52
/// Produces lightweight pseudo-embeddings for text.
///
/// The current implementation is character-frequency based (see `embed` and
/// the `CharacterFrequencyAnalyzer` alias); the model path is stored but not
/// yet used to load a real model.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Optional path to an embedding model, exposed via `model_path()`.
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Same field, underscore-prefixed to silence dead-code warnings when
    /// the `embeddings` feature is disabled (no accessor exists then).
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}
71
72impl SemanticAnalyzer {
73 pub fn new() -> Self {
75 Self {
76 #[cfg(feature = "embeddings")]
77 model_path: None,
78 #[cfg(not(feature = "embeddings"))]
79 _model_path: None,
80 }
81 }
82
83 pub fn with_model(model_path: &str) -> Self {
88 Self {
89 #[cfg(feature = "embeddings")]
90 model_path: Some(model_path.to_owned()),
91 #[cfg(not(feature = "embeddings"))]
92 _model_path: Some(model_path.to_owned()),
93 }
94 }
95
96 #[cfg(feature = "embeddings")]
98 pub fn model_path(&self) -> Option<&str> {
99 self.model_path.as_deref()
100 }
101
102 #[cfg(feature = "embeddings")]
120 pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
121 let mut embedding = vec![0.0f32; 384];
123 for (i, c) in content.chars().enumerate() {
124 let idx = (c as usize) % 384;
125 embedding[idx] += 1.0 / ((i + 1) as f32);
127 }
128 let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
130 if norm > 0.0 {
131 for x in &mut embedding {
132 *x /= norm;
133 }
134 }
135 Ok(embedding)
136 }
137
138 #[cfg(not(feature = "embeddings"))]
140 pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
141 Ok(vec![0.0; 384])
142 }
143
144 #[cfg(feature = "embeddings")]
146 pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
147 let emb_a = self.embed(a)?;
148 let emb_b = self.embed(b)?;
149 Ok(cosine_similarity(&emb_a, &emb_b))
150 }
151
152 #[cfg(not(feature = "embeddings"))]
154 pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
155 Ok(0.0)
156 }
157}
158
159impl Default for SemanticAnalyzer {
160 fn default() -> Self {
161 Self::new()
162 }
163}
164
/// Tuning knobs for semantic compression.
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Cosine-similarity threshold at or above which two chunks are placed
    /// in the same cluster (embeddings path only).
    pub similarity_threshold: f32,
    /// Chunks shorter than this many bytes are not emitted by the splitter.
    pub min_chunk_size: usize,
    /// Chunks longer than this many bytes are skipped by the delimiter-based
    /// splitters; the fixed-size fallback uses it as the slice length.
    pub max_chunk_size: usize,
    /// Fraction of the original content to keep (expected 0.0–1.0).
    pub budget_ratio: f32,
}
181
182impl Default for SemanticConfig {
183 fn default() -> Self {
184 Self {
185 similarity_threshold: 0.7,
186 min_chunk_size: 100,
187 max_chunk_size: 2000,
188 budget_ratio: 0.5,
189 }
190 }
191}
192
/// A contiguous slice of the input selected for compression.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// The chunk text itself.
    pub content: String,
    /// Byte offset of the chunk start in the original content.
    pub start: usize,
    /// Exclusive byte offset of the chunk end in the original content.
    pub end: usize,
    /// Embedding vector, filled in on the embeddings path before clustering.
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment. NOTE(review): nothing in this module currently
    /// writes this field — presumably reserved for callers; confirm.
    pub cluster_id: Option<usize>,
}
207
/// Compresses text by detecting repetition, sampling chunks under a budget,
/// and (with the `embeddings` feature) clustering similar chunks.
pub struct SemanticCompressor {
    /// Compression tuning parameters.
    config: SemanticConfig,
    /// Analyzer used to embed chunks on the embeddings path.
    analyzer: SemanticAnalyzer,
}
217
218impl SemanticCompressor {
219 pub fn new() -> Self {
221 Self::with_config(SemanticConfig::default())
222 }
223
224 pub fn with_config(config: SemanticConfig) -> Self {
226 Self { config, analyzer: SemanticAnalyzer::new() }
227 }
228
229 pub fn analyzer(&self) -> &SemanticAnalyzer {
234 &self.analyzer
235 }
236
237 pub fn compress(&self, content: &str) -> Result<String> {
244 if let Some(compressed) = self.compress_repetitive(content) {
246 return Ok(compressed);
247 }
248
249 #[cfg(feature = "embeddings")]
250 {
251 self.compress_with_embeddings(content)
252 }
253
254 #[cfg(not(feature = "embeddings"))]
255 {
256 self.compress_heuristic(content)
257 }
258 }
259
260 fn compress_repetitive(&self, content: &str) -> Option<String> {
267 if content.len() < 200 {
269 return None;
270 }
271
272 for pattern_len in 1..=100.min(content.len() / 3) {
276 if !content.is_char_boundary(pattern_len) {
278 continue;
279 }
280
281 let pattern = &content[..pattern_len];
282
283 if pattern.chars().all(|c| c.is_whitespace()) {
285 continue;
286 }
287
288 let mut count = 0;
290 let mut pos = 0;
291 while pos + pattern_len <= content.len() {
292 if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
294 break;
295 }
296 if &content[pos..pos + pattern_len] == pattern {
297 count += 1;
298 pos += pattern_len;
299 } else {
300 break;
301 }
302 }
303
304 let coverage = (count * pattern_len) as f32 / content.len() as f32;
306 if count >= 3 && coverage >= 0.8 {
307 let instances_to_show = (count as f32 * self.config.budget_ratio)
309 .ceil()
310 .clamp(1.0, 5.0) as usize;
311
312 let shown_content = pattern.repeat(instances_to_show);
313 let remainder_start = count * pattern_len;
315 let remainder = if remainder_start <= content.len()
316 && content.is_char_boundary(remainder_start)
317 {
318 &content[remainder_start..]
319 } else {
320 ""
321 };
322
323 let result = if remainder.is_empty() {
324 format!(
325 "{}\n/* ... pattern repeated {} times (showing {}) ... */",
326 shown_content.trim_end(),
327 count,
328 instances_to_show
329 )
330 } else {
331 format!(
332 "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
333 shown_content.trim_end(),
334 count,
335 instances_to_show,
336 remainder.trim()
337 )
338 };
339
340 return Some(result);
341 }
342 }
343
344 let lines: Vec<&str> = content.lines().collect();
346 if lines.len() >= 3 {
347 let mut line_counts: HashMap<&str, usize> = HashMap::new();
348 for line in &lines {
349 *line_counts.entry(*line).or_insert(0) += 1;
350 }
351
352 if let Some((repeated_line, count)) = line_counts
354 .iter()
355 .filter(|(line, _)| !line.trim().is_empty())
356 .max_by_key(|(_, count)| *count)
357 {
358 let repetition_ratio = *count as f32 / lines.len() as f32;
359 if *count >= 3 && repetition_ratio >= 0.5 {
360 let mut result = String::new();
362 let mut consecutive_count = 0;
363 let mut last_was_repeated = false;
364
365 for line in &lines {
366 if *line == *repeated_line {
367 consecutive_count += 1;
368 if !last_was_repeated {
369 if !result.is_empty() {
370 result.push('\n');
371 }
372 result.push_str(line);
373 }
374 last_was_repeated = true;
375 } else {
376 if last_was_repeated && consecutive_count > 1 {
377 result.push_str(&format!(
378 "\n/* ... above line repeated {} times ... */",
379 consecutive_count
380 ));
381 }
382 consecutive_count = 0;
383 last_was_repeated = false;
384 if !result.is_empty() {
385 result.push('\n');
386 }
387 result.push_str(line);
388 }
389 }
390
391 if last_was_repeated && consecutive_count > 1 {
392 result.push_str(&format!(
393 "\n/* ... above line repeated {} times ... */",
394 consecutive_count
395 ));
396 }
397
398 if result.len() < content.len() / 2 {
400 return Some(result);
401 }
402 }
403 }
404 }
405
406 None
407 }
408
409 fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
411 let mut chunks = Vec::new();
412 let mut current_start = 0;
413
414 for (i, _) in content.match_indices("\n\n") {
416 if i > current_start && i - current_start >= self.config.min_chunk_size {
417 let chunk_content = &content[current_start..i];
418 if chunk_content.len() <= self.config.max_chunk_size {
419 chunks.push(CodeChunk {
420 content: chunk_content.to_owned(),
421 start: current_start,
422 end: i,
423 embedding: None,
424 cluster_id: None,
425 });
426 }
427 current_start = i + 2;
428 }
429 }
430
431 if current_start < content.len() {
433 let remaining = &content[current_start..];
434 if remaining.len() >= self.config.min_chunk_size {
435 chunks.push(CodeChunk {
436 content: remaining.to_owned(),
437 start: current_start,
438 end: content.len(),
439 embedding: None,
440 cluster_id: None,
441 });
442 }
443 }
444
445 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
447 current_start = 0;
448 for (i, _) in content.match_indices('\n') {
449 if i > current_start && i - current_start >= self.config.min_chunk_size {
450 let chunk_content = &content[current_start..i];
451 if chunk_content.len() <= self.config.max_chunk_size {
452 chunks.push(CodeChunk {
453 content: chunk_content.to_owned(),
454 start: current_start,
455 end: i,
456 embedding: None,
457 cluster_id: None,
458 });
459 }
460 current_start = i + 1;
461 }
462 }
463 if current_start < content.len() {
465 let remaining = &content[current_start..];
466 if remaining.len() >= self.config.min_chunk_size {
467 chunks.push(CodeChunk {
468 content: remaining.to_owned(),
469 start: current_start,
470 end: content.len(),
471 embedding: None,
472 cluster_id: None,
473 });
474 }
475 }
476 }
477
478 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
480 current_start = 0;
481 for (i, _) in content.match_indices(". ") {
482 if i > current_start && i - current_start >= self.config.min_chunk_size {
483 let chunk_content = &content[current_start..=i]; if chunk_content.len() <= self.config.max_chunk_size {
485 chunks.push(CodeChunk {
486 content: chunk_content.to_owned(),
487 start: current_start,
488 end: i + 1,
489 embedding: None,
490 cluster_id: None,
491 });
492 }
493 current_start = i + 2;
494 }
495 }
496 if current_start < content.len() {
498 let remaining = &content[current_start..];
499 if remaining.len() >= self.config.min_chunk_size {
500 chunks.push(CodeChunk {
501 content: remaining.to_owned(),
502 start: current_start,
503 end: content.len(),
504 embedding: None,
505 cluster_id: None,
506 });
507 }
508 }
509 }
510
511 if chunks.is_empty() && content.len() > self.config.max_chunk_size {
513 let mut pos = 0;
514 while pos < content.len() {
515 let end = (pos + self.config.max_chunk_size).min(content.len());
516 chunks.push(CodeChunk {
517 content: content[pos..end].to_owned(),
518 start: pos,
519 end,
520 embedding: None,
521 cluster_id: None,
522 });
523 pos = end;
524 }
525 }
526
527 chunks
528 }
529
530 fn compress_heuristic(&self, content: &str) -> Result<String> {
535 let chunks = self.split_into_chunks(content);
536
537 if chunks.is_empty() {
540 if self.config.budget_ratio < 1.0 && content.len() >= 10 {
545 let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
546 if target_len > 0 && target_len < content.len() {
547 let truncate_at = find_safe_truncation_point(content, target_len);
549 if truncate_at < content.len() && truncate_at > 0 {
550 let truncated = &content[..truncate_at];
551 return Ok(format!(
552 "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
553 truncated.trim_end(),
554 self.config.budget_ratio * 100.0,
555 truncate_at,
556 content.len()
557 ));
558 }
559 }
560 }
561 return Ok(content.to_owned());
562 }
563
564 if chunks.len() == 1 && self.config.budget_ratio < 1.0 {
567 let chunk_content = &chunks[0].content;
568 let target_len = (chunk_content.len() as f32 * self.config.budget_ratio) as usize;
569 if target_len > 0 && target_len < chunk_content.len() {
570 let truncate_at = find_safe_truncation_point(chunk_content, target_len);
571 if truncate_at < chunk_content.len() && truncate_at > 0 {
572 let truncated = &chunk_content[..truncate_at];
573 return Ok(format!(
574 "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
575 truncated.trim_end(),
576 self.config.budget_ratio * 100.0,
577 truncate_at,
578 chunk_content.len()
579 ));
580 }
581 }
582 }
583
584 let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
586 let step = chunks.len() / target_chunks.max(1);
587
588 let mut result = String::new();
589 let mut kept = 0;
590
591 for (i, chunk) in chunks.iter().enumerate() {
592 if i % step.max(1) == 0 && kept < target_chunks {
593 if !result.is_empty() {
594 result.push_str("\n\n");
595 }
596 result.push_str(&chunk.content);
597 kept += 1;
598 }
599 }
600
601 if kept < chunks.len() {
603 result.push_str(&format!(
604 "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
605 chunks.len() - kept,
606 (kept as f32 / chunks.len() as f32) * 100.0
607 ));
608 }
609
610 Ok(result)
611 }
612
613 #[cfg(feature = "embeddings")]
615 fn compress_with_embeddings(&self, content: &str) -> Result<String> {
616 let mut chunks = self.split_into_chunks(content);
617
618 if chunks.is_empty() {
621 if self.config.budget_ratio < 1.0 && content.len() >= 10 {
626 let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
627 if target_len > 0 && target_len < content.len() {
628 let truncate_at = find_safe_truncation_point(content, target_len);
630 if truncate_at < content.len() && truncate_at > 0 {
631 let truncated = &content[..truncate_at];
632 return Ok(format!(
633 "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
634 truncated.trim_end(),
635 self.config.budget_ratio * 100.0,
636 truncate_at,
637 content.len()
638 ));
639 }
640 }
641 }
642 return Ok(content.to_owned());
643 }
644
645 for chunk in &mut chunks {
647 chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
648 }
649
650 let clusters = self.cluster_chunks(&chunks)?;
652
653 if clusters.len() == 1 {
656 let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
658 let step = chunks.len() / target_chunks.max(1);
659
660 let mut result = String::new();
661 let mut kept = 0;
662
663 for (i, chunk) in chunks.iter().enumerate() {
664 if i % step.max(1) == 0 && kept < target_chunks {
665 if !result.is_empty() {
666 result.push_str("\n\n");
667 }
668 result.push_str(&chunk.content);
669 kept += 1;
670 }
671 }
672
673 if kept < chunks.len() {
675 result.push_str(&format!(
676 "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
677 chunks.len() - kept,
678 (kept as f32 / chunks.len() as f32) * 100.0
679 ));
680 }
681
682 return Ok(result);
683 }
684
685 let mut result = String::new();
687 for cluster in clusters.values() {
688 if let Some(representative) = self.select_representative(cluster) {
689 if !result.is_empty() {
690 result.push_str("\n\n");
691 }
692 result.push_str(&representative.content);
693 }
694 }
695
696 Ok(result)
697 }
698
699 #[cfg(feature = "embeddings")]
701 fn cluster_chunks<'a>(
702 &self,
703 chunks: &'a [CodeChunk],
704 ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
705 let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
706 let mut next_cluster = 0;
707
708 for chunk in chunks {
709 let embedding = chunk
710 .embedding
711 .as_ref()
712 .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
713
714 let mut target_cluster = None;
716 for (&cluster_id, cluster_chunks) in &clusters {
717 if let Some(first) = cluster_chunks.first() {
718 if let Some(ref first_emb) = first.embedding {
719 let similarity = cosine_similarity(embedding, first_emb);
720 if similarity >= self.config.similarity_threshold {
721 target_cluster = Some(cluster_id);
722 break;
723 }
724 }
725 }
726 }
727
728 if let Some(cluster_id) = target_cluster {
729 if let Some(cluster) = clusters.get_mut(&cluster_id) {
730 cluster.push(chunk);
731 }
732 } else {
733 clusters.insert(next_cluster, vec![chunk]);
734 next_cluster += 1;
735 }
736 }
737
738 Ok(clusters)
739 }
740
741 #[cfg(feature = "embeddings")]
743 fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
744 chunks.iter().max_by_key(|c| c.content.len()).copied()
746 }
747}
748
749impl Default for SemanticCompressor {
750 fn default() -> Self {
751 Self::new()
752 }
753}
754
/// Honest alias for the current pseudo-embedding implementation, which is
/// character-frequency based (see `SemanticAnalyzer::embed`).
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Back-compat alias for the non-embeddings compression entry point.
pub type HeuristicCompressor = SemanticCompressor;

/// Back-compat alias for the compression configuration.
pub type HeuristicCompressionConfig = SemanticConfig;
778
/// Finds a byte offset at or below `target_len` where `content` can be cut
/// without splitting a UTF-8 character, preferring a newline and then a
/// space break as long as the break keeps more than half the target length.
fn find_safe_truncation_point(content: &str, target_len: usize) -> usize {
    if target_len >= content.len() {
        return content.len();
    }

    // Snap down to the nearest character boundary (0 is always a boundary).
    let boundary = (0..=target_len)
        .rev()
        .find(|&i| content.is_char_boundary(i))
        .unwrap_or(0);
    let head = &content[..boundary];

    // Prefer breaking before a newline, then before a space, but only when
    // the break point still keeps more than half of the requested length.
    for separator in ['\n', ' '] {
        if let Some(pos) = head.rfind(separator) {
            if pos > target_len / 2 {
                return pos;
            }
        }
    }

    boundary
}
816
/// Cosine similarity of two equal-length vectors, in [-1, 1].
///
/// Returns 0.0 for empty input, mismatched lengths, or a zero-norm vector.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Single pass accumulating dot product and both squared norms, in the
    // same left-to-right order as separate sums would use.
    let mut dot = 0.0f32;
    let mut sq_a = 0.0f32;
    let mut sq_b = 0.0f32;
    for (x, y) in a.iter().zip(b.iter()) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }

    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}
843
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        // No accessor without the feature; just prove construction works.
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 1.0,
            ..Default::default()
        });
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content);
    }

    #[test]
    fn test_repetitive_with_variation() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5));
        }

        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_chinese() {
        let compressor = SemanticCompressor::new();
        let content = "中文测试 ".repeat(100);
        let result = compressor.compress(&content).unwrap();

        // Multi-byte input must never be sliced mid-character.
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());

        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_emoji() {
        let compressor = SemanticCompressor::new();
        let content = "🎉🎊🎁 ".repeat(80);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_mixed() {
        let compressor = SemanticCompressor::new();
        let content = "a中🎉 ".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_cyrillic() {
        let compressor = SemanticCompressor::new();
        let content = "Привет ".repeat(50);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_non_repetitive_unicode_boundary() {
        let compressor = SemanticCompressor::new();
        let content = "世界和平".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_repetitive_unicode_line_based() {
        let compressor = SemanticCompressor::new();
        let content = "中文行\n".repeat(100);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_semantic_error_display() {
        let err1 = SemanticError::ModelLoadError("test error".to_owned());
        assert!(err1.to_string().contains("Model loading failed"));
        assert!(err1.to_string().contains("test error"));

        let err2 = SemanticError::EmbeddingError("embed fail".to_owned());
        assert!(err2.to_string().contains("Embedding generation failed"));

        let err3 = SemanticError::ClusteringError("cluster fail".to_owned());
        assert!(err3.to_string().contains("Clustering failed"));

        let err4 = SemanticError::FeatureNotEnabled;
        assert!(err4.to_string().contains("embeddings feature not enabled"));
    }

    #[test]
    fn test_semantic_error_debug() {
        let err = SemanticError::ModelLoadError("debug test".to_owned());
        let debug_str = format!("{:?}", err);
        assert!(debug_str.contains("ModelLoadError"));
    }

    #[test]
    fn test_semantic_analyzer_default() {
        let analyzer = SemanticAnalyzer::default();
        let result = analyzer.embed("test");
        assert!(result.is_ok());
    }

    #[test]
    fn test_semantic_analyzer_debug() {
        let analyzer = SemanticAnalyzer::new();
        let debug_str = format!("{:?}", analyzer);
        assert!(debug_str.contains("SemanticAnalyzer"));
    }

    #[test]
    fn test_semantic_analyzer_embed_empty() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_embed_produces_384_dims() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("some code content").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_similarity_same_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello world", "hello world").unwrap();
        #[cfg(feature = "embeddings")]
        assert!((result - 1.0).abs() < 0.01);
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
    }

    #[test]
    fn test_semantic_analyzer_similarity_different_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello", "goodbye").unwrap();
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
        #[cfg(feature = "embeddings")]
        assert!((-1.0..=1.0).contains(&result));
    }

    #[test]
    fn test_semantic_config_custom() {
        let config = SemanticConfig {
            similarity_threshold: 0.9,
            min_chunk_size: 50,
            max_chunk_size: 5000,
            budget_ratio: 0.3,
        };
        assert_eq!(config.similarity_threshold, 0.9);
        assert_eq!(config.min_chunk_size, 50);
        assert_eq!(config.max_chunk_size, 5000);
        assert_eq!(config.budget_ratio, 0.3);
    }

    #[test]
    fn test_semantic_config_clone() {
        let config = SemanticConfig::default();
        let cloned = config.clone();
        assert_eq!(cloned.similarity_threshold, config.similarity_threshold);
        assert_eq!(cloned.budget_ratio, config.budget_ratio);
    }

    #[test]
    fn test_semantic_config_debug() {
        let config = SemanticConfig::default();
        let debug_str = format!("{:?}", config);
        assert!(debug_str.contains("SemanticConfig"));
        assert!(debug_str.contains("similarity_threshold"));
    }

    #[test]
    fn test_code_chunk_debug() {
        let chunk = CodeChunk {
            content: "test content".to_owned(),
            start: 0,
            end: 12,
            embedding: None,
            cluster_id: None,
        };
        let debug_str = format!("{:?}", chunk);
        assert!(debug_str.contains("CodeChunk"));
        assert!(debug_str.contains("test content"));
    }

    #[test]
    fn test_code_chunk_clone() {
        let chunk = CodeChunk {
            content: "original".to_owned(),
            start: 0,
            end: 8,
            embedding: Some(vec![0.1, 0.2, 0.3]),
            cluster_id: Some(5),
        };
        // Actually exercise `Clone`; the previous version moved the value.
        let cloned = chunk.clone();
        assert_eq!(cloned.content, "original");
        assert_eq!(cloned.start, 0);
        assert_eq!(cloned.end, 8);
        assert_eq!(cloned.embedding, Some(vec![0.1, 0.2, 0.3]));
        assert_eq!(cloned.cluster_id, Some(5));
        assert_eq!(chunk.content, cloned.content);
    }

    #[test]
    fn test_semantic_compressor_default() {
        let compressor = SemanticCompressor::default();
        let result = compressor.compress("test").unwrap();
        assert_eq!(result, "test");
    }

    #[test]
    fn test_split_into_chunks_single_newline_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "Line 1 with content\nLine 2 with content\nLine 3 with content";
        let chunks = compressor.split_into_chunks(content);
        assert!(!chunks.is_empty() || content.len() < 5);
    }

    #[test]
    fn test_split_into_chunks_sentence_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First sentence here. Second sentence here. Third sentence here.";
        let chunks = compressor.split_into_chunks(content);
        assert!(!chunks.is_empty() || content.len() < 10);
    }

    #[test]
    fn test_split_into_chunks_force_split() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100,
            max_chunk_size: 20,
            ..Default::default()
        });

        let content = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        let chunks = compressor.split_into_chunks(content);
        assert!(
            chunks.len() >= 2,
            "Expected at least 2 chunks from force split, got {}",
            chunks.len()
        );
    }

    #[test]
    fn test_split_into_chunks_empty() {
        let compressor = SemanticCompressor::new();
        let chunks = compressor.split_into_chunks("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_split_into_chunks_below_min_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "short";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_compress_heuristic_empty_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 1000,
            budget_ratio: 1.0,
            ..Default::default()
        });

        let content = "short content";
        let result = compressor.compress_heuristic(content).unwrap();
        assert_eq!(result, content);
    }

    #[test]
    fn test_compress_heuristic_multiple_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 100,
            budget_ratio: 0.3,
            ..Default::default()
        });

        let content = "First chunk content here\n\nSecond chunk content here\n\nThird chunk content here\n\nFourth chunk content";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(result.contains("chunk") || result.contains("compressed"));
    }

    #[test]
    fn test_cosine_similarity_different_lengths() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn test_cosine_similarity_zero_vectors() {
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![1.0, 2.0, 3.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![-1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim + 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_normalized() {
        let a = vec![0.6, 0.8, 0.0];
        let b = vec![0.6, 0.8, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_compress_repetitive_short_content() {
        let compressor = SemanticCompressor::new();
        // Below the 200-byte floor, the detector must decline.
        let content = "short ".repeat(10);
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_whitespace_only() {
        let compressor = SemanticCompressor::new();
        let content = " ".repeat(100);
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_low_coverage() {
        let compressor = SemanticCompressor::new();
        // The repeated prefix covers well under 80% of the content.
        let mut content = "pattern ".repeat(5);
        content.push_str(&"x".repeat(200));
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_line_low_ratio() {
        let compressor = SemanticCompressor::new();
        let content = (0..20)
            .map(|i| format!("unique line {}", i))
            .collect::<Vec<_>>()
            .join("\n");
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_mixed_with_unique() {
        let compressor = SemanticCompressor::new();
        let mut lines = vec![];
        for i in 0..50 {
            if i % 2 == 0 {
                lines.push("repeated line");
            } else {
                lines.push("unique line");
            }
        }
        let content = lines.join("\n");
        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_compress_no_repetition_returns_none() {
        let compressor = SemanticCompressor::new();
        // NOTE: the whole sentence repeats, so the prefix detector may
        // legitimately fire here; only exercise that it does not panic.
        let content = "The quick brown fox jumps over the lazy dog. ".repeat(5);
        let result = compressor.compress_repetitive(&content);
        drop(result);
    }

    #[test]
    fn test_type_aliases() {
        let _analyzer: CharacterFrequencyAnalyzer = SemanticAnalyzer::new();
        let _compressor: HeuristicCompressor = SemanticCompressor::new();
        let _config: HeuristicCompressionConfig = SemanticConfig::default();
    }

    #[test]
    fn test_compress_preserves_content_structure() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 500,
            budget_ratio: 1.0,
            ..Default::default()
        });

        let content = "def foo():\n    pass\n\ndef bar():\n    pass";
        let result = compressor.compress(content).unwrap();
        assert!(result.contains("foo") || result.contains("bar"));
    }

    #[test]
    fn test_split_chunks_respects_max_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 50,
            ..Default::default()
        });

        let content = "A very long chunk that exceeds the max size limit\n\nAnother chunk";
        let chunks = compressor.split_into_chunks(content);

        for chunk in &chunks {
            assert!(chunk.content.len() <= 50, "Chunk size {} exceeds max 50", chunk.content.len());
        }
    }

    #[test]
    fn test_compress_repetitive_with_remainder() {
        let compressor = SemanticCompressor::new();
        let mut content = "abc ".repeat(100);
        content.push_str("xyz");
        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }
}
1491 #[test]
1492 fn test_compressor_analyzer_method() {
1493 let compressor = SemanticCompressor::new();
1494 let analyzer = compressor.analyzer();
1495
1496 let embed_result = analyzer.embed("test code");
1498 assert!(embed_result.is_ok());
1499 }
1500
1501 #[test]
1502 fn test_code_chunk_with_embedding_and_cluster() {
1503 let chunk = CodeChunk {
1504 content: "fn main() {}".to_owned(),
1505 start: 0,
1506 end: 12,
1507 embedding: Some(vec![0.5; 384]),
1508 cluster_id: Some(3),
1509 };
1510
1511 assert_eq!(chunk.content, "fn main() {}");
1512 assert_eq!(chunk.start, 0);
1513 assert_eq!(chunk.end, 12);
1514 assert!(chunk.embedding.is_some());
1515 assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384);
1516 assert_eq!(chunk.cluster_id, Some(3));
1517 }
1518
1519 #[test]
1520 fn test_compress_very_long_repetitive() {
1521 let compressor = SemanticCompressor::with_config(SemanticConfig {
1522 budget_ratio: 0.2, ..Default::default()
1524 });
1525
1526 let content = "repeated_token ".repeat(1000);
1528 let result = compressor.compress(&content).unwrap();
1529
1530 assert!(result.len() < content.len() / 3);
1532 assert!(result.contains("repeated"));
1533 }
1534
1535 #[test]
1536 fn test_semantic_result_type_ok() {
1537 let result: Result<String> = Ok("success".to_owned());
1538 assert!(result.is_ok());
1539 assert_eq!(result.unwrap(), "success");
1540 }
1541
1542 #[test]
1543 fn test_semantic_result_type_err() {
1544 let result: Result<String> = Err(SemanticError::FeatureNotEnabled);
1545 assert!(result.is_err());
1546 }
1547
1548 #[test]
1550 fn test_find_safe_truncation_point_basic() {
1551 let content = "Hello world this is a test";
1552 let point = find_safe_truncation_point(content, 15);
1553 assert!(content.is_char_boundary(point));
1555 assert!(point <= 15 || point == content.len());
1556 }
1557
1558 #[test]
1559 fn test_find_safe_truncation_point_newline() {
1560 let content = "Line one\nLine two\nLine three";
1561 let point = find_safe_truncation_point(content, 20);
1562 assert!(content.is_char_boundary(point));
1564 }
1565
1566 #[test]
1567 fn test_find_safe_truncation_point_unicode() {
1568 let content = "Hello 世界 test";
1569 let point = find_safe_truncation_point(content, 10);
1570 assert!(content.is_char_boundary(point));
1572 }
1573
1574 #[test]
1575 fn test_find_safe_truncation_point_beyond_length() {
1576 let content = "short";
1577 let point = find_safe_truncation_point(content, 100);
1578 assert_eq!(point, content.len());
1579 }
1580
1581 #[test]
1582 fn test_budget_ratio_affects_large_content() {
1583 let content = (0..20)
1586 .map(|i| {
1587 format!("This is paragraph number {} with some content to fill it out nicely.", i)
1588 })
1589 .collect::<Vec<_>>()
1590 .join("\n\n");
1591
1592 let compressor_30 = SemanticCompressor::with_config(SemanticConfig {
1594 budget_ratio: 0.3,
1595 min_chunk_size: 20,
1596 max_chunk_size: 2000,
1597 ..Default::default()
1598 });
1599
1600 let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
1601 budget_ratio: 0.8,
1602 min_chunk_size: 20,
1603 max_chunk_size: 2000,
1604 ..Default::default()
1605 });
1606
1607 let result_30 = compressor_30.compress(&content).unwrap();
1608 let result_80 = compressor_80.compress(&content).unwrap();
1609
1610 assert!(
1612 result_30.len() < result_80.len(),
1613 "30% budget ({}) should be smaller than 80% budget ({})",
1614 result_30.len(),
1615 result_80.len()
1616 );
1617
1618 assert!(
1620 result_30.contains("compressed") || result_30.len() < content.len(),
1621 "30% should show compression indicator"
1622 );
1623 }
1624
1625 #[test]
1626 fn test_budget_ratio_one_returns_original() {
1627 let content = "Some content without chunk boundaries";
1628
1629 let compressor = SemanticCompressor::with_config(SemanticConfig {
1630 budget_ratio: 1.0, ..Default::default()
1632 });
1633
1634 let result = compressor.compress(content).unwrap();
1635 assert_eq!(result, content);
1637 }
1638
1639 #[test]
1647 fn test_budget_ratio_affects_small_content() {
1648 let content = "This is a short test string that should be affected by budget ratio.";
1651
1652 let compressor = SemanticCompressor::with_config(SemanticConfig {
1653 budget_ratio: 0.3, min_chunk_size: 100,
1655 max_chunk_size: 2000,
1656 ..Default::default()
1657 });
1658
1659 let result = compressor.compress(content).unwrap();
1660
1661 assert!(
1663 result.len() < content.len() || result.contains("truncated"),
1664 "Small content with budget_ratio=0.3 should be compressed. Original: {}, Result: {}",
1665 content.len(),
1666 result.len()
1667 );
1668 }
1669
1670 #[test]
1672 fn test_budget_ratio_one_preserves_small_content() {
1673 let content = "Short content that should remain unchanged with budget_ratio=1.0";
1674
1675 let compressor = SemanticCompressor::with_config(SemanticConfig {
1676 budget_ratio: 1.0,
1677 min_chunk_size: 100,
1678 max_chunk_size: 2000,
1679 ..Default::default()
1680 });
1681
1682 let result = compressor.compress(content).unwrap();
1683
1684 assert_eq!(result, content, "budget_ratio=1.0 should preserve content");
1686 }
1687
1688 #[test]
1690 fn test_very_short_content_unchanged() {
1691 let content = "tiny";
1692
1693 let compressor = SemanticCompressor::with_config(SemanticConfig {
1694 budget_ratio: 0.1, ..Default::default()
1696 });
1697
1698 let result = compressor.compress(content).unwrap();
1699
1700 assert_eq!(result, content, "Very short content should be unchanged");
1702 }
1703
1704 #[test]
1706 fn test_budget_ratio_medium_no_chunks() {
1707 let content = "This is a medium length test content that has no paragraph breaks and should trigger the budget ratio truncation path because there are no chunk boundaries.";
1709
1710 let compressor = SemanticCompressor::with_config(SemanticConfig {
1711 budget_ratio: 0.5,
1712 min_chunk_size: 200, max_chunk_size: 2000,
1714 ..Default::default()
1715 });
1716
1717 let result = compressor.compress(content).unwrap();
1718
1719 assert!(
1721 result.len() < content.len(),
1722 "Medium content with budget_ratio=0.5 should be compressed. Original: {}, Result: {}",
1723 content.len(),
1724 result.len()
1725 );
1726 }
1727
1728 #[test]
1730 fn test_truncation_marker_format() {
1731 let content = "A sufficiently long piece of content that will definitely be truncated when we set a low budget ratio.";
1732
1733 let compressor = SemanticCompressor::with_config(SemanticConfig {
1734 budget_ratio: 0.3,
1735 min_chunk_size: 200,
1736 max_chunk_size: 2000,
1737 ..Default::default()
1738 });
1739
1740 let result = compressor.compress(content).unwrap();
1741
1742 if result.contains("truncated") {
1744 assert!(result.contains('%'), "Truncation marker should include percentage");
1745 assert!(result.contains("chars"), "Truncation marker should include char count");
1746 }
1747 }
1748
1749 #[test]
1751 fn test_budget_ratio_proportional() {
1752 let content = "This content is long enough to test different budget ratio values and see that they produce outputs of proportionally different sizes as expected.";
1753
1754 let compressor_20 = SemanticCompressor::with_config(SemanticConfig {
1755 budget_ratio: 0.2,
1756 min_chunk_size: 200,
1757 ..Default::default()
1758 });
1759
1760 let compressor_50 = SemanticCompressor::with_config(SemanticConfig {
1761 budget_ratio: 0.5,
1762 min_chunk_size: 200,
1763 ..Default::default()
1764 });
1765
1766 let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
1767 budget_ratio: 0.8,
1768 min_chunk_size: 200,
1769 ..Default::default()
1770 });
1771
1772 let result_20 = compressor_20.compress(content).unwrap();
1773 let result_50 = compressor_50.compress(content).unwrap();
1774 let result_80 = compressor_80.compress(content).unwrap();
1775
1776 assert!(
1778 result_20.len() <= result_50.len(),
1779 "20% ratio ({}) should be <= 50% ratio ({})",
1780 result_20.len(),
1781 result_50.len()
1782 );
1783 assert!(
1784 result_50.len() <= result_80.len(),
1785 "50% ratio ({}) should be <= 80% ratio ({})",
1786 result_50.len(),
1787 result_80.len()
1788 );
1789 }
1790}