#[cfg(feature = "embeddings")]
use std::collections::HashMap;

pub type Result<T> = std::result::Result<T, SemanticError>;

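/// Errors that can occur during semantic analysis and compression.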
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}

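/// Text analyzer that produces fixed-size embeddings.
///
/// With the `embeddings` feature enabled, `embed` hashes character
/// frequencies into a 384-dimensional, L2-normalized vector; without it,
/// the analyzer returns zero vectors and zero similarities.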
#[derive(Debug)]
pub struct SemanticAnalyzer {
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}

impl SemanticAnalyzer {
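    /// Creates an analyzer with no model path configured.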
    pub fn new() -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: None,
            #[cfg(not(feature = "embeddings"))]
            _model_path: None,
        }
    }

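    /// Creates an analyzer that records a model path.
    ///
    /// The path is only stored, never loaded; it is exposed via
    /// `model_path` when the `embeddings` feature is enabled.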
    pub fn with_model(model_path: &str) -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: Some(model_path.to_owned()),
            #[cfg(not(feature = "embeddings"))]
            _model_path: Some(model_path.to_owned()),
        }
    }

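    /// Returns the configured model path, if any.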
    #[cfg(feature = "embeddings")]
    pub fn model_path(&self) -> Option<&str> {
        self.model_path.as_deref()
    }

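    /// Generates a 384-dimensional embedding for `content`.
    ///
    /// This is a lightweight stand-in for a real model: each character is
    /// hashed into one of 384 buckets with a positional weight of
    /// `1 / (i + 1)`, and the result is L2-normalized.
    ///
    /// A sketch of the behavior (`ignore`d because the crate path for these
    /// items is not shown here):
    ///
    /// ```ignore
    /// let analyzer = SemanticAnalyzer::new();
    /// let v = analyzer.embed("hello").unwrap();
    /// assert_eq!(v.len(), 384);
    /// ```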
    #[cfg(feature = "embeddings")]
    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
        let mut embedding = vec![0.0f32; 384];
        // Hash each character into a bucket, weighted by position.
        for (i, c) in content.chars().enumerate() {
            let idx = (c as usize) % 384;
            embedding[idx] += 1.0 / ((i + 1) as f32);
        }
        // L2-normalize so cosine similarity is well-behaved.
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in &mut embedding {
                *x /= norm;
            }
        }
        Ok(embedding)
    }

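    /// Fallback used when the `embeddings` feature is disabled: returns an
    /// all-zero 384-dimensional vector.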
    #[cfg(not(feature = "embeddings"))]
    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
        Ok(vec![0.0; 384])
    }

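    /// Cosine similarity between the embeddings of `a` and `b`, in `[-1, 1]`.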
    #[cfg(feature = "embeddings")]
    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
        let emb_a = self.embed(a)?;
        let emb_b = self.embed(b)?;
        Ok(cosine_similarity(&emb_a, &emb_b))
    }

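    /// Fallback used when the `embeddings` feature is disabled: always `0.0`.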
    #[cfg(not(feature = "embeddings"))]
    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
        Ok(0.0)
    }
}

impl Default for SemanticAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

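/// Configuration for [`SemanticCompressor`].
///
/// `similarity_threshold` is the minimum cosine similarity for two chunks to
/// share a cluster, `min_chunk_size` and `max_chunk_size` bound chunk length
/// in bytes, and `budget_ratio` is the fraction of the original content to
/// keep (`0.0..=1.0`).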
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    pub similarity_threshold: f32,
    pub min_chunk_size: usize,
    pub max_chunk_size: usize,
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}

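/// A contiguous slice of the input, with optional embedding and cluster
/// assignment filled in during compression.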
#[derive(Debug, Clone)]
pub struct CodeChunk {
    pub content: String,
    pub start: usize,
    pub end: usize,
    pub embedding: Option<Vec<f32>>,
    pub cluster_id: Option<usize>,
}

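/// Content compressor that detects repetition first and falls back to
/// chunk-based reduction driven by [`SemanticConfig::budget_ratio`].
///
/// Illustrative usage (marked `ignore` because the crate root path for these
/// items is not shown here):
///
/// ```ignore
/// let compressor = SemanticCompressor::new();
/// let compressed = compressor.compress(&"same line\n".repeat(100)).unwrap();
/// assert!(compressed.contains("repeated"));
/// ```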
pub struct SemanticCompressor {
    config: SemanticConfig,
    analyzer: SemanticAnalyzer,
}

impl SemanticCompressor {
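    /// Creates a compressor with [`SemanticConfig::default`].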
    pub fn new() -> Self {
        Self::with_config(SemanticConfig::default())
    }

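    /// Creates a compressor with the given configuration.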
    pub fn with_config(config: SemanticConfig) -> Self {
        Self { config, analyzer: SemanticAnalyzer::new() }
    }

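    /// Returns the analyzer used for embeddings and similarity.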
    pub fn analyzer(&self) -> &SemanticAnalyzer {
        &self.analyzer
    }

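    /// Compresses `content`, trying strategies in order: repetition
    /// detection first (`compress_repetitive`); then, with the `embeddings`
    /// feature, clustering of similar chunks; otherwise heuristic sampling
    /// of chunks to meet `budget_ratio`.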
    pub fn compress(&self, content: &str) -> Result<String> {
        if let Some(compressed) = self.compress_repetitive(content) {
            return Ok(compressed);
        }

        #[cfg(feature = "embeddings")]
        {
            return self.compress_with_embeddings(content);
        }

        #[cfg(not(feature = "embeddings"))]
        {
            self.compress_heuristic(content)
        }
    }

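    /// Detects content dominated by repetition and collapses it.
    ///
    /// Two passes: first, look for a prefix pattern (up to 100 bytes) that
    /// repeats from the start and covers at least 80% of the content;
    /// second, look for a single line that accounts for at least half of all
    /// lines. Returns `None` when neither pass finds enough repetition or
    /// the content is under 200 bytes.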
    fn compress_repetitive(&self, content: &str) -> Option<String> {
        if content.len() < 200 {
            return None;
        }

        // Pass 1: a short prefix pattern that repeats from the start.
        for pattern_len in 1..=(content.len() / 3).min(100) {
            if !content.is_char_boundary(pattern_len) {
                continue;
            }

            let pattern = &content[..pattern_len];

            if pattern.chars().all(|c| c.is_whitespace()) {
                continue;
            }

            // Count consecutive occurrences of the pattern from the start.
            let mut count = 0;
            let mut pos = 0;
            while pos + pattern_len <= content.len() {
                if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
                    break;
                }
                if &content[pos..pos + pattern_len] == pattern {
                    count += 1;
                    pos += pattern_len;
                } else {
                    break;
                }
            }

            let coverage = (count * pattern_len) as f32 / content.len() as f32;
            if count >= 3 && coverage >= 0.8 {
                let instances_to_show = (count as f32 * self.config.budget_ratio)
                    .ceil()
                    .clamp(1.0, 5.0) as usize;

                let shown_content = pattern.repeat(instances_to_show);
                let remainder_start = count * pattern_len;
                let remainder = if remainder_start <= content.len()
                    && content.is_char_boundary(remainder_start)
                {
                    &content[remainder_start..]
                } else {
                    ""
                };

                let result = if remainder.is_empty() {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
                        shown_content.trim_end(),
                        count,
                        instances_to_show
                    )
                } else {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
                        shown_content.trim_end(),
                        count,
                        instances_to_show,
                        remainder.trim()
                    )
                };

                return Some(result);
            }
        }

        // Pass 2: a single line that dominates the content.
        let lines: Vec<&str> = content.lines().collect();
        if lines.len() >= 3 {
            let mut line_counts: std::collections::HashMap<&str, usize> =
                std::collections::HashMap::new();
            for line in &lines {
                *line_counts.entry(*line).or_insert(0) += 1;
            }

            if let Some((repeated_line, count)) = line_counts
                .iter()
                .filter(|(line, _)| !line.trim().is_empty())
                .max_by_key(|(_, count)| *count)
            {
                let repetition_ratio = *count as f32 / lines.len() as f32;
                if *count >= 3 && repetition_ratio >= 0.5 {
                    // Rebuild the content, collapsing consecutive runs of the
                    // repeated line into a single instance plus a marker.
                    let mut result = String::new();
                    let mut consecutive_count = 0;
                    let mut last_was_repeated = false;

                    for line in &lines {
                        if *line == *repeated_line {
                            consecutive_count += 1;
                            if !last_was_repeated {
                                if !result.is_empty() {
                                    result.push('\n');
                                }
                                result.push_str(line);
                            }
                            last_was_repeated = true;
                        } else {
                            if last_was_repeated && consecutive_count > 1 {
                                result.push_str(&format!(
                                    "\n/* ... above line repeated {} times ... */",
                                    consecutive_count
                                ));
                            }
                            consecutive_count = 0;
                            last_was_repeated = false;
                            if !result.is_empty() {
                                result.push('\n');
                            }
                            result.push_str(line);
                        }
                    }

                    if last_was_repeated && consecutive_count > 1 {
                        result.push_str(&format!(
                            "\n/* ... above line repeated {} times ... */",
                            consecutive_count
                        ));
                    }

                    // Only worthwhile if it actually halves the content.
                    if result.len() < content.len() / 2 {
                        return Some(result);
                    }
                }
            }
        }

        None
    }

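    /// Splits `content` into chunks of `min_chunk_size..=max_chunk_size`
    /// bytes, trying boundaries in order of preference: blank lines, then
    /// single newlines, then sentence ends (`". "`), and finally fixed-size
    /// windows when nothing else produced chunks.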
    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
        let mut chunks = Vec::new();
        let mut current_start = 0;

        // Preferred: split on blank lines (paragraph boundaries).
        for (i, _) in content.match_indices("\n\n") {
            if i > current_start && i - current_start >= self.config.min_chunk_size {
                let chunk_content = &content[current_start..i];
                if chunk_content.len() <= self.config.max_chunk_size {
                    chunks.push(CodeChunk {
                        content: chunk_content.to_owned(),
                        start: current_start,
                        end: i,
                        embedding: None,
                        cluster_id: None,
                    });
                }
                current_start = i + 2;
            }
        }

        if current_start < content.len() {
            let remaining = &content[current_start..];
            if remaining.len() >= self.config.min_chunk_size {
                chunks.push(CodeChunk {
                    content: remaining.to_owned(),
                    start: current_start,
                    end: content.len(),
                    embedding: None,
                    cluster_id: None,
                });
            }
        }

        // Fallback: split on single newlines.
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices('\n') {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 1;
                }
            }
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Fallback: split on sentence boundaries.
        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices(". ") {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..=i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i + 1,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 2;
                }
            }
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        // Last resort: fixed-size windows for content that produced no chunks
        // but exceeds the maximum chunk size.
        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
            let mut pos = 0;
            while pos < content.len() {
                // Back the window end off to a char boundary so the slice
                // below cannot panic on multi-byte characters.
                let mut end = (pos + self.config.max_chunk_size).min(content.len());
                while end > pos && !content.is_char_boundary(end) {
                    end -= 1;
                }
                if end == pos {
                    // max_chunk_size is smaller than the character at `pos`;
                    // step forward to the next boundary instead.
                    end = ((pos + 1)..=content.len())
                        .find(|&i| content.is_char_boundary(i))
                        .unwrap_or(content.len());
                }
                chunks.push(CodeChunk {
                    content: content[pos..end].to_owned(),
                    start: pos,
                    end,
                    embedding: None,
                    cluster_id: None,
                });
                pos = end;
            }
        }

        chunks
    }

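    /// Budget-based fallback compression without embeddings: keeps every
    /// n-th chunk so that roughly `budget_ratio` of the chunks survive, and
    /// appends a marker noting how many were dropped. Unchunkable content is
    /// truncated at a safe boundary instead.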
    fn compress_heuristic(&self, content: &str) -> Result<String> {
        let chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            // No chunk boundaries found: truncate directly if the content is
            // large enough and the budget demands a reduction.
            if content.len() > self.config.min_chunk_size && self.config.budget_ratio < 1.0 {
                let target_len = (content.len() as f32 * self.config.budget_ratio) as usize;
                if target_len > 0 && target_len < content.len() {
                    let truncate_at = find_safe_truncation_point(content, target_len);
                    if truncate_at < content.len() {
                        let truncated = &content[..truncate_at];
                        return Ok(format!(
                            "{}\n/* ... truncated to {:.0}% ({} of {} chars) ... */",
                            truncated.trim_end(),
                            self.config.budget_ratio * 100.0,
                            truncate_at,
                            content.len()
                        ));
                    }
                }
            }
            return Ok(content.to_owned());
        }

        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
        let step = chunks.len() / target_chunks.max(1);

        // Sample every `step`-th chunk until the budget is met.
        let mut result = String::new();
        let mut kept = 0;

        for (i, chunk) in chunks.iter().enumerate() {
            if i % step.max(1) == 0 && kept < target_chunks {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&chunk.content);
                kept += 1;
            }
        }

        if kept < chunks.len() {
            result.push_str(&format!(
                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
                chunks.len() - kept,
                (kept as f32 / chunks.len() as f32) * 100.0
            ));
        }

        Ok(result)
    }

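    /// Embedding-based compression: embeds every chunk, clusters chunks by
    /// cosine similarity, and keeps one representative per cluster.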
    #[cfg(feature = "embeddings")]
    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
        let mut chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        for chunk in &mut chunks {
            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
        }

        let clusters = self.cluster_chunks(&chunks)?;

        let mut result = String::new();
        for cluster in clusters.values() {
            if let Some(representative) = self.select_representative(cluster) {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&representative.content);
            }
        }

        Ok(result)
    }

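    /// Greedy single-pass clustering: each chunk joins the first cluster
    /// whose founding member is at least `similarity_threshold` similar,
    /// otherwise it starts a new cluster.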
    #[cfg(feature = "embeddings")]
    fn cluster_chunks<'a>(
        &self,
        chunks: &'a [CodeChunk],
    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
        let mut next_cluster = 0;

        for chunk in chunks {
            let embedding = chunk
                .embedding
                .as_ref()
                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;

            // Compare against the first chunk of each existing cluster.
            let mut target_cluster = None;
            for (&cluster_id, cluster_chunks) in &clusters {
                if let Some(first) = cluster_chunks.first() {
                    if let Some(ref first_emb) = first.embedding {
                        let similarity = cosine_similarity(embedding, first_emb);
                        if similarity >= self.config.similarity_threshold {
                            target_cluster = Some(cluster_id);
                            break;
                        }
                    }
                }
            }

            if let Some(cluster_id) = target_cluster {
                if let Some(cluster) = clusters.get_mut(&cluster_id) {
                    cluster.push(chunk);
                }
            } else {
                clusters.insert(next_cluster, vec![chunk]);
                next_cluster += 1;
            }
        }

        Ok(clusters)
    }

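    /// Picks the longest chunk in a cluster as its representative.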
    #[cfg(feature = "embeddings")]
    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
        chunks.iter().max_by_key(|c| c.content.len()).copied()
    }
}

impl Default for SemanticCompressor {
    fn default() -> Self {
        Self::new()
    }
}

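/// Alias for [`SemanticAnalyzer`]; the name reflects its character-frequency
/// embedding.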
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

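/// Alias for [`SemanticCompressor`]; the name reflects its heuristic,
/// non-model fallbacks.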
pub type HeuristicCompressor = SemanticCompressor;

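/// Alias for [`SemanticConfig`].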
pub type HeuristicCompressionConfig = SemanticConfig;

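/// Finds a truncation point at or before `target_len` that falls on a char
/// boundary, preferring the last newline and then the last space in the
/// truncated region (as long as it lies past `target_len / 2`).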
fn find_safe_truncation_point(content: &str, target_len: usize) -> usize {
    if target_len >= content.len() {
        return content.len();
    }

    // Back off to a valid char boundary.
    let mut truncate_at = target_len;
    while truncate_at > 0 && !content.is_char_boundary(truncate_at) {
        truncate_at -= 1;
    }

    // Prefer breaking at a newline, then at a space, if one exists in the
    // second half of the target region.
    if let Some(newline_pos) = content[..truncate_at].rfind('\n') {
        if newline_pos > target_len / 2 {
            return newline_pos;
        }
    }

    if let Some(space_pos) = content[..truncate_at].rfind(' ') {
        if space_pos > target_len / 2 {
            return space_pos;
        }
    }

    truncate_at
}

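/// Cosine similarity, `dot(a, b) / (norm(a) * norm(b))`. Returns `0.0` for
/// empty, mismatched-length, or zero-norm inputs.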
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        // Without the embeddings feature there is no accessor to check.
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::new();
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content);
    }

    #[test]
    fn test_repetitive_with_variation() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5));
        }

        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_chinese() {
        let compressor = SemanticCompressor::new();
        let content = "中文测试 ".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(std::str::from_utf8(result.as_bytes()).is_ok());

        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_emoji() {
        let compressor = SemanticCompressor::new();
        let content = "🚀🚀🚀 ".repeat(80);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_mixed() {
        let compressor = SemanticCompressor::new();
        let content = "a中🚀 ".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_cyrillic() {
        let compressor = SemanticCompressor::new();
        let content = "Привет ".repeat(50);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_non_repetitive_unicode_boundary() {
        let compressor = SemanticCompressor::new();
        let content = "世界和平".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_repetitive_unicode_line_based() {
        let compressor = SemanticCompressor::new();
        let content = "中文行\n".repeat(100);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_semantic_error_display() {
        let err1 = SemanticError::ModelLoadError("test error".to_string());
        assert!(err1.to_string().contains("Model loading failed"));
        assert!(err1.to_string().contains("test error"));

        let err2 = SemanticError::EmbeddingError("embed fail".to_string());
        assert!(err2.to_string().contains("Embedding generation failed"));

        let err3 = SemanticError::ClusteringError("cluster fail".to_string());
        assert!(err3.to_string().contains("Clustering failed"));

        let err4 = SemanticError::FeatureNotEnabled;
        assert!(err4.to_string().contains("embeddings feature not enabled"));
    }

    #[test]
    fn test_semantic_error_debug() {
        let err = SemanticError::ModelLoadError("debug test".to_string());
        let debug_str = format!("{:?}", err);
        assert!(debug_str.contains("ModelLoadError"));
    }

    #[test]
    fn test_semantic_analyzer_default() {
        let analyzer = SemanticAnalyzer::default();
        let result = analyzer.embed("test");
        assert!(result.is_ok());
    }

    #[test]
    fn test_semantic_analyzer_debug() {
        let analyzer = SemanticAnalyzer::new();
        let debug_str = format!("{:?}", analyzer);
        assert!(debug_str.contains("SemanticAnalyzer"));
    }

    #[test]
    fn test_semantic_analyzer_embed_empty() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_embed_produces_384_dims() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.embed("some code content").unwrap();
        assert_eq!(result.len(), 384);
    }

    #[test]
    fn test_semantic_analyzer_similarity_same_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello world", "hello world").unwrap();
        #[cfg(feature = "embeddings")]
        assert!((result - 1.0).abs() < 0.01);
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
    }

    #[test]
    fn test_semantic_analyzer_similarity_different_content() {
        let analyzer = SemanticAnalyzer::new();
        let result = analyzer.similarity("hello", "goodbye").unwrap();
        #[cfg(not(feature = "embeddings"))]
        assert_eq!(result, 0.0);
        #[cfg(feature = "embeddings")]
        assert!((-1.0..=1.0).contains(&result));
    }

    #[test]
    fn test_semantic_config_custom() {
        let config = SemanticConfig {
            similarity_threshold: 0.9,
            min_chunk_size: 50,
            max_chunk_size: 5000,
            budget_ratio: 0.3,
        };
        assert_eq!(config.similarity_threshold, 0.9);
        assert_eq!(config.min_chunk_size, 50);
        assert_eq!(config.max_chunk_size, 5000);
        assert_eq!(config.budget_ratio, 0.3);
    }

    #[test]
    fn test_semantic_config_clone() {
        let config = SemanticConfig::default();
        let cloned = config.clone();
        assert_eq!(cloned.similarity_threshold, config.similarity_threshold);
        assert_eq!(cloned.budget_ratio, config.budget_ratio);
    }

    #[test]
    fn test_semantic_config_debug() {
        let config = SemanticConfig::default();
        let debug_str = format!("{:?}", config);
        assert!(debug_str.contains("SemanticConfig"));
        assert!(debug_str.contains("similarity_threshold"));
    }

    #[test]
    fn test_code_chunk_debug() {
        let chunk = CodeChunk {
            content: "test content".to_string(),
            start: 0,
            end: 12,
            embedding: None,
            cluster_id: None,
        };
        let debug_str = format!("{:?}", chunk);
        assert!(debug_str.contains("CodeChunk"));
        assert!(debug_str.contains("test content"));
    }

    #[test]
    fn test_code_chunk_clone() {
        let chunk = CodeChunk {
            content: "original".to_string(),
            start: 0,
            end: 8,
            embedding: Some(vec![0.1, 0.2, 0.3]),
            cluster_id: Some(5),
        };
        let cloned = chunk.clone();
        assert_eq!(cloned.content, "original");
        assert_eq!(cloned.start, 0);
        assert_eq!(cloned.end, 8);
        assert_eq!(cloned.embedding, Some(vec![0.1, 0.2, 0.3]));
        assert_eq!(cloned.cluster_id, Some(5));
    }

    #[test]
    fn test_semantic_compressor_default() {
        let compressor = SemanticCompressor::default();
        let result = compressor.compress("test").unwrap();
        assert_eq!(result, "test");
    }

    #[test]
    fn test_split_into_chunks_single_newline_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "Line 1 with content\nLine 2 with content\nLine 3 with content";
        let chunks = compressor.split_into_chunks(content);
        assert!(!chunks.is_empty() || content.len() < 5);
    }

    #[test]
    fn test_split_into_chunks_sentence_fallback() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First sentence here. Second sentence here. Third sentence here.";
        let chunks = compressor.split_into_chunks(content);
        assert!(!chunks.is_empty() || content.len() < 10);
    }

    #[test]
    fn test_split_into_chunks_force_split() {
        // Content shorter than min_chunk_size and with no boundaries leaves
        // only the fixed-size window fallback.
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100,
            max_chunk_size: 20,
            ..Default::default()
        });

        let content = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
        let chunks = compressor.split_into_chunks(content);
        assert!(
            chunks.len() >= 2,
            "Expected at least 2 chunks from force split, got {}",
            chunks.len()
        );
    }

    #[test]
    fn test_split_into_chunks_empty() {
        let compressor = SemanticCompressor::new();
        let chunks = compressor.split_into_chunks("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_split_into_chunks_below_min_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 100,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "short";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_compress_heuristic_empty_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 1000,
            ..Default::default()
        });

        let content = "short content";
        let result = compressor.compress_heuristic(content).unwrap();
        assert_eq!(result, content);
    }

    #[test]
    fn test_compress_heuristic_multiple_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 100,
            budget_ratio: 0.3,
            ..Default::default()
        });

        let content = "First chunk content here\n\nSecond chunk content here\n\nThird chunk content here\n\nFourth chunk content";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(result.contains("chunk") || result.contains("compressed"));
    }

    #[test]
    fn test_cosine_similarity_different_lengths() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.0, 2.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn test_cosine_similarity_zero_vectors() {
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![1.0, 2.0, 3.0];
        let sim = cosine_similarity(&a, &b);
        assert_eq!(sim, 0.0);
    }

    #[test]
    fn test_cosine_similarity_opposite() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![-1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim + 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_normalized() {
        let a = vec![0.6, 0.8, 0.0];
        let b = vec![0.6, 0.8, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_compress_repetitive_short_content() {
        let compressor = SemanticCompressor::new();
        // Under the 200-byte minimum, so no compression is attempted.
        let content = "short ".repeat(10);
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_whitespace_only() {
        let compressor = SemanticCompressor::new();
        let content = " ".repeat(100);
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_low_coverage() {
        let compressor = SemanticCompressor::new();
        let mut content = "pattern ".repeat(5);
        content.push_str(&"x".repeat(200));
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_line_low_ratio() {
        let compressor = SemanticCompressor::new();
        let content = (0..20).map(|i| format!("unique line {}", i)).collect::<Vec<_>>().join("\n");
        let result = compressor.compress_repetitive(&content);
        assert!(result.is_none());
    }

    #[test]
    fn test_compress_repetitive_mixed_with_unique() {
        let compressor = SemanticCompressor::new();
        let mut lines = vec![];
        for i in 0..50 {
            if i % 2 == 0 {
                lines.push("repeated line");
            } else {
                lines.push("unique line");
            }
        }
        let content = lines.join("\n");
        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_compress_no_repetition_returns_none() {
        let compressor = SemanticCompressor::new();
        let content = "The quick brown fox jumps over the lazy dog. ".repeat(5);
        let result = compressor.compress_repetitive(&content);
        // Whether the repeating sentence trips the pattern detector depends
        // on the coverage thresholds; this test only ensures it doesn't panic.
        drop(result);
    }

    #[test]
    fn test_type_aliases() {
        let _analyzer: CharacterFrequencyAnalyzer = SemanticAnalyzer::new();
        let _compressor: HeuristicCompressor = SemanticCompressor::new();
        let _config: HeuristicCompressionConfig = SemanticConfig::default();
    }

    #[test]
    fn test_compress_preserves_content_structure() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 500,
            budget_ratio: 1.0,
            ..Default::default()
        });

        let content = "def foo():\n pass\n\ndef bar():\n pass";
        let result = compressor.compress(content).unwrap();
        assert!(result.contains("foo") || result.contains("bar"));
    }

    #[test]
    fn test_split_chunks_respects_max_size() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 50,
            ..Default::default()
        });

        let content = "A very long chunk that exceeds the max size limit\n\nAnother chunk";
        let chunks = compressor.split_into_chunks(content);

        for chunk in &chunks {
            assert!(
                chunk.content.len() <= 50,
                "Chunk size {} exceeds max 50",
                chunk.content.len()
            );
        }
    }

    #[test]
    fn test_compress_repetitive_with_remainder() {
        let compressor = SemanticCompressor::new();
        let mut content = "abc ".repeat(100);
        content.push_str("xyz");
        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    #[test]
    fn test_compressor_analyzer_method() {
        let compressor = SemanticCompressor::new();
        let analyzer = compressor.analyzer();

        let embed_result = analyzer.embed("test code");
        assert!(embed_result.is_ok());
    }

    #[test]
    fn test_code_chunk_with_embedding_and_cluster() {
        let chunk = CodeChunk {
            content: "fn main() {}".to_string(),
            start: 0,
            end: 12,
            embedding: Some(vec![0.5; 384]),
            cluster_id: Some(3),
        };

        assert_eq!(chunk.content, "fn main() {}");
        assert_eq!(chunk.start, 0);
        assert_eq!(chunk.end, 12);
        assert!(chunk.embedding.is_some());
        assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384);
        assert_eq!(chunk.cluster_id, Some(3));
    }

    #[test]
    fn test_compress_very_long_repetitive() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.2,
            ..Default::default()
        });

        let content = "repeated_token ".repeat(1000);
        let result = compressor.compress(&content).unwrap();

        assert!(result.len() < content.len() / 3);
        assert!(result.contains("repeated"));
    }

    #[test]
    fn test_semantic_result_type_ok() {
        let result: Result<String> = Ok("success".to_string());
        assert!(result.is_ok());
        assert_eq!(result.unwrap(), "success");
    }

    #[test]
    fn test_semantic_result_type_err() {
        let result: Result<String> = Err(SemanticError::FeatureNotEnabled);
        assert!(result.is_err());
    }

    #[test]
    fn test_find_safe_truncation_point_basic() {
        let content = "Hello world this is a test";
        let point = find_safe_truncation_point(content, 15);
        assert!(content.is_char_boundary(point));
        assert!(point <= 15 || point == content.len());
    }

    #[test]
    fn test_find_safe_truncation_point_newline() {
        let content = "Line one\nLine two\nLine three";
        let point = find_safe_truncation_point(content, 20);
        assert!(content.is_char_boundary(point));
    }

    #[test]
    fn test_find_safe_truncation_point_unicode() {
        let content = "Hello 世界 test";
        let point = find_safe_truncation_point(content, 10);
        assert!(content.is_char_boundary(point));
    }

    #[test]
    fn test_find_safe_truncation_point_beyond_length() {
        let content = "short";
        let point = find_safe_truncation_point(content, 100);
        assert_eq!(point, content.len());
    }

    #[test]
    fn test_budget_ratio_affects_large_content() {
        let content = (0..20)
            .map(|i| format!("This is paragraph number {} with some content to fill it out nicely.", i))
            .collect::<Vec<_>>()
            .join("\n\n");

        let compressor_30 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            min_chunk_size: 20,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let compressor_80 = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.8,
            min_chunk_size: 20,
            max_chunk_size: 2000,
            ..Default::default()
        });

        let result_30 = compressor_30.compress(&content).unwrap();
        let result_80 = compressor_80.compress(&content).unwrap();

        assert!(
            result_30.len() < result_80.len(),
            "30% budget ({}) should be smaller than 80% budget ({})",
            result_30.len(),
            result_80.len()
        );

        assert!(
            result_30.contains("compressed") || result_30.len() < content.len(),
            "30% should show compression indicator"
        );
    }

    #[test]
    fn test_budget_ratio_one_returns_original() {
        let content = "Some content without chunk boundaries";

        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 1.0,
            ..Default::default()
        });

        let result = compressor.compress(content).unwrap();
        assert_eq!(result, content);
    }
}