1#[cfg(feature = "embeddings")]
33use std::collections::HashMap;
34
/// Convenience alias for results whose error type is [`SemanticError`].
pub type Result<T> = std::result::Result<T, SemanticError>;
37
/// Errors reported by the analysis and compression routines in this module.
///
/// The `String` payloads are interpolated into the display message after the
/// variant's fixed prefix.
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    /// A model could not be loaded.
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    /// An embedding could not be produced for some input.
    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    /// Grouping chunks into clusters failed (for example, a chunk had no embedding).
    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    /// An operation required the `embeddings` cargo feature, which is not enabled.
    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}
53
54#[derive(Debug)]
64pub struct SemanticAnalyzer {
65 #[cfg(feature = "embeddings")]
67 model_path: Option<String>,
68 #[cfg(not(feature = "embeddings"))]
70 _model_path: Option<String>,
71}
72
73impl SemanticAnalyzer {
74 pub fn new() -> Self {
76 Self {
77 #[cfg(feature = "embeddings")]
78 model_path: None,
79 #[cfg(not(feature = "embeddings"))]
80 _model_path: None,
81 }
82 }
83
84 pub fn with_model(model_path: &str) -> Self {
89 Self {
90 #[cfg(feature = "embeddings")]
91 model_path: Some(model_path.to_owned()),
92 #[cfg(not(feature = "embeddings"))]
93 _model_path: Some(model_path.to_owned()),
94 }
95 }
96
97 #[cfg(feature = "embeddings")]
99 pub fn model_path(&self) -> Option<&str> {
100 self.model_path.as_deref()
101 }
102
103 #[cfg(feature = "embeddings")]
121 pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122 let mut embedding = vec![0.0f32; 384];
124 for (i, c) in content.chars().enumerate() {
125 let idx = (c as usize) % 384;
126 embedding[idx] += 1.0 / ((i + 1) as f32);
128 }
129 let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131 if norm > 0.0 {
132 for x in &mut embedding {
133 *x /= norm;
134 }
135 }
136 Ok(embedding)
137 }
138
139 #[cfg(not(feature = "embeddings"))]
141 pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142 Ok(vec![0.0; 384])
143 }
144
145 #[cfg(feature = "embeddings")]
147 pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148 let emb_a = self.embed(a)?;
149 let emb_b = self.embed(b)?;
150 Ok(cosine_similarity(&emb_a, &emb_b))
151 }
152
153 #[cfg(not(feature = "embeddings"))]
155 pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156 Ok(0.0)
157 }
158}
159
160impl Default for SemanticAnalyzer {
161 fn default() -> Self {
162 Self::new()
163 }
164}
165
/// Tuning parameters for [`SemanticCompressor`].
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Minimum cosine similarity for a chunk to join an existing cluster.
    pub similarity_threshold: f32,
    /// Minimum size, in bytes, a piece of text must have to become a chunk.
    pub min_chunk_size: usize,
    /// Maximum chunk size in bytes: delimiter-separated pieces larger than this
    /// are discarded, and it is the width used by the fixed-size fallback split.
    pub max_chunk_size: usize,
    /// Fraction of material to keep: the share of chunks retained by the
    /// heuristic path and the share of pattern repetitions shown.
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    /// Defaults: threshold `0.7`, chunks of 100 to 2000 bytes, keep half (`0.5`).
    fn default() -> Self {
        SemanticConfig {
            budget_ratio: 0.5,
            max_chunk_size: 2000,
            min_chunk_size: 100,
            similarity_threshold: 0.7,
        }
    }
}
193
/// A contiguous piece of the input text produced while splitting content.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Owned copy of the chunk's text.
    pub content: String,
    /// Byte offset in the original content where the chunk begins.
    pub start: usize,
    /// Byte offset in the original content one past the chunk's last byte.
    pub end: usize,
    /// Embedding vector for `content`; `None` until one has been computed.
    pub embedding: Option<Vec<f32>>,
    /// Cluster identifier; the splitting code visible in this file always leaves it `None`.
    pub cluster_id: Option<usize>,
}
208
209pub struct SemanticCompressor {
214 config: SemanticConfig,
215 analyzer: SemanticAnalyzer,
217}
218
219impl SemanticCompressor {
220 pub fn new() -> Self {
222 Self::with_config(SemanticConfig::default())
223 }
224
225 pub fn with_config(config: SemanticConfig) -> Self {
227 Self { config, analyzer: SemanticAnalyzer::new() }
228 }
229
230 pub fn analyzer(&self) -> &SemanticAnalyzer {
235 &self.analyzer
236 }
237
238 pub fn compress(&self, content: &str) -> Result<String> {
245 if let Some(compressed) = self.compress_repetitive(content) {
247 return Ok(compressed);
248 }
249
250 #[cfg(feature = "embeddings")]
251 {
252 return self.compress_with_embeddings(content);
253 }
254
255 #[cfg(not(feature = "embeddings"))]
256 {
257 self.compress_heuristic(content)
258 }
259 }
260
261 fn compress_repetitive(&self, content: &str) -> Option<String> {
268 if content.len() < 200 {
270 return None;
271 }
272
273 for pattern_len in 1..=100.min(content.len() / 3) {
277 if !content.is_char_boundary(pattern_len) {
279 continue;
280 }
281
282 let pattern = &content[..pattern_len];
283
284 if pattern.chars().all(|c| c.is_whitespace()) {
286 continue;
287 }
288
289 let mut count = 0;
291 let mut pos = 0;
292 while pos + pattern_len <= content.len() {
293 if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
295 break;
296 }
297 if &content[pos..pos + pattern_len] == pattern {
298 count += 1;
299 pos += pattern_len;
300 } else {
301 break;
302 }
303 }
304
305 let coverage = (count * pattern_len) as f32 / content.len() as f32;
307 if count >= 3 && coverage >= 0.8 {
308 let instances_to_show = (count as f32 * self.config.budget_ratio)
310 .ceil()
311 .clamp(1.0, 5.0) as usize;
312
313 let shown_content = pattern.repeat(instances_to_show);
314 let remainder_start = count * pattern_len;
316 let remainder = if remainder_start <= content.len()
317 && content.is_char_boundary(remainder_start)
318 {
319 &content[remainder_start..]
320 } else {
321 ""
322 };
323
324 let result = if remainder.is_empty() {
325 format!(
326 "{}\n/* ... pattern repeated {} times (showing {}) ... */",
327 shown_content.trim_end(),
328 count,
329 instances_to_show
330 )
331 } else {
332 format!(
333 "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
334 shown_content.trim_end(),
335 count,
336 instances_to_show,
337 remainder.trim()
338 )
339 };
340
341 return Some(result);
342 }
343 }
344
345 let lines: Vec<&str> = content.lines().collect();
347 if lines.len() >= 3 {
348 let mut line_counts: std::collections::HashMap<&str, usize> =
349 std::collections::HashMap::new();
350 for line in &lines {
351 *line_counts.entry(*line).or_insert(0) += 1;
352 }
353
354 if let Some((repeated_line, count)) = line_counts
356 .iter()
357 .filter(|(line, _)| !line.trim().is_empty())
358 .max_by_key(|(_, count)| *count)
359 {
360 let repetition_ratio = *count as f32 / lines.len() as f32;
361 if *count >= 3 && repetition_ratio >= 0.5 {
362 let mut result = String::new();
364 let mut consecutive_count = 0;
365 let mut last_was_repeated = false;
366
367 for line in &lines {
368 if *line == *repeated_line {
369 consecutive_count += 1;
370 if !last_was_repeated {
371 if !result.is_empty() {
372 result.push('\n');
373 }
374 result.push_str(line);
375 }
376 last_was_repeated = true;
377 } else {
378 if last_was_repeated && consecutive_count > 1 {
379 result.push_str(&format!(
380 "\n/* ... above line repeated {} times ... */",
381 consecutive_count
382 ));
383 }
384 consecutive_count = 0;
385 last_was_repeated = false;
386 if !result.is_empty() {
387 result.push('\n');
388 }
389 result.push_str(line);
390 }
391 }
392
393 if last_was_repeated && consecutive_count > 1 {
394 result.push_str(&format!(
395 "\n/* ... above line repeated {} times ... */",
396 consecutive_count
397 ));
398 }
399
400 if result.len() < content.len() / 2 {
402 return Some(result);
403 }
404 }
405 }
406 }
407
408 None
409 }
410
411 fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
413 let mut chunks = Vec::new();
414 let mut current_start = 0;
415
416 for (i, _) in content.match_indices("\n\n") {
418 if i > current_start && i - current_start >= self.config.min_chunk_size {
419 let chunk_content = &content[current_start..i];
420 if chunk_content.len() <= self.config.max_chunk_size {
421 chunks.push(CodeChunk {
422 content: chunk_content.to_owned(),
423 start: current_start,
424 end: i,
425 embedding: None,
426 cluster_id: None,
427 });
428 }
429 current_start = i + 2;
430 }
431 }
432
433 if current_start < content.len() {
435 let remaining = &content[current_start..];
436 if remaining.len() >= self.config.min_chunk_size {
437 chunks.push(CodeChunk {
438 content: remaining.to_owned(),
439 start: current_start,
440 end: content.len(),
441 embedding: None,
442 cluster_id: None,
443 });
444 }
445 }
446
447 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
449 current_start = 0;
450 for (i, _) in content.match_indices('\n') {
451 if i > current_start && i - current_start >= self.config.min_chunk_size {
452 let chunk_content = &content[current_start..i];
453 if chunk_content.len() <= self.config.max_chunk_size {
454 chunks.push(CodeChunk {
455 content: chunk_content.to_owned(),
456 start: current_start,
457 end: i,
458 embedding: None,
459 cluster_id: None,
460 });
461 }
462 current_start = i + 1;
463 }
464 }
465 if current_start < content.len() {
467 let remaining = &content[current_start..];
468 if remaining.len() >= self.config.min_chunk_size {
469 chunks.push(CodeChunk {
470 content: remaining.to_owned(),
471 start: current_start,
472 end: content.len(),
473 embedding: None,
474 cluster_id: None,
475 });
476 }
477 }
478 }
479
480 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
482 current_start = 0;
483 for (i, _) in content.match_indices(". ") {
484 if i > current_start && i - current_start >= self.config.min_chunk_size {
485 let chunk_content = &content[current_start..=i]; if chunk_content.len() <= self.config.max_chunk_size {
487 chunks.push(CodeChunk {
488 content: chunk_content.to_owned(),
489 start: current_start,
490 end: i + 1,
491 embedding: None,
492 cluster_id: None,
493 });
494 }
495 current_start = i + 2;
496 }
497 }
498 if current_start < content.len() {
500 let remaining = &content[current_start..];
501 if remaining.len() >= self.config.min_chunk_size {
502 chunks.push(CodeChunk {
503 content: remaining.to_owned(),
504 start: current_start,
505 end: content.len(),
506 embedding: None,
507 cluster_id: None,
508 });
509 }
510 }
511 }
512
513 if chunks.is_empty() && content.len() > self.config.max_chunk_size {
515 let mut pos = 0;
516 while pos < content.len() {
517 let end = (pos + self.config.max_chunk_size).min(content.len());
518 chunks.push(CodeChunk {
519 content: content[pos..end].to_owned(),
520 start: pos,
521 end,
522 embedding: None,
523 cluster_id: None,
524 });
525 pos = end;
526 }
527 }
528
529 chunks
530 }
531
532 fn compress_heuristic(&self, content: &str) -> Result<String> {
534 let chunks = self.split_into_chunks(content);
535
536 if chunks.is_empty() {
537 return Ok(content.to_owned());
538 }
539
540 let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
542 let step = chunks.len() / target_chunks.max(1);
543
544 let mut result = String::new();
545 let mut kept = 0;
546
547 for (i, chunk) in chunks.iter().enumerate() {
548 if i % step.max(1) == 0 && kept < target_chunks {
549 if !result.is_empty() {
550 result.push_str("\n\n");
551 }
552 result.push_str(&chunk.content);
553 kept += 1;
554 }
555 }
556
557 if kept < chunks.len() {
559 result.push_str(&format!(
560 "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
561 chunks.len() - kept,
562 (kept as f32 / chunks.len() as f32) * 100.0
563 ));
564 }
565
566 Ok(result)
567 }
568
569 #[cfg(feature = "embeddings")]
571 fn compress_with_embeddings(&self, content: &str) -> Result<String> {
572 let mut chunks = self.split_into_chunks(content);
573
574 if chunks.is_empty() {
575 return Ok(content.to_owned());
576 }
577
578 for chunk in &mut chunks {
580 chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
581 }
582
583 let clusters = self.cluster_chunks(&chunks)?;
585
586 let mut result = String::new();
588 for cluster in clusters.values() {
589 if let Some(representative) = self.select_representative(cluster) {
590 if !result.is_empty() {
591 result.push_str("\n\n");
592 }
593 result.push_str(&representative.content);
594 }
595 }
596
597 Ok(result)
598 }
599
600 #[cfg(feature = "embeddings")]
602 fn cluster_chunks<'a>(
603 &self,
604 chunks: &'a [CodeChunk],
605 ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
606 let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
607 let mut next_cluster = 0;
608
609 for chunk in chunks {
610 let embedding = chunk
611 .embedding
612 .as_ref()
613 .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
614
615 let mut target_cluster = None;
617 for (&cluster_id, cluster_chunks) in &clusters {
618 if let Some(first) = cluster_chunks.first() {
619 if let Some(ref first_emb) = first.embedding {
620 let similarity = cosine_similarity(embedding, first_emb);
621 if similarity >= self.config.similarity_threshold {
622 target_cluster = Some(cluster_id);
623 break;
624 }
625 }
626 }
627 }
628
629 if let Some(cluster_id) = target_cluster {
630 if let Some(cluster) = clusters.get_mut(&cluster_id) {
631 cluster.push(chunk);
632 }
633 } else {
634 clusters.insert(next_cluster, vec![chunk]);
635 next_cluster += 1;
636 }
637 }
638
639 Ok(clusters)
640 }
641
642 #[cfg(feature = "embeddings")]
644 fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
645 chunks.iter().max_by_key(|c| c.content.len()).copied()
647 }
648}
649
650impl Default for SemanticCompressor {
651 fn default() -> Self {
652 Self::new()
653 }
654}
655
/// Alternative name for [`SemanticAnalyzer`]; both names refer to the same type.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alternative name for [`SemanticCompressor`]; both names refer to the same type.
pub type HeuristicCompressor = SemanticCompressor;

/// Alternative name for [`SemanticConfig`]; both names refer to the same type.
pub type HeuristicCompressionConfig = SemanticConfig;
679
/// Returns the cosine of the angle between vectors `a` and `b`.
///
/// The result is `0.0` when the slices are empty, differ in length, or when
/// either vector has zero magnitude.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Euclidean length of a vector.
    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();

    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();

    match (magnitude(a), magnitude(b)) {
        (len_a, len_b) if len_a != 0.0 && len_b != 0.0 => dot / (len_a * len_b),
        // A zero-length vector has no direction to compare.
        _ => 0.0,
    }
}
710
/// Unit tests for the analyzer, the chunk splitter, and the compression strategies.
#[cfg(test)]
mod tests {
    use super::*;

    /// Compresses `input` with a compressor built from the default configuration.
    fn compress_with_defaults(input: &str) -> String {
        SemanticCompressor::new().compress(input).unwrap()
    }

    /// Reports whether the bytes of `text` form valid UTF-8.
    fn is_valid_utf8(text: &str) -> bool {
        std::str::from_utf8(text.as_bytes()).is_ok()
    }

    /// A freshly created analyzer has no model path.
    #[test]
    fn test_analyzer_creation() {
        let fresh = SemanticAnalyzer::new();
        #[cfg(feature = "embeddings")]
        assert_eq!(fresh.model_path(), None);
        #[cfg(not(feature = "embeddings"))]
        drop(fresh);
    }

    /// `with_model` records the path it was given.
    #[test]
    fn test_analyzer_with_model() {
        let configured = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(configured.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(configured);
    }

    /// The compressor exposes its analyzer.
    #[test]
    fn test_compressor_analyzer_access() {
        let subject = SemanticCompressor::new();
        let _borrowed = subject.analyzer();
    }

    /// Default configuration values for the two ratio fields.
    #[test]
    fn test_semantic_config_default() {
        let defaults = SemanticConfig::default();
        assert_eq!(defaults.similarity_threshold, 0.7);
        assert_eq!(defaults.budget_ratio, 0.5);
    }

    /// Blank-line separated paragraphs produce at least two chunks.
    #[test]
    fn test_split_into_chunks() {
        let config = SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        };
        let subject = SemanticCompressor::with_config(config);

        let pieces =
            subject.split_into_chunks("First chunk here\n\nSecond chunk here\n\nThird chunk");
        assert!(pieces.len() >= 2);
    }

    /// Heuristic compression of non-empty input yields non-empty output.
    #[test]
    fn test_heuristic_compression() {
        let config = SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        };
        let subject = SemanticCompressor::with_config(config);

        let input = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let output = subject.compress_heuristic(input).unwrap();
        assert!(!output.is_empty() || input.is_empty());
    }

    /// Empty input stays empty.
    #[test]
    fn test_empty_content() {
        assert_eq!(compress_with_defaults(""), "");
    }

    /// Identical vectors have similarity one.
    #[test]
    fn test_cosine_similarity_identical() {
        let left = vec![1.0, 0.0, 0.0];
        let right = vec![1.0, 0.0, 0.0];
        let score = cosine_similarity(&left, &right);
        assert!((score - 1.0).abs() < 0.001);
    }

    /// Orthogonal vectors have similarity zero.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let x_axis = vec![1.0, 0.0, 0.0];
        let y_axis = vec![0.0, 1.0, 0.0];
        let score = cosine_similarity(&x_axis, &y_axis);
        assert!(score.abs() < 0.001);
    }

    /// Empty vectors are defined to have similarity zero.
    #[test]
    fn test_cosine_similarity_empty() {
        let none: Vec<f32> = Vec::new();
        let also_none: Vec<f32> = Vec::new();
        assert_eq!(cosine_similarity(&none, &also_none), 0.0);
    }

    /// A word repeated many times shrinks to under half and is marked as compressed.
    #[test]
    fn test_repetitive_pattern_compression() {
        let input = "sentence ".repeat(500);
        let output = compress_with_defaults(&input);

        assert!(
            output.len() < input.len() / 2,
            "Compressed size {} should be less than half of original {}",
            output.len(),
            input.len()
        );

        assert!(output.contains("sentence"));
        assert!(
            output.contains("repeated") || output.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    /// A line repeated many times shrinks to under half.
    #[test]
    fn test_repetitive_line_compression() {
        let input = "same line\n".repeat(100);
        let output = compress_with_defaults(&input);

        assert!(
            output.len() < input.len() / 2,
            "Compressed size {} should be less than half of original {}",
            output.len(),
            input.len()
        );
    }

    /// Short, unique text passes through untouched.
    #[test]
    fn test_non_repetitive_content_unchanged() {
        let input = "This is some unique content that does not repeat.";
        assert_eq!(compress_with_defaults(input), input);
    }

    /// A cycle of five slightly different items still produces output.
    #[test]
    fn test_repetitive_with_variation() {
        let subject = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        let input: String = (0..50).map(|i| format!("item {} ", i % 5)).collect();

        let output = subject.compress(&input).unwrap();
        assert!(!output.is_empty());
    }

    /// Repeated Chinese text compresses without breaking UTF-8.
    #[test]
    fn test_repetitive_unicode_chinese() {
        let input = "中文测试 ".repeat(100);
        let output = compress_with_defaults(&input);

        assert!(is_valid_utf8(&output));
        assert!(!output.is_empty() || input.is_empty());
    }

    /// Repeated emoji compress without breaking UTF-8.
    #[test]
    fn test_repetitive_unicode_emoji() {
        let input = "🎉🎊🎁 ".repeat(80);
        let output = compress_with_defaults(&input);

        assert!(is_valid_utf8(&output));
        assert!(!output.is_empty() || input.is_empty());
    }

    /// Repeated text mixing one-, three- and four-byte characters stays valid UTF-8.
    #[test]
    fn test_repetitive_unicode_mixed() {
        let input = "a中🎉 ".repeat(60);
        let output = compress_with_defaults(&input);

        assert!(is_valid_utf8(&output));
        assert!(!output.is_empty() || input.is_empty());
    }

    /// Repeated Cyrillic text stays valid UTF-8.
    #[test]
    fn test_repetitive_unicode_cyrillic() {
        let input = "Привет ".repeat(50);
        let output = compress_with_defaults(&input);
        assert!(is_valid_utf8(&output));
    }

    /// Multi-byte text without separators does not trip over character boundaries.
    #[test]
    fn test_non_repetitive_unicode_boundary() {
        let input = "世界和平".repeat(60);
        let output = compress_with_defaults(&input);
        assert!(is_valid_utf8(&output));
    }

    /// Repeated multi-byte lines stay valid UTF-8.
    #[test]
    fn test_repetitive_unicode_line_based() {
        let input = "中文行\n".repeat(100);
        let output = compress_with_defaults(&input);
        assert!(is_valid_utf8(&output));
    }
}