// `HashMap` is only needed by the clustering code compiled behind the
// `embeddings` feature; the heuristic path uses fully-qualified paths.
#[cfg(feature = "embeddings")]
use std::collections::HashMap;

/// Convenience alias used by every fallible API in this module.
pub type Result<T> = std::result::Result<T, SemanticError>;
37
/// Errors produced by semantic analysis and compression.
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    /// The embedding model could not be loaded.
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    /// Generating an embedding vector for a piece of text failed.
    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    /// Grouping chunks into similarity clusters failed.
    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    /// An embeddings-only API was used in a build without the feature.
    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}
53
/// Lightweight text analyzer.
///
/// With the `embeddings` feature enabled it produces character-frequency
/// pseudo-embeddings; without it, `embed`/`similarity` degrade to cheap
/// constant results.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    // Optional path to an embedding model. NOTE(review): the visible code
    // stores but never loads this path — confirm intended use.
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    // Underscore-named twin so both feature configurations have the same
    // field layout without an unused-field warning.
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}
72
73impl SemanticAnalyzer {
74 pub fn new() -> Self {
76 Self {
77 #[cfg(feature = "embeddings")]
78 model_path: None,
79 #[cfg(not(feature = "embeddings"))]
80 _model_path: None,
81 }
82 }
83
84 pub fn with_model(model_path: &str) -> Self {
89 Self {
90 #[cfg(feature = "embeddings")]
91 model_path: Some(model_path.to_owned()),
92 #[cfg(not(feature = "embeddings"))]
93 _model_path: Some(model_path.to_owned()),
94 }
95 }
96
97 #[cfg(feature = "embeddings")]
99 pub fn model_path(&self) -> Option<&str> {
100 self.model_path.as_deref()
101 }
102
103 #[cfg(feature = "embeddings")]
121 pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122 let mut embedding = vec![0.0f32; 384];
124 for (i, c) in content.chars().enumerate() {
125 let idx = (c as usize) % 384;
126 embedding[idx] += 1.0 / ((i + 1) as f32);
128 }
129 let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131 if norm > 0.0 {
132 for x in &mut embedding {
133 *x /= norm;
134 }
135 }
136 Ok(embedding)
137 }
138
139 #[cfg(not(feature = "embeddings"))]
141 pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142 Ok(vec![0.0; 384])
143 }
144
145 #[cfg(feature = "embeddings")]
147 pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148 let emb_a = self.embed(a)?;
149 let emb_b = self.embed(b)?;
150 Ok(cosine_similarity(&emb_a, &emb_b))
151 }
152
153 #[cfg(not(feature = "embeddings"))]
155 pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156 Ok(0.0)
157 }
158}
159
impl Default for SemanticAnalyzer {
    /// Equivalent to [`SemanticAnalyzer::new`]: no model path configured.
    fn default() -> Self {
        Self::new()
    }
}
165
/// Tuning knobs for [`SemanticCompressor`].
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Minimum cosine similarity for two chunks to land in the same cluster.
    pub similarity_threshold: f32,
    /// Chunks shorter than this many bytes are discarded during splitting.
    pub min_chunk_size: usize,
    /// Chunks longer than this many bytes are not kept as a single chunk.
    pub max_chunk_size: usize,
    /// Fraction of content to retain when sampling chunks (0.0..=1.0).
    pub budget_ratio: f32,
}
182
impl Default for SemanticConfig {
    /// Moderate defaults: 0.7 similarity, 100..=2000-byte chunks, keep ~50%.
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}
193
/// A contiguous slice of the input plus optional clustering metadata.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Owned copy of the chunk's text.
    pub content: String,
    /// Byte offset of the chunk's start in the original input.
    pub start: usize,
    /// Exclusive byte offset of the chunk's end in the original input.
    pub end: usize,
    /// Embedding vector; populated only on the embeddings path.
    pub embedding: Option<Vec<f32>>,
    /// Cluster assignment; populated only on the embeddings path.
    pub cluster_id: Option<usize>,
}
208
/// Compresses text by collapsing repetition or sampling representative chunks.
pub struct SemanticCompressor {
    /// Tuning parameters for splitting, clustering, and budget.
    config: SemanticConfig,
    /// Embedding provider used on the `embeddings` path.
    analyzer: SemanticAnalyzer,
}
218
219impl SemanticCompressor {
220 pub fn new() -> Self {
222 Self::with_config(SemanticConfig::default())
223 }
224
225 pub fn with_config(config: SemanticConfig) -> Self {
227 Self { config, analyzer: SemanticAnalyzer::new() }
228 }
229
230 pub fn analyzer(&self) -> &SemanticAnalyzer {
235 &self.analyzer
236 }
237
238 pub fn compress(&self, content: &str) -> Result<String> {
245 if let Some(compressed) = self.compress_repetitive(content) {
247 return Ok(compressed);
248 }
249
250 #[cfg(feature = "embeddings")]
251 {
252 return self.compress_with_embeddings(content);
253 }
254
255 #[cfg(not(feature = "embeddings"))]
256 {
257 self.compress_heuristic(content)
258 }
259 }
260
261 fn compress_repetitive(&self, content: &str) -> Option<String> {
268 if content.len() < 200 {
270 return None;
271 }
272
273 for pattern_len in 1..=100.min(content.len() / 3) {
277 if !content.is_char_boundary(pattern_len) {
279 continue;
280 }
281
282 let pattern = &content[..pattern_len];
283
284 if pattern.chars().all(|c| c.is_whitespace()) {
286 continue;
287 }
288
289 let mut count = 0;
291 let mut pos = 0;
292 while pos + pattern_len <= content.len() {
293 if !content.is_char_boundary(pos) || !content.is_char_boundary(pos + pattern_len) {
295 break;
296 }
297 if &content[pos..pos + pattern_len] == pattern {
298 count += 1;
299 pos += pattern_len;
300 } else {
301 break;
302 }
303 }
304
305 let coverage = (count * pattern_len) as f32 / content.len() as f32;
307 if count >= 3 && coverage >= 0.8 {
308 let instances_to_show = (count as f32 * self.config.budget_ratio)
310 .ceil()
311 .clamp(1.0, 5.0) as usize;
312
313 let shown_content = pattern.repeat(instances_to_show);
314 let remainder_start = count * pattern_len;
316 let remainder = if remainder_start <= content.len() && content.is_char_boundary(remainder_start) {
317 &content[remainder_start..]
318 } else {
319 ""
320 };
321
322 let result = if remainder.is_empty() {
323 format!(
324 "{}\n/* ... pattern repeated {} times (showing {}) ... */",
325 shown_content.trim_end(),
326 count,
327 instances_to_show
328 )
329 } else {
330 format!(
331 "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
332 shown_content.trim_end(),
333 count,
334 instances_to_show,
335 remainder.trim()
336 )
337 };
338
339 return Some(result);
340 }
341 }
342
343 let lines: Vec<&str> = content.lines().collect();
345 if lines.len() >= 3 {
346 let mut line_counts: std::collections::HashMap<&str, usize> =
347 std::collections::HashMap::new();
348 for line in &lines {
349 *line_counts.entry(*line).or_insert(0) += 1;
350 }
351
352 if let Some((repeated_line, count)) = line_counts
354 .iter()
355 .filter(|(line, _)| !line.trim().is_empty())
356 .max_by_key(|(_, count)| *count)
357 {
358 let repetition_ratio = *count as f32 / lines.len() as f32;
359 if *count >= 3 && repetition_ratio >= 0.5 {
360 let mut result = String::new();
362 let mut consecutive_count = 0;
363 let mut last_was_repeated = false;
364
365 for line in &lines {
366 if *line == *repeated_line {
367 consecutive_count += 1;
368 if !last_was_repeated {
369 if !result.is_empty() {
370 result.push('\n');
371 }
372 result.push_str(line);
373 }
374 last_was_repeated = true;
375 } else {
376 if last_was_repeated && consecutive_count > 1 {
377 result.push_str(&format!(
378 "\n/* ... above line repeated {} times ... */",
379 consecutive_count
380 ));
381 }
382 consecutive_count = 0;
383 last_was_repeated = false;
384 if !result.is_empty() {
385 result.push('\n');
386 }
387 result.push_str(line);
388 }
389 }
390
391 if last_was_repeated && consecutive_count > 1 {
392 result.push_str(&format!(
393 "\n/* ... above line repeated {} times ... */",
394 consecutive_count
395 ));
396 }
397
398 if result.len() < content.len() / 2 {
400 return Some(result);
401 }
402 }
403 }
404 }
405
406 None
407 }
408
409 fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
411 let mut chunks = Vec::new();
412 let mut current_start = 0;
413
414 for (i, _) in content.match_indices("\n\n") {
416 if i > current_start && i - current_start >= self.config.min_chunk_size {
417 let chunk_content = &content[current_start..i];
418 if chunk_content.len() <= self.config.max_chunk_size {
419 chunks.push(CodeChunk {
420 content: chunk_content.to_owned(),
421 start: current_start,
422 end: i,
423 embedding: None,
424 cluster_id: None,
425 });
426 }
427 current_start = i + 2;
428 }
429 }
430
431 if current_start < content.len() {
433 let remaining = &content[current_start..];
434 if remaining.len() >= self.config.min_chunk_size {
435 chunks.push(CodeChunk {
436 content: remaining.to_owned(),
437 start: current_start,
438 end: content.len(),
439 embedding: None,
440 cluster_id: None,
441 });
442 }
443 }
444
445 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
447 current_start = 0;
448 for (i, _) in content.match_indices('\n') {
449 if i > current_start && i - current_start >= self.config.min_chunk_size {
450 let chunk_content = &content[current_start..i];
451 if chunk_content.len() <= self.config.max_chunk_size {
452 chunks.push(CodeChunk {
453 content: chunk_content.to_owned(),
454 start: current_start,
455 end: i,
456 embedding: None,
457 cluster_id: None,
458 });
459 }
460 current_start = i + 1;
461 }
462 }
463 if current_start < content.len() {
465 let remaining = &content[current_start..];
466 if remaining.len() >= self.config.min_chunk_size {
467 chunks.push(CodeChunk {
468 content: remaining.to_owned(),
469 start: current_start,
470 end: content.len(),
471 embedding: None,
472 cluster_id: None,
473 });
474 }
475 }
476 }
477
478 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
480 current_start = 0;
481 for (i, _) in content.match_indices(". ") {
482 if i > current_start && i - current_start >= self.config.min_chunk_size {
483 let chunk_content = &content[current_start..=i]; if chunk_content.len() <= self.config.max_chunk_size {
485 chunks.push(CodeChunk {
486 content: chunk_content.to_owned(),
487 start: current_start,
488 end: i + 1,
489 embedding: None,
490 cluster_id: None,
491 });
492 }
493 current_start = i + 2;
494 }
495 }
496 if current_start < content.len() {
498 let remaining = &content[current_start..];
499 if remaining.len() >= self.config.min_chunk_size {
500 chunks.push(CodeChunk {
501 content: remaining.to_owned(),
502 start: current_start,
503 end: content.len(),
504 embedding: None,
505 cluster_id: None,
506 });
507 }
508 }
509 }
510
511 if chunks.is_empty() && content.len() > self.config.max_chunk_size {
513 let mut pos = 0;
514 while pos < content.len() {
515 let end = (pos + self.config.max_chunk_size).min(content.len());
516 chunks.push(CodeChunk {
517 content: content[pos..end].to_owned(),
518 start: pos,
519 end,
520 embedding: None,
521 cluster_id: None,
522 });
523 pos = end;
524 }
525 }
526
527 chunks
528 }
529
530 fn compress_heuristic(&self, content: &str) -> Result<String> {
532 let chunks = self.split_into_chunks(content);
533
534 if chunks.is_empty() {
535 return Ok(content.to_owned());
536 }
537
538 let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
540 let step = chunks.len() / target_chunks.max(1);
541
542 let mut result = String::new();
543 let mut kept = 0;
544
545 for (i, chunk) in chunks.iter().enumerate() {
546 if i % step.max(1) == 0 && kept < target_chunks {
547 if !result.is_empty() {
548 result.push_str("\n\n");
549 }
550 result.push_str(&chunk.content);
551 kept += 1;
552 }
553 }
554
555 if kept < chunks.len() {
557 result.push_str(&format!(
558 "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
559 chunks.len() - kept,
560 (kept as f32 / chunks.len() as f32) * 100.0
561 ));
562 }
563
564 Ok(result)
565 }
566
567 #[cfg(feature = "embeddings")]
569 fn compress_with_embeddings(&self, content: &str) -> Result<String> {
570 let mut chunks = self.split_into_chunks(content);
571
572 if chunks.is_empty() {
573 return Ok(content.to_owned());
574 }
575
576 for chunk in &mut chunks {
578 chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
579 }
580
581 let clusters = self.cluster_chunks(&chunks)?;
583
584 let mut result = String::new();
586 for cluster in clusters.values() {
587 if let Some(representative) = self.select_representative(cluster) {
588 if !result.is_empty() {
589 result.push_str("\n\n");
590 }
591 result.push_str(&representative.content);
592 }
593 }
594
595 Ok(result)
596 }
597
598 #[cfg(feature = "embeddings")]
600 fn cluster_chunks<'a>(
601 &self,
602 chunks: &'a [CodeChunk],
603 ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
604 let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
605 let mut next_cluster = 0;
606
607 for chunk in chunks {
608 let embedding = chunk
609 .embedding
610 .as_ref()
611 .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
612
613 let mut target_cluster = None;
615 for (&cluster_id, cluster_chunks) in &clusters {
616 if let Some(first) = cluster_chunks.first() {
617 if let Some(ref first_emb) = first.embedding {
618 let similarity = cosine_similarity(embedding, first_emb);
619 if similarity >= self.config.similarity_threshold {
620 target_cluster = Some(cluster_id);
621 break;
622 }
623 }
624 }
625 }
626
627 if let Some(cluster_id) = target_cluster {
628 if let Some(cluster) = clusters.get_mut(&cluster_id) {
629 cluster.push(chunk);
630 }
631 } else {
632 clusters.insert(next_cluster, vec![chunk]);
633 next_cluster += 1;
634 }
635 }
636
637 Ok(clusters)
638 }
639
640 #[cfg(feature = "embeddings")]
642 fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
643 chunks.iter().max_by_key(|c| c.content.len()).copied()
645 }
646}
647
impl Default for SemanticCompressor {
    /// Equivalent to [`SemanticCompressor::new`]: default configuration.
    fn default() -> Self {
        Self::new()
    }
}
653
/// Alias exposing [`SemanticAnalyzer`] under a name describing its
/// character-frequency embedding strategy.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alias exposing [`SemanticCompressor`] under its heuristic-path name.
pub type HeuristicCompressor = SemanticCompressor;

/// Alias exposing [`SemanticConfig`] under its heuristic-path name.
pub type HeuristicCompressionConfig = SemanticConfig;
677
/// Cosine similarity of two equal-length vectors.
///
/// Returns 0.0 for empty or mismatched-length inputs, and for any input
/// with zero magnitude (where the cosine is undefined).
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Accumulate dot product and both squared norms in a single pass.
    let mut dot = 0.0f32;
    let mut sq_a = 0.0f32;
    let mut sq_b = 0.0f32;
    for (&x, &y) in a.iter().zip(b.iter()) {
        dot += x * y;
        sq_a += x * x;
        sq_b += y * y;
    }

    let norm_a = sq_a.sqrt();
    let norm_b = sq_b.sqrt();
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}
708
#[cfg(test)]
mod tests {
    //! Tests cover construction, configuration defaults, chunk splitting,
    //! repetition compression, and UTF-8 safety of the compressor.
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        // The getter only exists with the feature enabled; otherwise just
        // verify construction succeeds.
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        // 500 identical words: well above the 200-byte minimum and 80% coverage.
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::new();
        // Short, unique content: below every compression threshold.
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content);
    }

    #[test]
    fn test_repetitive_with_variation() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        // Cycles through 5 distinct items, so no single pattern dominates.
        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5));
        }

        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }

    // The unicode tests below verify that byte-oriented pattern scanning
    // never slices inside a multi-byte UTF-8 sequence.

    #[test]
    fn test_repetitive_unicode_chinese() {
        let compressor = SemanticCompressor::new();
        let content = "中文测试 ".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(std::str::from_utf8(result.as_bytes()).is_ok());

        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_emoji() {
        let compressor = SemanticCompressor::new();
        let content = "🎉🎊🎁 ".repeat(80);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_mixed() {
        let compressor = SemanticCompressor::new();
        // 1-, 3-, and 4-byte characters in one pattern.
        let content = "a中🎉 ".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_repetitive_unicode_cyrillic() {
        let compressor = SemanticCompressor::new();
        let content = "Привет ".repeat(50);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_non_repetitive_unicode_boundary() {
        let compressor = SemanticCompressor::new();
        // No spaces at all: pattern scanning must still respect boundaries.
        let content = "世界和平".repeat(60);
        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }

    #[test]
    fn test_repetitive_unicode_line_based() {
        let compressor = SemanticCompressor::new();
        let content = "中文行\n".repeat(100);

        let result = compressor.compress(&content).unwrap();
        assert!(std::str::from_utf8(result.as_bytes()).is_ok());
    }
}