#[cfg(feature = "embeddings")]
use std::collections::HashMap;

pub type Result<T> = std::result::Result<T, SemanticError>;

#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}

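/// Produces fixed-size text embeddings for similarity comparisons.
///
/// With the `embeddings` feature enabled, `embed` currently builds a
/// 384-dimensional, L2-normalized character-frequency vector; the stored
/// `model_path` is kept but not consulted during embedding. Without the
/// feature, `embed` returns a zero vector and `similarity` returns 0.0.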
#[derive(Debug)]
pub struct SemanticAnalyzer {
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}

impl SemanticAnalyzer {
    pub fn new() -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: None,
            #[cfg(not(feature = "embeddings"))]
            _model_path: None,
        }
    }

    pub fn with_model(model_path: &str) -> Self {
        Self {
            #[cfg(feature = "embeddings")]
            model_path: Some(model_path.to_owned()),
            #[cfg(not(feature = "embeddings"))]
            _model_path: Some(model_path.to_owned()),
        }
    }

    #[cfg(feature = "embeddings")]
    pub fn model_path(&self) -> Option<&str> {
        self.model_path.as_deref()
    }

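    /// Embeds `content` into a 384-dimensional, L2-normalized vector.
    ///
    /// This is a character-frequency sketch rather than a learned model:
    /// each character adds `1.0 / (position + 1)` to bucket
    /// `(c as usize) % 384`, and the result is scaled to unit length
    /// (unless it is all zero).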
    #[cfg(feature = "embeddings")]
    pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
        let mut embedding = vec![0.0f32; 384];
        for (i, c) in content.chars().enumerate() {
            let idx = (c as usize) % 384;
            embedding[idx] += 1.0 / ((i + 1) as f32);
        }
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            for x in &mut embedding {
                *x /= norm;
            }
        }
        Ok(embedding)
    }

    #[cfg(not(feature = "embeddings"))]
    pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
        Ok(vec![0.0; 384])
    }

    #[cfg(feature = "embeddings")]
    pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
        let emb_a = self.embed(a)?;
        let emb_b = self.embed(b)?;
        Ok(cosine_similarity(&emb_a, &emb_b))
    }

    #[cfg(not(feature = "embeddings"))]
    pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
        Ok(0.0)
    }
}

impl Default for SemanticAnalyzer {
    fn default() -> Self {
        Self::new()
    }
}

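/// Tuning parameters for [`SemanticCompressor`].
///
/// * `similarity_threshold` — cosine similarity above which two chunks are
///   grouped into the same cluster (used with the `embeddings` feature).
/// * `min_chunk_size` / `max_chunk_size` — bounds, in bytes, on the chunks
///   produced when splitting content.
/// * `budget_ratio` — approximate fraction of chunks (or of repeated
///   instances) to keep when compressing.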
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    pub similarity_threshold: f32,
    pub min_chunk_size: usize,
    pub max_chunk_size: usize,
    pub budget_ratio: f32,
}

impl Default for SemanticConfig {
    fn default() -> Self {
        Self {
            similarity_threshold: 0.7,
            min_chunk_size: 100,
            max_chunk_size: 2000,
            budget_ratio: 0.5,
        }
    }
}

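/// A contiguous slice of the input, tracked by byte offsets, with an optional
/// embedding and cluster assignment filled in during compression.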
#[derive(Debug, Clone)]
pub struct CodeChunk {
    pub content: String,
    pub start: usize,
    pub end: usize,
    pub embedding: Option<Vec<f32>>,
    pub cluster_id: Option<usize>,
}

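/// Compresses text by collapsing repetition and dropping redundant chunks.
///
/// The pipeline (see `compress`): first try to collapse highly repetitive
/// content; otherwise split the input into chunks and either cluster them by
/// embedding similarity (with the `embeddings` feature) or keep an evenly
/// spaced subset (heuristic fallback).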
pub struct SemanticCompressor {
    config: SemanticConfig,
    analyzer: SemanticAnalyzer,
}

impl SemanticCompressor {
    pub fn new() -> Self {
        Self::with_config(SemanticConfig::default())
    }

    pub fn with_config(config: SemanticConfig) -> Self {
        Self { config, analyzer: SemanticAnalyzer::new() }
    }

    pub fn analyzer(&self) -> &SemanticAnalyzer {
        &self.analyzer
    }

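    /// Compresses `content`, trying strategies in order:
    ///
    /// 1. `compress_repetitive` — collapse content dominated by a repeated
    ///    pattern or line.
    /// 2. `compress_with_embeddings` (with the `embeddings` feature) —
    ///    cluster chunks by cosine similarity and keep one representative
    ///    per cluster.
    /// 3. `compress_heuristic` (without the feature) — keep an evenly
    ///    spaced subset of chunks within the budget.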
    pub fn compress(&self, content: &str) -> Result<String> {
        if let Some(compressed) = self.compress_repetitive(content) {
            return Ok(compressed);
        }

        #[cfg(feature = "embeddings")]
        {
            return self.compress_with_embeddings(content);
        }

        #[cfg(not(feature = "embeddings"))]
        {
            self.compress_heuristic(content)
        }
    }

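    /// Detects highly repetitive content and collapses it, returning `None`
    /// for short inputs (under 200 bytes) or content that is not repetitive
    /// enough.
    ///
    /// Two cases are handled: a leading pattern (up to 100 bytes) whose
    /// repetitions cover at least 80% of the content, and a single line that
    /// makes up at least half of all lines. Collapsed runs are replaced with
    /// a `/* ... repeated N times ... */` marker, and the line-based result
    /// is only used if it is less than half the original size.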
    fn compress_repetitive(&self, content: &str) -> Option<String> {
        if content.len() < 200 {
            return None;
        }

        for pattern_len in 1..=100.min(content.len() / 3) {
            let pattern = &content[..pattern_len];

            if pattern.chars().all(|c| c.is_whitespace()) {
                continue;
            }

            let mut count = 0;
            let mut pos = 0;
            while pos + pattern_len <= content.len() {
                if &content[pos..pos + pattern_len] == pattern {
                    count += 1;
                    pos += pattern_len;
                } else {
                    break;
                }
            }

            let coverage = (count * pattern_len) as f32 / content.len() as f32;
            if count >= 3 && coverage >= 0.8 {
                let instances_to_show = (count as f32 * self.config.budget_ratio)
                    .ceil()
                    .max(1.0)
                    .min(5.0) as usize;

                let shown_content = pattern.repeat(instances_to_show);
                let remainder = &content[count * pattern_len..];

                let result = if remainder.is_empty() {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */",
                        shown_content.trim_end(),
                        count,
                        instances_to_show
                    )
                } else {
                    format!(
                        "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
                        shown_content.trim_end(),
                        count,
                        instances_to_show,
                        remainder.trim()
                    )
                };

                return Some(result);
            }
        }

        let lines: Vec<&str> = content.lines().collect();
        if lines.len() >= 3 {
            let mut line_counts: std::collections::HashMap<&str, usize> =
                std::collections::HashMap::new();
            for line in &lines {
                *line_counts.entry(*line).or_insert(0) += 1;
            }

            if let Some((repeated_line, count)) = line_counts
                .iter()
                .filter(|(line, _)| !line.trim().is_empty())
                .max_by_key(|(_, count)| *count)
            {
                let repetition_ratio = *count as f32 / lines.len() as f32;
                if *count >= 3 && repetition_ratio >= 0.5 {
                    let mut result = String::new();
                    let mut consecutive_count = 0;
                    let mut last_was_repeated = false;

                    for line in &lines {
                        if *line == *repeated_line {
                            consecutive_count += 1;
                            if !last_was_repeated {
                                if !result.is_empty() {
                                    result.push('\n');
                                }
                                result.push_str(line);
                            }
                            last_was_repeated = true;
                        } else {
                            if last_was_repeated && consecutive_count > 1 {
                                result.push_str(&format!(
                                    "\n/* ... above line repeated {} times ... */",
                                    consecutive_count
                                ));
                            }
                            consecutive_count = 0;
                            last_was_repeated = false;
                            if !result.is_empty() {
                                result.push('\n');
                            }
                            result.push_str(line);
                        }
                    }

                    if last_was_repeated && consecutive_count > 1 {
                        result.push_str(&format!(
                            "\n/* ... above line repeated {} times ... */",
                            consecutive_count
                        ));
                    }

                    if result.len() < content.len() / 2 {
                        return Some(result);
                    }
                }
            }
        }

        None
    }

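    /// Splits `content` into chunks of `min_chunk_size..=max_chunk_size`
    /// bytes, trying progressively finer separators: blank lines, then
    /// single newlines, then sentence boundaries (`". "`), and finally
    /// fixed-size slices if the content is still one oversized block.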
    fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
        let mut chunks = Vec::new();
        let mut current_start = 0;

        for (i, _) in content.match_indices("\n\n") {
            if i > current_start && i - current_start >= self.config.min_chunk_size {
                let chunk_content = &content[current_start..i];
                if chunk_content.len() <= self.config.max_chunk_size {
                    chunks.push(CodeChunk {
                        content: chunk_content.to_owned(),
                        start: current_start,
                        end: i,
                        embedding: None,
                        cluster_id: None,
                    });
                }
                current_start = i + 2;
            }
        }

        if current_start < content.len() {
            let remaining = &content[current_start..];
            if remaining.len() >= self.config.min_chunk_size {
                chunks.push(CodeChunk {
                    content: remaining.to_owned(),
                    start: current_start,
                    end: content.len(),
                    embedding: None,
                    cluster_id: None,
                });
            }
        }

        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices('\n') {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 1;
                }
            }
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
            current_start = 0;
            for (i, _) in content.match_indices(". ") {
                if i > current_start && i - current_start >= self.config.min_chunk_size {
                    let chunk_content = &content[current_start..=i];
                    if chunk_content.len() <= self.config.max_chunk_size {
                        chunks.push(CodeChunk {
                            content: chunk_content.to_owned(),
                            start: current_start,
                            end: i + 1,
                            embedding: None,
                            cluster_id: None,
                        });
                    }
                    current_start = i + 2;
                }
            }
            if current_start < content.len() {
                let remaining = &content[current_start..];
                if remaining.len() >= self.config.min_chunk_size {
                    chunks.push(CodeChunk {
                        content: remaining.to_owned(),
                        start: current_start,
                        end: content.len(),
                        embedding: None,
                        cluster_id: None,
                    });
                }
            }
        }

        if chunks.is_empty() && content.len() > self.config.max_chunk_size {
            let mut pos = 0;
            while pos < content.len() {
                let end = (pos + self.config.max_chunk_size).min(content.len());
                chunks.push(CodeChunk {
                    content: content[pos..end].to_owned(),
                    start: pos,
                    end,
                    embedding: None,
                    cluster_id: None,
                });
                pos = end;
            }
        }

        chunks
    }

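    /// Fallback compression without embeddings: keeps every `step`-th chunk
    /// until roughly `budget_ratio` of the chunks have been emitted, then
    /// appends a marker noting how many chunks were dropped.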
    fn compress_heuristic(&self, content: &str) -> Result<String> {
        let chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
        let step = chunks.len() / target_chunks.max(1);

        let mut result = String::new();
        let mut kept = 0;

        for (i, chunk) in chunks.iter().enumerate() {
            if i % step.max(1) == 0 && kept < target_chunks {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&chunk.content);
                kept += 1;
            }
        }

        if kept < chunks.len() {
            result.push_str(&format!(
                "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
                chunks.len() - kept,
                (kept as f32 / chunks.len() as f32) * 100.0
            ));
        }

        Ok(result)
    }

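    /// Embedding-based compression: embeds every chunk, clusters chunks by
    /// cosine similarity, and keeps one representative per cluster.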
    #[cfg(feature = "embeddings")]
    fn compress_with_embeddings(&self, content: &str) -> Result<String> {
        let mut chunks = self.split_into_chunks(content);

        if chunks.is_empty() {
            return Ok(content.to_owned());
        }

        for chunk in &mut chunks {
            chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
        }

        let clusters = self.cluster_chunks(&chunks)?;

        let mut result = String::new();
        for cluster in clusters.values() {
            if let Some(representative) = self.select_representative(cluster) {
                if !result.is_empty() {
                    result.push_str("\n\n");
                }
                result.push_str(&representative.content);
            }
        }

        Ok(result)
    }

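    /// Greedy single-pass clustering: each chunk joins an existing cluster
    /// whose seed (first member) meets `similarity_threshold`, otherwise it
    /// starts a new cluster.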
    #[cfg(feature = "embeddings")]
    fn cluster_chunks<'a>(
        &self,
        chunks: &'a [CodeChunk],
    ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
        let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
        let mut next_cluster = 0;

        for chunk in chunks {
            let embedding = chunk
                .embedding
                .as_ref()
                .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;

            // Pick the target cluster first, so the map is not mutated while
            // it is still being iterated.
            let assigned = clusters.iter().find_map(|(&cluster_id, cluster_chunks)| {
                let first_emb = cluster_chunks.first()?.embedding.as_ref()?;
                let similarity = cosine_similarity(embedding, first_emb);
                if similarity >= self.config.similarity_threshold {
                    Some(cluster_id)
                } else {
                    None
                }
            });

            match assigned {
                Some(cluster_id) => clusters.get_mut(&cluster_id).unwrap().push(chunk),
                None => {
                    clusters.insert(next_cluster, vec![chunk]);
                    next_cluster += 1;
                }
            }
        }

        Ok(clusters)
    }

    #[cfg(feature = "embeddings")]
    fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
        chunks.iter().max_by_key(|c| c.content.len()).copied()
    }
}

impl Default for SemanticCompressor {
    fn default() -> Self {
        Self::new()
    }
}

pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

pub type HeuristicCompressor = SemanticCompressor;

pub type HeuristicCompressionConfig = SemanticConfig;

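/// Cosine similarity of two equal-length vectors,
/// `dot(a, b) / (||a|| * ||b||)`; returns 0.0 for mismatched, empty, or
/// zero-norm inputs.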
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    dot / (norm_a * norm_b)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        let _analyzer = compressor.analyzer();
    }

    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    #[test]
    fn test_split_into_chunks() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        });

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_heuristic_compression() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        });

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(!result.is_empty() || content.is_empty());
    }

    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    #[test]
    fn test_cosine_similarity_identical() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a = vec![1.0, 0.0, 0.0];
        let c = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = vec![];
        let b: Vec<f32> = vec![];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

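    // Illustrative checks of the fallback character-frequency embedding; a
    // sketch of the properties implemented above, not a spec for a learned
    // model.
    #[cfg(feature = "embeddings")]
    #[test]
    fn test_embed_is_unit_norm() {
        let analyzer = SemanticAnalyzer::new();
        let embedding = analyzer.embed("fn main() {}").unwrap();
        assert_eq!(embedding.len(), 384);
        let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-3);
    }

    #[cfg(feature = "embeddings")]
    #[test]
    fn test_similarity_identical_text() {
        let analyzer = SemanticAnalyzer::new();
        let sim = analyzer.similarity("let x = 1;", "let x = 1;").unwrap();
        assert!((sim - 1.0).abs() < 1e-3);
    }
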
    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::new();
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content);
    }

    #[test]
    fn test_repetitive_with_variation() {
        let compressor = SemanticCompressor::with_config(SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        });

        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5));
        }

        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }
}