1#[cfg(feature = "embeddings")]
33use std::collections::HashMap;
34
35pub type Result<T> = std::result::Result<T, SemanticError>;
37
38#[derive(Debug, thiserror::Error)]
40pub enum SemanticError {
41 #[error("Model loading failed: {0}")]
42 ModelLoadError(String),
43
44 #[error("Embedding generation failed: {0}")]
45 EmbeddingError(String),
46
47 #[error("Clustering failed: {0}")]
48 ClusteringError(String),
49
50 #[error("Feature not available: embeddings feature not enabled")]
51 FeatureNotEnabled,
52}
53
/// Analyzer that maps text to fixed-length vectors and compares them.
///
/// The only state is an optional model path. The field is called
/// `model_path` when the `embeddings` feature is enabled and `_model_path`
/// otherwise; the leading underscore keeps the compiler from warning about a
/// field that is never read in that configuration.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Model location supplied by the caller, if any.
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Model location supplied by the caller, if any (stored but never read
    /// without the `embeddings` feature).
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}
72
73impl SemanticAnalyzer {
74 pub fn new() -> Self {
76 Self {
77 #[cfg(feature = "embeddings")]
78 model_path: None,
79 #[cfg(not(feature = "embeddings"))]
80 _model_path: None,
81 }
82 }
83
84 pub fn with_model(model_path: &str) -> Self {
89 Self {
90 #[cfg(feature = "embeddings")]
91 model_path: Some(model_path.to_owned()),
92 #[cfg(not(feature = "embeddings"))]
93 _model_path: Some(model_path.to_owned()),
94 }
95 }
96
97 #[cfg(feature = "embeddings")]
99 pub fn model_path(&self) -> Option<&str> {
100 self.model_path.as_deref()
101 }
102
103 #[cfg(feature = "embeddings")]
121 pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122 let mut embedding = vec![0.0f32; 384];
124 for (i, c) in content.chars().enumerate() {
125 let idx = (c as usize) % 384;
126 embedding[idx] += 1.0 / ((i + 1) as f32);
128 }
129 let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131 if norm > 0.0 {
132 for x in &mut embedding {
133 *x /= norm;
134 }
135 }
136 Ok(embedding)
137 }
138
139 #[cfg(not(feature = "embeddings"))]
141 pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142 Ok(vec![0.0; 384])
143 }
144
145 #[cfg(feature = "embeddings")]
147 pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148 let emb_a = self.embed(a)?;
149 let emb_b = self.embed(b)?;
150 Ok(cosine_similarity(&emb_a, &emb_b))
151 }
152
153 #[cfg(not(feature = "embeddings"))]
155 pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156 Ok(0.0)
157 }
158}
159
160impl Default for SemanticAnalyzer {
161 fn default() -> Self {
162 Self::new()
163 }
164}
165
/// Tuning knobs for [`SemanticCompressor`].
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Minimum cosine similarity for a chunk to join an existing cluster.
    pub similarity_threshold: f32,
    /// Smallest chunk, in bytes, that the splitter will emit.
    pub min_chunk_size: usize,
    /// Largest chunk, in bytes, that the splitter will emit.
    pub max_chunk_size: usize,
    /// Fraction of the material to keep: scales how many pattern instances
    /// or chunks survive compression.
    pub budget_ratio: f32,
}
182
183impl Default for SemanticConfig {
184 fn default() -> Self {
185 Self {
186 similarity_threshold: 0.7,
187 min_chunk_size: 100,
188 max_chunk_size: 2000,
189 budget_ratio: 0.5,
190 }
191 }
192}
193
/// A contiguous slice of the input produced by the chunk splitter.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Owned copy of the chunk's text.
    pub content: String,
    /// Byte offset in the original input where the chunk begins.
    pub start: usize,
    /// Byte offset in the original input just past the chunk's last byte.
    pub end: usize,
    /// Embedding vector; `None` until one has been computed for the chunk.
    pub embedding: Option<Vec<f32>>,
    /// Cluster the chunk was assigned to, if any.
    pub cluster_id: Option<usize>,
}
208
209pub struct SemanticCompressor {
214 config: SemanticConfig,
215 analyzer: SemanticAnalyzer,
217}
218
219impl SemanticCompressor {
220 pub fn new() -> Self {
222 Self::with_config(SemanticConfig::default())
223 }
224
225 pub fn with_config(config: SemanticConfig) -> Self {
227 Self { config, analyzer: SemanticAnalyzer::new() }
228 }
229
230 pub fn analyzer(&self) -> &SemanticAnalyzer {
235 &self.analyzer
236 }
237
238 pub fn compress(&self, content: &str) -> Result<String> {
245 if let Some(compressed) = self.compress_repetitive(content) {
247 return Ok(compressed);
248 }
249
250 #[cfg(feature = "embeddings")]
251 {
252 return self.compress_with_embeddings(content);
253 }
254
255 #[cfg(not(feature = "embeddings"))]
256 {
257 self.compress_heuristic(content)
258 }
259 }
260
261 fn compress_repetitive(&self, content: &str) -> Option<String> {
266 if content.len() < 200 {
268 return None;
269 }
270
271 for pattern_len in 1..=100.min(content.len() / 3) {
274 let pattern = &content[..pattern_len];
275
276 if pattern.chars().all(|c| c.is_whitespace()) {
278 continue;
279 }
280
281 let mut count = 0;
283 let mut pos = 0;
284 while pos + pattern_len <= content.len() {
285 if &content[pos..pos + pattern_len] == pattern {
286 count += 1;
287 pos += pattern_len;
288 } else {
289 break;
290 }
291 }
292
293 let coverage = (count * pattern_len) as f32 / content.len() as f32;
295 if count >= 3 && coverage >= 0.8 {
296 let instances_to_show = (count as f32 * self.config.budget_ratio)
298 .ceil()
299 .clamp(1.0, 5.0) as usize;
300
301 let shown_content = pattern.repeat(instances_to_show);
302 let remainder = &content[count * pattern_len..];
303
304 let result = if remainder.is_empty() {
305 format!(
306 "{}\n/* ... pattern repeated {} times (showing {}) ... */",
307 shown_content.trim_end(),
308 count,
309 instances_to_show
310 )
311 } else {
312 format!(
313 "{}\n/* ... pattern repeated {} times (showing {}) ... */\n{}",
314 shown_content.trim_end(),
315 count,
316 instances_to_show,
317 remainder.trim()
318 )
319 };
320
321 return Some(result);
322 }
323 }
324
325 let lines: Vec<&str> = content.lines().collect();
327 if lines.len() >= 3 {
328 let mut line_counts: std::collections::HashMap<&str, usize> =
329 std::collections::HashMap::new();
330 for line in &lines {
331 *line_counts.entry(*line).or_insert(0) += 1;
332 }
333
334 if let Some((repeated_line, count)) = line_counts
336 .iter()
337 .filter(|(line, _)| !line.trim().is_empty())
338 .max_by_key(|(_, count)| *count)
339 {
340 let repetition_ratio = *count as f32 / lines.len() as f32;
341 if *count >= 3 && repetition_ratio >= 0.5 {
342 let mut result = String::new();
344 let mut consecutive_count = 0;
345 let mut last_was_repeated = false;
346
347 for line in &lines {
348 if *line == *repeated_line {
349 consecutive_count += 1;
350 if !last_was_repeated {
351 if !result.is_empty() {
352 result.push('\n');
353 }
354 result.push_str(line);
355 }
356 last_was_repeated = true;
357 } else {
358 if last_was_repeated && consecutive_count > 1 {
359 result.push_str(&format!(
360 "\n/* ... above line repeated {} times ... */",
361 consecutive_count
362 ));
363 }
364 consecutive_count = 0;
365 last_was_repeated = false;
366 if !result.is_empty() {
367 result.push('\n');
368 }
369 result.push_str(line);
370 }
371 }
372
373 if last_was_repeated && consecutive_count > 1 {
374 result.push_str(&format!(
375 "\n/* ... above line repeated {} times ... */",
376 consecutive_count
377 ));
378 }
379
380 if result.len() < content.len() / 2 {
382 return Some(result);
383 }
384 }
385 }
386 }
387
388 None
389 }
390
391 fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
393 let mut chunks = Vec::new();
394 let mut current_start = 0;
395
396 for (i, _) in content.match_indices("\n\n") {
398 if i > current_start && i - current_start >= self.config.min_chunk_size {
399 let chunk_content = &content[current_start..i];
400 if chunk_content.len() <= self.config.max_chunk_size {
401 chunks.push(CodeChunk {
402 content: chunk_content.to_owned(),
403 start: current_start,
404 end: i,
405 embedding: None,
406 cluster_id: None,
407 });
408 }
409 current_start = i + 2;
410 }
411 }
412
413 if current_start < content.len() {
415 let remaining = &content[current_start..];
416 if remaining.len() >= self.config.min_chunk_size {
417 chunks.push(CodeChunk {
418 content: remaining.to_owned(),
419 start: current_start,
420 end: content.len(),
421 embedding: None,
422 cluster_id: None,
423 });
424 }
425 }
426
427 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
429 current_start = 0;
430 for (i, _) in content.match_indices('\n') {
431 if i > current_start && i - current_start >= self.config.min_chunk_size {
432 let chunk_content = &content[current_start..i];
433 if chunk_content.len() <= self.config.max_chunk_size {
434 chunks.push(CodeChunk {
435 content: chunk_content.to_owned(),
436 start: current_start,
437 end: i,
438 embedding: None,
439 cluster_id: None,
440 });
441 }
442 current_start = i + 1;
443 }
444 }
445 if current_start < content.len() {
447 let remaining = &content[current_start..];
448 if remaining.len() >= self.config.min_chunk_size {
449 chunks.push(CodeChunk {
450 content: remaining.to_owned(),
451 start: current_start,
452 end: content.len(),
453 embedding: None,
454 cluster_id: None,
455 });
456 }
457 }
458 }
459
460 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
462 current_start = 0;
463 for (i, _) in content.match_indices(". ") {
464 if i > current_start && i - current_start >= self.config.min_chunk_size {
465 let chunk_content = &content[current_start..=i]; if chunk_content.len() <= self.config.max_chunk_size {
467 chunks.push(CodeChunk {
468 content: chunk_content.to_owned(),
469 start: current_start,
470 end: i + 1,
471 embedding: None,
472 cluster_id: None,
473 });
474 }
475 current_start = i + 2;
476 }
477 }
478 if current_start < content.len() {
480 let remaining = &content[current_start..];
481 if remaining.len() >= self.config.min_chunk_size {
482 chunks.push(CodeChunk {
483 content: remaining.to_owned(),
484 start: current_start,
485 end: content.len(),
486 embedding: None,
487 cluster_id: None,
488 });
489 }
490 }
491 }
492
493 if chunks.is_empty() && content.len() > self.config.max_chunk_size {
495 let mut pos = 0;
496 while pos < content.len() {
497 let end = (pos + self.config.max_chunk_size).min(content.len());
498 chunks.push(CodeChunk {
499 content: content[pos..end].to_owned(),
500 start: pos,
501 end,
502 embedding: None,
503 cluster_id: None,
504 });
505 pos = end;
506 }
507 }
508
509 chunks
510 }
511
512 fn compress_heuristic(&self, content: &str) -> Result<String> {
514 let chunks = self.split_into_chunks(content);
515
516 if chunks.is_empty() {
517 return Ok(content.to_owned());
518 }
519
520 let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
522 let step = chunks.len() / target_chunks.max(1);
523
524 let mut result = String::new();
525 let mut kept = 0;
526
527 for (i, chunk) in chunks.iter().enumerate() {
528 if i % step.max(1) == 0 && kept < target_chunks {
529 if !result.is_empty() {
530 result.push_str("\n\n");
531 }
532 result.push_str(&chunk.content);
533 kept += 1;
534 }
535 }
536
537 if kept < chunks.len() {
539 result.push_str(&format!(
540 "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
541 chunks.len() - kept,
542 (kept as f32 / chunks.len() as f32) * 100.0
543 ));
544 }
545
546 Ok(result)
547 }
548
549 #[cfg(feature = "embeddings")]
551 fn compress_with_embeddings(&self, content: &str) -> Result<String> {
552 let mut chunks = self.split_into_chunks(content);
553
554 if chunks.is_empty() {
555 return Ok(content.to_owned());
556 }
557
558 for chunk in &mut chunks {
560 chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
561 }
562
563 let clusters = self.cluster_chunks(&chunks)?;
565
566 let mut result = String::new();
568 for cluster in clusters.values() {
569 if let Some(representative) = self.select_representative(cluster) {
570 if !result.is_empty() {
571 result.push_str("\n\n");
572 }
573 result.push_str(&representative.content);
574 }
575 }
576
577 Ok(result)
578 }
579
580 #[cfg(feature = "embeddings")]
582 fn cluster_chunks<'a>(
583 &self,
584 chunks: &'a [CodeChunk],
585 ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
586 let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
587 let mut next_cluster = 0;
588
589 for chunk in chunks {
590 let embedding = chunk
591 .embedding
592 .as_ref()
593 .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
594
595 let mut target_cluster = None;
597 for (&cluster_id, cluster_chunks) in &clusters {
598 if let Some(first) = cluster_chunks.first() {
599 if let Some(ref first_emb) = first.embedding {
600 let similarity = cosine_similarity(embedding, first_emb);
601 if similarity >= self.config.similarity_threshold {
602 target_cluster = Some(cluster_id);
603 break;
604 }
605 }
606 }
607 }
608
609 if let Some(cluster_id) = target_cluster {
610 if let Some(cluster) = clusters.get_mut(&cluster_id) {
611 cluster.push(chunk);
612 }
613 } else {
614 clusters.insert(next_cluster, vec![chunk]);
615 next_cluster += 1;
616 }
617 }
618
619 Ok(clusters)
620 }
621
622 #[cfg(feature = "embeddings")]
624 fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
625 chunks.iter().max_by_key(|c| c.content.len()).copied()
627 }
628}
629
630impl Default for SemanticCompressor {
631 fn default() -> Self {
632 Self::new()
633 }
634}
635
636pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;
649
650pub type HeuristicCompressor = SemanticCompressor;
656
657pub type HeuristicCompressionConfig = SemanticConfig;
659
/// Returns the cosine similarity of `a` and `b`, always within `[-1.0, 1.0]`.
///
/// Returns `0.0` for degenerate input: slices of different length, empty
/// slices, a zero-length vector on either side, or a result that is not a
/// number (for example when an input contains NaN).
// Only the embeddings code path calls this outside of tests.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    // One pass accumulates the dot product and both squared norms.
    let (dot, sq_a, sq_b) = a.iter().zip(b).fold(
        (0.0f32, 0.0f32, 0.0f32),
        |(dot, sq_a, sq_b), (x, y)| (dot + x * y, sq_a + x * x, sq_b + y * y),
    );

    // Covers a zero norm on either side and a product that underflows to 0.
    let denominator = sq_a.sqrt() * sq_b.sqrt();
    if denominator == 0.0 {
        return 0.0;
    }

    let similarity = dot / denominator;
    if similarity.is_nan() {
        0.0
    } else {
        // Rounding can push parallel vectors slightly past +/-1.
        similarity.clamp(-1.0, 1.0)
    }
}
690
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly created analyzer has no model path.
    #[test]
    fn test_analyzer_creation() {
        let analyzer = SemanticAnalyzer::new();
        #[cfg(feature = "embeddings")]
        assert!(analyzer.model_path().is_none());
        // Without the feature there is no accessor; just consume the value.
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    /// `with_model` stores the path it is given.
    #[test]
    fn test_analyzer_with_model() {
        let analyzer = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(analyzer.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(analyzer);
    }

    /// The compressor exposes its analyzer.
    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        let _analyzer = compressor.analyzer();
    }

    /// Default thresholds match the documented values.
    #[test]
    fn test_semantic_config_default() {
        let config = SemanticConfig::default();
        assert_eq!(config.similarity_threshold, 0.7);
        assert_eq!(config.budget_ratio, 0.5);
    }

    /// Blank-line separated paragraphs become separate chunks.
    #[test]
    fn test_split_into_chunks() {
        let config = SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        };
        let compressor = SemanticCompressor::with_config(config);

        let content = "First chunk here\n\nSecond chunk here\n\nThird chunk";
        let chunks = compressor.split_into_chunks(content);
        assert!(chunks.len() >= 2);
    }

    /// Heuristic compression of non-empty input yields non-empty output.
    #[test]
    fn test_heuristic_compression() {
        let config = SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        };
        let compressor = SemanticCompressor::with_config(config);

        let content = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let result = compressor.compress_heuristic(content).unwrap();
        assert!(!result.is_empty() || content.is_empty());
    }

    /// Empty input compresses to empty output.
    #[test]
    fn test_empty_content() {
        let compressor = SemanticCompressor::new();
        let result = compressor.compress("").unwrap();
        assert_eq!(result, "");
    }

    /// Identical vectors have similarity 1.
    #[test]
    fn test_cosine_similarity_identical() {
        let a: [f32; 3] = [1.0, 0.0, 0.0];
        let b: [f32; 3] = [1.0, 0.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    /// Orthogonal vectors have similarity 0.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let a: [f32; 3] = [1.0, 0.0, 0.0];
        let c: [f32; 3] = [0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &c);
        assert!(sim.abs() < 0.001);
    }

    /// Empty vectors are treated as having no similarity.
    #[test]
    fn test_cosine_similarity_empty() {
        let a: Vec<f32> = Vec::new();
        let b: Vec<f32> = Vec::new();
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    /// A short pattern repeated many times is collapsed with a marker.
    #[test]
    fn test_repetitive_pattern_compression() {
        let compressor = SemanticCompressor::new();
        let content = "sentence ".repeat(500);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );

        assert!(result.contains("sentence"));
        assert!(
            result.contains("repeated") || result.contains("pattern"),
            "Should indicate compression occurred"
        );
    }

    /// A line repeated many times is collapsed to under half the size.
    #[test]
    fn test_repetitive_line_compression() {
        let compressor = SemanticCompressor::new();
        let content = "same line\n".repeat(100);
        let result = compressor.compress(&content).unwrap();

        assert!(
            result.len() < content.len() / 2,
            "Compressed size {} should be less than half of original {}",
            result.len(),
            content.len()
        );
    }

    /// Short, non-repetitive input passes through untouched.
    #[test]
    fn test_non_repetitive_content_unchanged() {
        let compressor = SemanticCompressor::new();
        let content = "This is some unique content that does not repeat.";
        let result = compressor.compress(content).unwrap();

        assert_eq!(result, content);
    }

    /// A longer repeating cycle still produces some output.
    #[test]
    fn test_repetitive_with_variation() {
        let config = SemanticConfig {
            budget_ratio: 0.3,
            ..Default::default()
        };
        let compressor = SemanticCompressor::with_config(config);

        let mut content = String::new();
        for i in 0..50 {
            content.push_str(&format!("item {} ", i % 5));
        }

        let result = compressor.compress(&content).unwrap();
        assert!(!result.is_empty());
    }
}