1#[cfg(feature = "embeddings")]
33use std::collections::HashMap;
34
/// Result alias used throughout this file; the error type is always
/// [`SemanticError`].
pub type Result<T> = std::result::Result<T, SemanticError>;
37
/// Errors reported by the semantic analysis and compression code.
///
/// Each variant's `Display` text comes from its `#[error(...)]` attribute;
/// the `String` payloads are interpolated into that text as `{0}`.
#[derive(Debug, thiserror::Error)]
pub enum SemanticError {
    /// Loading a model failed; the payload describes the failure.
    #[error("Model loading failed: {0}")]
    ModelLoadError(String),

    /// Producing an embedding failed; the payload describes the failure.
    #[error("Embedding generation failed: {0}")]
    EmbeddingError(String),

    /// Clustering failed; the payload describes the failure.
    ///
    /// `cluster_chunks` returns this with "Missing embedding" when it is
    /// handed a chunk whose `embedding` is `None`.
    #[error("Clustering failed: {0}")]
    ClusteringError(String),

    /// The requested operation needs the `embeddings` cargo feature.
    // NOTE(review): nothing in the visible code constructs this variant.
    #[error("Feature not available: embeddings feature not enabled")]
    FeatureNotEnabled,
}
53
/// Produces fixed-length embedding vectors for text and compares them.
///
/// The single field holds an optional model path. Its name depends on the
/// `embeddings` cargo feature: `model_path` when the feature is enabled,
/// `_model_path` otherwise.
#[derive(Debug)]
pub struct SemanticAnalyzer {
    /// Model path supplied through `with_model`; `None` when built with `new`.
    #[cfg(feature = "embeddings")]
    model_path: Option<String>,
    /// Same value as above, stored under an underscore-prefixed name when the
    /// feature is off (presumably to silence the unused-field lint).
    #[cfg(not(feature = "embeddings"))]
    _model_path: Option<String>,
}
72
73impl SemanticAnalyzer {
74 pub fn new() -> Self {
76 Self {
77 #[cfg(feature = "embeddings")]
78 model_path: None,
79 #[cfg(not(feature = "embeddings"))]
80 _model_path: None,
81 }
82 }
83
84 pub fn with_model(model_path: &str) -> Self {
89 Self {
90 #[cfg(feature = "embeddings")]
91 model_path: Some(model_path.to_owned()),
92 #[cfg(not(feature = "embeddings"))]
93 _model_path: Some(model_path.to_owned()),
94 }
95 }
96
97 #[cfg(feature = "embeddings")]
99 pub fn model_path(&self) -> Option<&str> {
100 self.model_path.as_deref()
101 }
102
103 #[cfg(feature = "embeddings")]
121 pub fn embed(&self, content: &str) -> Result<Vec<f32>> {
122 let mut embedding = vec![0.0f32; 384];
124 for (i, c) in content.chars().enumerate() {
125 let idx = (c as usize) % 384;
126 embedding[idx] += 1.0 / ((i + 1) as f32);
128 }
129 let norm: f32 = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
131 if norm > 0.0 {
132 for x in &mut embedding {
133 *x /= norm;
134 }
135 }
136 Ok(embedding)
137 }
138
139 #[cfg(not(feature = "embeddings"))]
141 pub fn embed(&self, _content: &str) -> Result<Vec<f32>> {
142 Ok(vec![0.0; 384])
143 }
144
145 #[cfg(feature = "embeddings")]
147 pub fn similarity(&self, a: &str, b: &str) -> Result<f32> {
148 let emb_a = self.embed(a)?;
149 let emb_b = self.embed(b)?;
150 Ok(cosine_similarity(&emb_a, &emb_b))
151 }
152
153 #[cfg(not(feature = "embeddings"))]
155 pub fn similarity(&self, _a: &str, _b: &str) -> Result<f32> {
156 Ok(0.0)
157 }
158}
159
160impl Default for SemanticAnalyzer {
161 fn default() -> Self {
162 Self::new()
163 }
164}
165
/// Tunable parameters for `SemanticCompressor`.
#[derive(Debug, Clone)]
pub struct SemanticConfig {
    /// Minimum cosine similarity (compared with `>=`) for a chunk to join an
    /// existing cluster in the embeddings-based path.
    pub similarity_threshold: f32,
    /// Minimum chunk length in bytes; shorter segments are not emitted by the
    /// separator-based splitting passes.
    pub min_chunk_size: usize,
    /// Maximum chunk length in bytes; also the piece size used by the
    /// fixed-size fallback split.
    pub max_chunk_size: usize,
    /// Fraction of chunks the heuristic path keeps; the product with the
    /// chunk count is rounded up.
    pub budget_ratio: f32,
}
182
183impl Default for SemanticConfig {
184 fn default() -> Self {
185 Self {
186 similarity_threshold: 0.7,
187 min_chunk_size: 100,
188 max_chunk_size: 2000,
189 budget_ratio: 0.5,
190 }
191 }
192}
193
/// A contiguous slice of the input text produced by the chunk splitter.
#[derive(Debug, Clone)]
pub struct CodeChunk {
    /// Owned copy of the chunk's text.
    pub content: String,
    /// Byte offset in the original input where the chunk starts (inclusive).
    pub start: usize,
    /// Byte offset in the original input where the chunk ends (exclusive).
    pub end: usize,
    /// Embedding of `content`; `None` until the embeddings-based compression
    /// path fills it in.
    pub embedding: Option<Vec<f32>>,
    /// Cluster identifier; the splitter sets it to `None`.
    // NOTE(review): nothing in the visible code ever assigns `Some` here.
    pub cluster_id: Option<usize>,
}
208
209pub struct SemanticCompressor {
214 config: SemanticConfig,
215 analyzer: SemanticAnalyzer,
217}
218
219impl SemanticCompressor {
220 pub fn new() -> Self {
222 Self::with_config(SemanticConfig::default())
223 }
224
225 pub fn with_config(config: SemanticConfig) -> Self {
227 Self { config, analyzer: SemanticAnalyzer::new() }
228 }
229
230 pub fn analyzer(&self) -> &SemanticAnalyzer {
235 &self.analyzer
236 }
237
238 pub fn compress(&self, content: &str) -> Result<String> {
245 #[cfg(feature = "embeddings")]
246 {
247 return self.compress_with_embeddings(content);
248 }
249
250 #[cfg(not(feature = "embeddings"))]
251 {
252 self.compress_heuristic(content)
253 }
254 }
255
256 fn split_into_chunks(&self, content: &str) -> Vec<CodeChunk> {
258 let mut chunks = Vec::new();
259 let mut current_start = 0;
260
261 for (i, _) in content.match_indices("\n\n") {
263 if i > current_start && i - current_start >= self.config.min_chunk_size {
264 let chunk_content = &content[current_start..i];
265 if chunk_content.len() <= self.config.max_chunk_size {
266 chunks.push(CodeChunk {
267 content: chunk_content.to_owned(),
268 start: current_start,
269 end: i,
270 embedding: None,
271 cluster_id: None,
272 });
273 }
274 current_start = i + 2;
275 }
276 }
277
278 if current_start < content.len() {
280 let remaining = &content[current_start..];
281 if remaining.len() >= self.config.min_chunk_size {
282 chunks.push(CodeChunk {
283 content: remaining.to_owned(),
284 start: current_start,
285 end: content.len(),
286 embedding: None,
287 cluster_id: None,
288 });
289 }
290 }
291
292 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
294 current_start = 0;
295 for (i, _) in content.match_indices('\n') {
296 if i > current_start && i - current_start >= self.config.min_chunk_size {
297 let chunk_content = &content[current_start..i];
298 if chunk_content.len() <= self.config.max_chunk_size {
299 chunks.push(CodeChunk {
300 content: chunk_content.to_owned(),
301 start: current_start,
302 end: i,
303 embedding: None,
304 cluster_id: None,
305 });
306 }
307 current_start = i + 1;
308 }
309 }
310 if current_start < content.len() {
312 let remaining = &content[current_start..];
313 if remaining.len() >= self.config.min_chunk_size {
314 chunks.push(CodeChunk {
315 content: remaining.to_owned(),
316 start: current_start,
317 end: content.len(),
318 embedding: None,
319 cluster_id: None,
320 });
321 }
322 }
323 }
324
325 if chunks.is_empty() && content.len() >= self.config.min_chunk_size {
327 current_start = 0;
328 for (i, _) in content.match_indices(". ") {
329 if i > current_start && i - current_start >= self.config.min_chunk_size {
330 let chunk_content = &content[current_start..=i]; if chunk_content.len() <= self.config.max_chunk_size {
332 chunks.push(CodeChunk {
333 content: chunk_content.to_owned(),
334 start: current_start,
335 end: i + 1,
336 embedding: None,
337 cluster_id: None,
338 });
339 }
340 current_start = i + 2;
341 }
342 }
343 if current_start < content.len() {
345 let remaining = &content[current_start..];
346 if remaining.len() >= self.config.min_chunk_size {
347 chunks.push(CodeChunk {
348 content: remaining.to_owned(),
349 start: current_start,
350 end: content.len(),
351 embedding: None,
352 cluster_id: None,
353 });
354 }
355 }
356 }
357
358 if chunks.is_empty() && content.len() > self.config.max_chunk_size {
360 let mut pos = 0;
361 while pos < content.len() {
362 let end = (pos + self.config.max_chunk_size).min(content.len());
363 chunks.push(CodeChunk {
364 content: content[pos..end].to_owned(),
365 start: pos,
366 end,
367 embedding: None,
368 cluster_id: None,
369 });
370 pos = end;
371 }
372 }
373
374 chunks
375 }
376
377 fn compress_heuristic(&self, content: &str) -> Result<String> {
379 let chunks = self.split_into_chunks(content);
380
381 if chunks.is_empty() {
382 return Ok(content.to_owned());
383 }
384
385 let target_chunks = ((chunks.len() as f32) * self.config.budget_ratio).ceil() as usize;
387 let step = chunks.len() / target_chunks.max(1);
388
389 let mut result = String::new();
390 let mut kept = 0;
391
392 for (i, chunk) in chunks.iter().enumerate() {
393 if i % step.max(1) == 0 && kept < target_chunks {
394 if !result.is_empty() {
395 result.push_str("\n\n");
396 }
397 result.push_str(&chunk.content);
398 kept += 1;
399 }
400 }
401
402 if kept < chunks.len() {
404 result.push_str(&format!(
405 "\n\n/* ... {} chunks compressed ({:.0}% of original) ... */",
406 chunks.len() - kept,
407 (kept as f32 / chunks.len() as f32) * 100.0
408 ));
409 }
410
411 Ok(result)
412 }
413
414 #[cfg(feature = "embeddings")]
416 fn compress_with_embeddings(&self, content: &str) -> Result<String> {
417 let mut chunks = self.split_into_chunks(content);
418
419 if chunks.is_empty() {
420 return Ok(content.to_owned());
421 }
422
423 for chunk in &mut chunks {
425 chunk.embedding = Some(self.analyzer.embed(&chunk.content)?);
426 }
427
428 let clusters = self.cluster_chunks(&chunks)?;
430
431 let mut result = String::new();
433 for cluster in clusters.values() {
434 if let Some(representative) = self.select_representative(cluster) {
435 if !result.is_empty() {
436 result.push_str("\n\n");
437 }
438 result.push_str(&representative.content);
439 }
440 }
441
442 Ok(result)
443 }
444
445 #[cfg(feature = "embeddings")]
447 fn cluster_chunks<'a>(
448 &self,
449 chunks: &'a [CodeChunk],
450 ) -> Result<HashMap<usize, Vec<&'a CodeChunk>>> {
451 let mut clusters: HashMap<usize, Vec<&CodeChunk>> = HashMap::new();
452 let mut next_cluster = 0;
453
454 for chunk in chunks {
455 let embedding = chunk
456 .embedding
457 .as_ref()
458 .ok_or_else(|| SemanticError::ClusteringError("Missing embedding".into()))?;
459
460 let mut assigned = false;
462 for (&cluster_id, cluster_chunks) in &clusters {
463 if let Some(first) = cluster_chunks.first() {
464 if let Some(ref first_emb) = first.embedding {
465 let similarity = cosine_similarity(embedding, first_emb);
466 if similarity >= self.config.similarity_threshold {
467 clusters.get_mut(&cluster_id).unwrap().push(chunk);
468 assigned = true;
469 break;
470 }
471 }
472 }
473 }
474
475 if !assigned {
476 clusters.insert(next_cluster, vec![chunk]);
477 next_cluster += 1;
478 }
479 }
480
481 Ok(clusters)
482 }
483
484 #[cfg(feature = "embeddings")]
486 fn select_representative<'a>(&self, chunks: &[&'a CodeChunk]) -> Option<&'a CodeChunk> {
487 chunks.iter().max_by_key(|c| c.content.len()).copied()
489 }
490}
491
492impl Default for SemanticCompressor {
493 fn default() -> Self {
494 Self::new()
495 }
496}
497
/// Alternative name for [`SemanticAnalyzer`]. With the `embeddings` feature
/// its `embed` method builds vectors from character occurrences in the input.
pub type CharacterFrequencyAnalyzer = SemanticAnalyzer;

/// Alternative name for [`SemanticCompressor`].
pub type HeuristicCompressor = SemanticCompressor;

/// Alternative name for [`SemanticConfig`].
pub type HeuristicCompressionConfig = SemanticConfig;
521
/// Cosine similarity of two equally sized vectors.
///
/// Returns `0.0` when the slices are empty, differ in length, or either one
/// has zero magnitude; otherwise `dot(a, b) / (|a| * |b|)`.
#[cfg_attr(not(feature = "embeddings"), allow(dead_code))]
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let magnitude = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (mag_a, mag_b) = (magnitude(a), magnitude(b));
    // A zero-length vector has no direction; avoid dividing by zero.
    if mag_a == 0.0 || mag_b == 0.0 {
        return 0.0;
    }

    let dot_product = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
    dot_product / (mag_a * mag_b)
}
552
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly built analyzer has no model path.
    #[test]
    fn test_analyzer_creation() {
        let fresh = SemanticAnalyzer::new();
        #[cfg(feature = "embeddings")]
        assert!(fresh.model_path().is_none());
        #[cfg(not(feature = "embeddings"))]
        drop(fresh);
    }

    /// `with_model` stores the path it is given.
    #[test]
    fn test_analyzer_with_model() {
        let configured = SemanticAnalyzer::with_model("/path/to/model");
        #[cfg(feature = "embeddings")]
        assert_eq!(configured.model_path(), Some("/path/to/model"));
        #[cfg(not(feature = "embeddings"))]
        drop(configured);
    }

    /// The compressor exposes its analyzer.
    #[test]
    fn test_compressor_analyzer_access() {
        let compressor = SemanticCompressor::new();
        let _borrowed: &SemanticAnalyzer = compressor.analyzer();
    }

    /// Spot-checks two of the default configuration values.
    #[test]
    fn test_semantic_config_default() {
        let SemanticConfig { similarity_threshold, budget_ratio, .. } = SemanticConfig::default();
        assert_eq!(similarity_threshold, 0.7);
        assert_eq!(budget_ratio, 0.5);
    }

    /// Blank-line separated input is split into several chunks.
    #[test]
    fn test_split_into_chunks() {
        let config = SemanticConfig {
            min_chunk_size: 10,
            max_chunk_size: 1000,
            ..Default::default()
        };
        let compressor = SemanticCompressor::with_config(config);

        let produced =
            compressor.split_into_chunks("First chunk here\n\nSecond chunk here\n\nThird chunk");
        assert!(produced.len() >= 2);
    }

    /// Heuristic compression yields non-empty output for non-empty input.
    #[test]
    fn test_heuristic_compression() {
        let config = SemanticConfig {
            min_chunk_size: 5,
            max_chunk_size: 100,
            budget_ratio: 0.5,
            ..Default::default()
        };
        let compressor = SemanticCompressor::with_config(config);

        let input = "Chunk 1\n\nChunk 2\n\nChunk 3\n\nChunk 4";
        let output = compressor.compress_heuristic(input).unwrap();
        assert!(!output.is_empty() || input.is_empty());
    }

    /// Empty input compresses to empty output.
    #[test]
    fn test_empty_content() {
        let output = SemanticCompressor::new().compress("").unwrap();
        assert_eq!(output, "");
    }

    /// Identical vectors have similarity 1.
    #[test]
    fn test_cosine_similarity_identical() {
        let unit_x = [1.0_f32, 0.0, 0.0];
        let same = [1.0_f32, 0.0, 0.0];
        let score = cosine_similarity(&unit_x, &same);
        assert!((score - 1.0).abs() < 0.001);
    }

    /// Orthogonal vectors have similarity 0.
    #[test]
    fn test_cosine_similarity_orthogonal() {
        let unit_x = [1.0_f32, 0.0, 0.0];
        let unit_y = [0.0_f32, 1.0, 0.0];
        let score = cosine_similarity(&unit_x, &unit_y);
        assert!(score.abs() < 0.001);
    }

    /// Empty vectors are defined to have similarity 0.
    #[test]
    fn test_cosine_similarity_empty() {
        let nothing: [f32; 0] = [];
        assert_eq!(cosine_similarity(&nothing, &nothing), 0.0);
    }
}