1use serde::{Deserialize, Serialize};
7use std::collections::hash_map::DefaultHasher;
8use std::collections::HashMap;
9use std::hash::{Hash, Hasher};
10
/// Upper bound on the token count of a single embedding input.
///
/// [`EmbeddingUnit::needs_chunking`] returns `true` for units whose
/// `token_count` is strictly greater than this value.
pub const MAX_EMBEDDING_TOKENS: usize = 8_192;

/// Token budget for a code preview; smaller than [`MAX_EMBEDDING_TOKENS`].
pub const MAX_CODE_PREVIEW_TOKENS: usize = 6_000;

/// Number of tokens shared by neighbouring chunks — presumably applied by the
/// chunking code, which is not part of this section.
pub const CHUNK_OVERLAP_TOKENS: usize = 200;
24
/// A named keyword pattern describing one semantic category of code.
///
/// Both fields are `'static` string slices, so values can be stored in a
/// `static` table such as [`SEMANTIC_PATTERNS`].
#[derive(Clone, Debug)]
pub struct SemanticPattern {
    /// Category label, e.g. `"crud"` or `"validation"`.
    pub name: &'static str,
    /// Pattern source text. The built-in table uses regex syntax
    /// (`\b(...|...)\b`); nothing in this type compiles it, so that is
    /// presumably left to the consumer.
    pub pattern: &'static str,
}
38
39pub static SEMANTIC_PATTERNS: &[SemanticPattern] = &[
42 SemanticPattern {
44 name: "crud",
45 pattern: r"\b(create|read|update|delete|insert|select|save|load|fetch|store|persist|get|set|add|remove)\b",
46 },
47 SemanticPattern {
48 name: "validation",
49 pattern: r"\b(valid|validate|check|verify|assert|ensure|sanitize|normalize|parse|format)\b",
50 },
51 SemanticPattern {
52 name: "transform",
53 pattern: r"\b(convert|transform|map|reduce|filter|sort|merge|split|join|serialize|deserialize)\b",
54 },
55 SemanticPattern {
57 name: "error_handling",
58 pattern: r"\b(try|catch|except|raise|throw|error|exception|fail|panic)\b",
59 },
60 SemanticPattern {
61 name: "async_ops",
62 pattern: r"\b(async|await|promise|future|callback|then|concurrent|parallel|thread)\b",
63 },
64 SemanticPattern {
65 name: "iteration",
66 pattern: r"\b(for|while|loop|iterate|each|map|reduce|filter)\b",
67 },
68 SemanticPattern {
70 name: "api_endpoint",
71 pattern: r"\b(route|endpoint|handler|controller|get|post|put|delete|patch|request|response)\b",
72 },
73 SemanticPattern {
74 name: "database",
75 pattern: r"\b(query|sql|select|insert|update|delete|table|schema|migration|model|entity)\b",
76 },
77 SemanticPattern {
78 name: "auth",
79 pattern: r"\b(auth|login|logout|session|token|jwt|oauth|permission|role|access)\b",
80 },
81 SemanticPattern {
82 name: "cache",
83 pattern: r"\b(cache|memoize|memo|store|redis|memcache|ttl|expire|invalidate)\b",
84 },
85 SemanticPattern {
87 name: "test",
88 pattern: r"\b(test|spec|mock|stub|assert|expect|should|describe|it)\b",
89 },
90 SemanticPattern {
91 name: "logging",
92 pattern: r"\b(log|logger|debug|info|warn|error|trace|print|console)\b",
93 },
94 SemanticPattern {
95 name: "config",
96 pattern: r"\b(config|setting|option|env|environment|parameter|argument)\b",
97 },
98];
99
/// The kind of code construct an [`EmbeddingUnit`] represents.
///
/// Serde (de)serializes the variants under their lowercase names
/// (`rename_all = "lowercase"`), which are the same strings that
/// [`UnitKind::as_str`] returns.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum UnitKind {
    /// A function (`"function"`).
    Function,
    /// A method (`"method"`); [`EmbeddingUnit::qualified_name`] includes the
    /// parent name for units of this kind.
    Method,
    /// A class (`"class"`).
    Class,
    /// A module (`"module"`).
    Module,
    /// A chunk (`"chunk"`) — presumably one piece of a unit that was split.
    Chunk,
}
119
120impl UnitKind {
121 #[must_use]
123 pub fn as_str(&self) -> &'static str {
124 match self {
125 Self::Function => "function",
126 Self::Method => "method",
127 Self::Class => "class",
128 Self::Module => "module",
129 Self::Chunk => "chunk",
130 }
131 }
132}
133
134impl std::fmt::Display for UnitKind {
135 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
136 write!(f, "{}", self.as_str())
137 }
138}
139
/// Simple structural complexity counters for a piece of code.
///
/// The `Default` value has every counter at zero. How the counters are
/// measured is not defined in this section; the thresholds that make a value
/// "complex" live in [`CodeComplexity::is_complex`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CodeComplexity {
    /// Nesting depth; values above 3 are reported as "deep nesting".
    pub depth: usize,
    /// Branch count; values above 5 are reported as "many branches".
    pub branches: usize,
    /// Loop count; values above 2 are reported as "multiple loops".
    pub loops: usize,
}
150
151impl CodeComplexity {
152 #[must_use]
154 pub fn empty() -> Self {
155 Self::default()
156 }
157
158 #[must_use]
160 pub fn is_complex(&self) -> bool {
161 self.depth > 3 || self.branches > 5 || self.loops > 2
162 }
163
164 #[must_use]
166 pub fn describe(&self) -> Option<String> {
167 let mut parts = Vec::new();
168 if self.depth > 3 {
169 parts.push("deep nesting");
170 }
171 if self.branches > 5 {
172 parts.push("many branches");
173 }
174 if self.loops > 2 {
175 parts.push("multiple loops");
176 }
177 if parts.is_empty() {
178 None
179 } else {
180 Some(parts.join(", "))
181 }
182 }
183}
184
/// A single piece of code together with the metadata stored alongside it.
///
/// The fields up to and including `language` are always serialized and are
/// required when deserializing. The remaining fields are optional on input
/// (`#[serde(default)]`), and the `Vec`/`String` ones are omitted from the
/// output when empty.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingUnit {
    /// Identifier; [`EmbeddingUnit::new`] sets it to `"{file}::{name}"`.
    pub id: String,

    /// File the unit belongs to.
    pub file: String,

    /// Name of the unit.
    pub name: String,

    /// What kind of construct this unit is.
    pub kind: UnitKind,

    /// Source text of the unit.
    pub code: String,

    /// Signature text; empty when not set.
    pub signature: String,

    /// Documentation string, if any.
    pub docstring: Option<String>,

    /// First line of the unit.
    pub start_line: usize,

    /// Last line of the unit; [`EmbeddingUnit::new`] initializes it to
    /// `start_line`.
    pub end_line: usize,

    /// Token count of the unit; compared against `MAX_EMBEDDING_TOKENS` by
    /// [`EmbeddingUnit::needs_chunking`].
    pub token_count: usize,

    /// Semantic category tags — presumably names from `SEMANTIC_PATTERNS`.
    pub semantic_tags: Vec<String>,

    /// Name of the enclosing construct, if any; used by
    /// [`EmbeddingUnit::qualified_name`] for methods.
    pub parent: Option<String>,

    /// Language name, e.g. `"python"`.
    pub language: String,

    /// Names this unit calls. Defaults to empty; omitted when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub calls: Vec<String>,

    /// Names that call this unit. Defaults to empty; omitted when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub called_by: Vec<String>,

    /// Presumably a control-flow summary. Defaults to empty; omitted when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub cfg_summary: String,

    /// Presumably a data-flow summary. Defaults to empty; omitted when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub dfg_summary: String,

    /// Dependency description. Defaults to empty; omitted when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub dependencies: String,

    /// Complexity counters; all zero when absent from the input.
    #[serde(default)]
    pub complexity: CodeComplexity,

    /// Index of this chunk; `0` when absent from the input.
    #[serde(default)]
    pub chunk_index: usize,

    /// Total number of chunks; `1` when absent from the input. A value above 1
    /// makes [`EmbeddingUnit::is_chunk`] return `true`.
    #[serde(default = "default_chunk_total")]
    pub chunk_total: usize,
}
265
/// Serde default for `EmbeddingUnit::chunk_total`: a unit that carries no
/// chunk information counts as a single chunk.
fn default_chunk_total() -> usize {
    const UNSPLIT_CHUNK_TOTAL: usize = 1;
    UNSPLIT_CHUNK_TOTAL
}
269
270impl EmbeddingUnit {
271 #[must_use]
273 pub fn new(
274 file: impl Into<String>,
275 name: impl Into<String>,
276 kind: UnitKind,
277 code: impl Into<String>,
278 start_line: usize,
279 language: impl Into<String>,
280 ) -> Self {
281 let name = name.into();
282 let file = file.into();
283 let code = code.into();
284
285 Self {
286 id: format!("{}::{}", file, name),
287 file,
288 name,
289 kind,
290 code,
291 signature: String::new(),
292 docstring: None,
293 start_line,
294 end_line: start_line,
295 token_count: 0,
296 semantic_tags: Vec::new(),
297 parent: None,
298 language: language.into(),
299 calls: Vec::new(),
300 called_by: Vec::new(),
301 cfg_summary: String::new(),
302 dfg_summary: String::new(),
303 dependencies: String::new(),
304 complexity: CodeComplexity::default(),
305 chunk_index: 0,
306 chunk_total: 1,
307 }
308 }
309
310 #[must_use]
312 pub fn is_chunk(&self) -> bool {
313 self.chunk_total > 1
314 }
315
316 #[must_use]
318 pub fn needs_chunking(&self) -> bool {
319 self.token_count > MAX_EMBEDDING_TOKENS
320 }
321
322 #[must_use]
324 pub fn qualified_name(&self) -> String {
325 match &self.parent {
326 Some(parent) if self.kind == UnitKind::Method => {
327 format!("{}::{}.{}", self.file, parent, self.name)
328 }
329 _ => format!("{}::{}", self.file, self.name),
330 }
331 }
332
333 #[must_use]
335 pub fn to_map(&self) -> HashMap<String, serde_json::Value> {
336 serde_json::to_value(self)
337 .ok()
338 .and_then(|v| v.as_object().cloned())
339 .map(|m| m.into_iter().collect())
340 .unwrap_or_default()
341 }
342}
343
/// One search hit: a unit together with its score and optional highlights.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// The unit that matched.
    pub unit: EmbeddingUnit,

    /// Score assigned to the match. Its scale and ordering are defined by the
    /// producer, which is not part of this section.
    pub score: f32,

    /// Highlight strings for the match. Defaults to empty on input and is
    /// omitted from the output when empty.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub highlights: Vec<String>,
}
357
358impl SearchResult {
359 #[must_use]
361 pub fn new(unit: EmbeddingUnit, score: f32) -> Self {
362 Self {
363 unit,
364 score,
365 highlights: Vec::new(),
366 }
367 }
368
369 #[must_use]
371 pub fn with_highlights(unit: EmbeddingUnit, score: f32, highlights: Vec<String>) -> Self {
372 Self {
373 unit,
374 score,
375 highlights,
376 }
377 }
378}
379
/// A piece of text together with the character range it was assigned.
#[derive(Clone, Debug)]
pub struct ChunkInfo {
    /// The chunk's text.
    pub text: String,
    /// Start of the chunk — presumably an offset into the original text.
    pub start_char: usize,
    /// End of the chunk. Whether it is inclusive or exclusive is not
    /// established here.
    pub end_char: usize,
}

impl ChunkInfo {
    /// Bundles the text and its range; the values are stored as given, with
    /// no validation.
    #[must_use]
    pub fn new(text: String, start_char: usize, end_char: usize) -> Self {
        ChunkInfo {
            text,
            start_char,
            end_char,
        }
    }
}
402
/// Where a piece of code was found: file, name and line.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct CodeLocation {
    /// File containing the code.
    pub file: String,
    /// Name associated with the code (`ContentHashedIndex::add` passes the
    /// function name).
    pub name: String,
    /// Line number of the code.
    pub line: usize,
}

impl CodeLocation {
    /// Creates a location, converting `file` and `name` into owned strings.
    #[must_use]
    pub fn new(file: impl Into<String>, name: impl Into<String>, line: usize) -> Self {
        let file: String = file.into();
        let name: String = name.into();
        CodeLocation { file, name, line }
    }
}
429
430#[derive(Debug, Clone, Default)]
455pub struct ContentHashedIndex {
456 seen: HashMap<u64, CodeLocation>,
458 pub duplicates_found: usize,
460 pub unique_items: usize,
462}
463
464impl ContentHashedIndex {
465 #[must_use]
467 pub fn new() -> Self {
468 Self::default()
469 }
470
471 fn hash_content(content: &str) -> u64 {
481 let mut hasher = DefaultHasher::new();
482 let normalized: String = content
483 .lines()
484 .map(|l| l.trim())
485 .filter(|l| !l.is_empty())
486 .collect::<Vec<_>>()
487 .join("\n");
488 normalized.hash(&mut hasher);
489 hasher.finish()
490 }
491
492 #[must_use]
502 pub fn check_duplicate(&self, content: &str) -> Option<&CodeLocation> {
503 let hash = Self::hash_content(content);
504 self.seen.get(&hash)
505 }
506
507 pub fn add(
520 &mut self,
521 content: &str,
522 file: &str,
523 function_name: &str,
524 line: usize,
525 ) -> bool {
526 let hash = Self::hash_content(content);
527
528 if self.seen.contains_key(&hash) {
529 self.duplicates_found += 1;
530 false
531 } else {
532 self.seen.insert(
533 hash,
534 CodeLocation::new(file, function_name, line),
535 );
536 self.unique_items += 1;
537 true
538 }
539 }
540
541 #[must_use]
547 pub fn stats(&self) -> (usize, usize) {
548 (self.unique_items, self.duplicates_found)
549 }
550
551 #[must_use]
553 pub fn len(&self) -> usize {
554 self.seen.len()
555 }
556
557 #[must_use]
559 pub fn is_empty(&self) -> bool {
560 self.seen.is_empty()
561 }
562
563 pub fn clear(&mut self) {
565 self.seen.clear();
566 self.duplicates_found = 0;
567 self.unique_items = 0;
568 }
569
570 #[must_use]
574 pub fn dedup_ratio(&self) -> f64 {
575 let total = self.unique_items + self.duplicates_found;
576 if total == 0 {
577 0.0
578 } else {
579 self.duplicates_found as f64 / total as f64
580 }
581 }
582}
583
// Unit tests for the types in this module. They pin the string names of
// `UnitKind`, the `CodeComplexity` thresholds, the `EmbeddingUnit` defaults
// and the counting / normalization behavior of `ContentHashedIndex`.
#[cfg(test)]
mod tests {
    use super::*;

    // Every variant maps to its lowercase name.
    #[test]
    fn test_unit_kind_as_str() {
        assert_eq!(UnitKind::Function.as_str(), "function");
        assert_eq!(UnitKind::Method.as_str(), "method");
        assert_eq!(UnitKind::Class.as_str(), "class");
        assert_eq!(UnitKind::Module.as_str(), "module");
        assert_eq!(UnitKind::Chunk.as_str(), "chunk");
    }

    // `Display` prints the same text as `as_str`.
    #[test]
    fn test_unit_kind_display() {
        assert_eq!(format!("{}", UnitKind::Function), "function");
    }

    // Values within all thresholds have no description; values above all of
    // them mention every label.
    #[test]
    fn test_code_complexity_describe() {
        let simple = CodeComplexity {
            depth: 2,
            branches: 3,
            loops: 1,
        };
        assert!(simple.describe().is_none());

        let complex = CodeComplexity {
            depth: 5,
            branches: 10,
            loops: 4,
        };
        let desc = complex.describe().unwrap();
        assert!(desc.contains("deep nesting"));
        assert!(desc.contains("many branches"));
        assert!(desc.contains("multiple loops"));
    }

    // `new` derives the id from file and name and starts out unchunked.
    #[test]
    fn test_embedding_unit_new() {
        let unit = EmbeddingUnit::new(
            "src/main.py",
            "process_data",
            UnitKind::Function,
            "def process_data(): pass",
            10,
            "python",
        );

        assert_eq!(unit.id, "src/main.py::process_data");
        assert_eq!(unit.file, "src/main.py");
        assert_eq!(unit.name, "process_data");
        assert_eq!(unit.kind, UnitKind::Function);
        assert_eq!(unit.start_line, 10);
        assert_eq!(unit.language, "python");
        assert!(!unit.is_chunk());
    }

    // A method with a parent is qualified as `file::Parent.name`.
    #[test]
    fn test_embedding_unit_qualified_name() {
        let mut unit = EmbeddingUnit::new(
            "src/model.py",
            "save",
            UnitKind::Method,
            "def save(self): pass",
            20,
            "python",
        );
        unit.parent = Some("User".to_string());

        assert_eq!(unit.qualified_name(), "src/model.py::User.save");
    }

    // A unit whose `chunk_total` is above 1 counts as a chunk.
    #[test]
    fn test_embedding_unit_is_chunk() {
        let mut unit = EmbeddingUnit::new(
            "src/large.py",
            "big_function[1/3]",
            UnitKind::Chunk,
            "# chunk 1",
            1,
            "python",
        );
        unit.chunk_index = 0;
        unit.chunk_total = 3;

        assert!(unit.is_chunk());
    }

    // `new` leaves highlights empty; `with_highlights` stores them.
    #[test]
    fn test_search_result() {
        let unit = EmbeddingUnit::new(
            "test.py",
            "test_fn",
            UnitKind::Function,
            "def test_fn(): pass",
            1,
            "python",
        );
        let result = SearchResult::new(unit.clone(), 0.95);

        assert_eq!(result.score, 0.95);
        assert!(result.highlights.is_empty());

        let result_with_highlights =
            SearchResult::with_highlights(unit, 0.95, vec!["highlighted text".to_string()]);
        assert_eq!(result_with_highlights.highlights.len(), 1);
    }

    // The built-in table is non-empty and contains the core categories.
    #[test]
    fn test_semantic_patterns_defined() {
        assert!(!SEMANTIC_PATTERNS.is_empty());

        let pattern_names: Vec<_> = SEMANTIC_PATTERNS.iter().map(|p| p.name).collect();
        assert!(pattern_names.contains(&"crud"));
        assert!(pattern_names.contains(&"validation"));
        assert!(pattern_names.contains(&"error_handling"));
        assert!(pattern_names.contains(&"async_ops"));
    }

    // Ordering between the token budgets: overlap < preview < embedding.
    #[test]
    fn test_constants() {
        assert!(MAX_EMBEDDING_TOKENS > 0);
        assert!(MAX_CODE_PREVIEW_TOKENS < MAX_EMBEDDING_TOKENS);
        assert!(CHUNK_OVERLAP_TOKENS < MAX_CODE_PREVIEW_TOKENS);
    }

    // `CodeLocation::new` stores its arguments unchanged.
    #[test]
    fn test_code_location_new() {
        let loc = CodeLocation::new("src/main.py", "process", 42);
        assert_eq!(loc.file, "src/main.py");
        assert_eq!(loc.name, "process");
        assert_eq!(loc.line, 42);
    }

    // A fresh index is empty and has both counters at zero.
    #[test]
    fn test_content_hashed_index_new() {
        let index = ContentHashedIndex::new();
        assert!(index.is_empty());
        assert_eq!(index.len(), 0);
        assert_eq!(index.unique_items, 0);
        assert_eq!(index.duplicates_found, 0);
    }

    // Distinct contents are each accepted and counted as unique.
    #[test]
    fn test_content_hashed_index_add_unique() {
        let mut index = ContentHashedIndex::new();

        assert!(index.add("def foo(): pass", "src/a.py", "foo", 10));
        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 0);
        assert_eq!(index.len(), 1);

        assert!(index.add("def bar(): return 1", "src/b.py", "bar", 20));
        assert_eq!(index.unique_items, 2);
        assert_eq!(index.duplicates_found, 0);
        assert_eq!(index.len(), 2);
    }

    // Identical content from another file is rejected and counted as a duplicate.
    #[test]
    fn test_content_hashed_index_detect_duplicate() {
        let mut index = ContentHashedIndex::new();

        assert!(index.add("def foo(): pass", "src/a.py", "foo", 10));

        assert!(!index.add("def foo(): pass", "src/b.py", "foo", 20));
        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 1);
    }

    // Leading/trailing whitespace and blank lines do not affect identity.
    #[test]
    fn test_content_hashed_index_whitespace_normalization() {
        let mut index = ContentHashedIndex::new();

        let code1 = "def foo():\n return 1";
        assert!(index.add(code1, "src/a.py", "foo", 10));

        // Same lines with extra leading and trailing spaces.
        let code2 = " def foo():\n return 1 ";
        assert!(!index.add(code2, "src/b.py", "foo", 20));

        // Same lines with blank lines in between and at the end.
        let code3 = "def foo():\n\n return 1\n\n";
        assert!(!index.add(code3, "src/c.py", "foo", 30));

        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 2);
    }

    // `check_duplicate` reports the location of the first insertion.
    #[test]
    fn test_content_hashed_index_check_duplicate() {
        let mut index = ContentHashedIndex::new();

        assert!(index.check_duplicate("def foo(): pass").is_none());

        index.add("def foo(): pass", "src/a.py", "foo", 10);

        let loc = index.check_duplicate("def foo(): pass").unwrap();
        assert_eq!(loc.file, "src/a.py");
        assert_eq!(loc.name, "foo");
        assert_eq!(loc.line, 10);
    }

    // Five adds with three distinct contents give 3 unique and 2 duplicates.
    #[test]
    fn test_content_hashed_index_stats() {
        let mut index = ContentHashedIndex::new();

        index.add("code1", "f1.py", "fn1", 1);
        index.add("code2", "f2.py", "fn2", 2);
        index.add("code1", "f3.py", "fn1", 3); // repeats "code1"
        index.add("code3", "f4.py", "fn3", 4);
        index.add("code2", "f5.py", "fn2", 5); // repeats "code2"
        let (unique, dups) = index.stats();
        assert_eq!(unique, 3);
        assert_eq!(dups, 2);
    }

    // The ratio is 0.0 for an empty index and 2 / 5 after two repeats in five adds.
    #[test]
    fn test_content_hashed_index_dedup_ratio() {
        let mut index = ContentHashedIndex::new();

        assert_eq!(index.dedup_ratio(), 0.0);

        index.add("code1", "f1.py", "fn1", 1);
        index.add("code2", "f2.py", "fn2", 2);
        index.add("code1", "f3.py", "fn1", 3);
        index.add("code3", "f4.py", "fn3", 4);
        index.add("code2", "f5.py", "fn2", 5);

        assert!((index.dedup_ratio() - 0.4).abs() < 0.001);
    }

    // `clear` resets the entries and counters, so old content is new again.
    #[test]
    fn test_content_hashed_index_clear() {
        let mut index = ContentHashedIndex::new();

        index.add("code1", "f1.py", "fn1", 1);
        index.add("code1", "f2.py", "fn1", 2);

        assert!(!index.is_empty());
        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 1);

        index.clear();

        assert!(index.is_empty());
        assert_eq!(index.unique_items, 0);
        assert_eq!(index.duplicates_found, 0);

        assert!(index.add("code1", "f1.py", "fn1", 1));
    }
}