go_brrr/semantic/
types.rs

1//! Semantic search type definitions.
2//!
3//! Core data structures for semantic code search and embedding.
4//! Mirrors the Python implementation in brrr/semantic.py.
5
6use serde::{Deserialize, Serialize};
7use std::collections::hash_map::DefaultHasher;
8use std::collections::HashMap;
9use std::hash::{Hash, Hasher};
10
11// =============================================================================
12// Token Budget Constants
13// =============================================================================
14
/// Upper bound on the number of tokens in a single embedding text.
///
/// Deliberately conservative: Qwen3 accepts 32K tokens and TEI is configured
/// for 16K, but 8K gives the best retrieval results.
pub const MAX_EMBEDDING_TOKENS: usize = 8 * 1024;

/// Token budget for the code preview; the rest of the embedding budget is left
/// for the metadata that accompanies the code in the embedding text.
pub const MAX_CODE_PREVIEW_TOKENS: usize = 6_000;

/// Number of tokens shared by neighbouring chunks so context carries across.
pub const CHUNK_OVERLAP_TOKENS: usize = 200;
24
25// =============================================================================
26// Semantic Pattern Definitions
27// =============================================================================
28
/// A semantic category used for automatic code tagging.
///
/// Pairs a category label with the regex source that matches code containing
/// that semantic concept.
#[derive(Clone, Debug)]
pub struct SemanticPattern {
    /// Category label, e.g. `"crud"` or `"validation"`.
    pub name: &'static str,
    /// Source text of the regex that detects the category.
    pub pattern: &'static str,
}
38
39/// All semantic patterns for code tagging.
40/// These patterns detect common code categories for semantic enrichment.
41pub static SEMANTIC_PATTERNS: &[SemanticPattern] = &[
42    // Data operations
43    SemanticPattern {
44        name: "crud",
45        pattern: r"\b(create|read|update|delete|insert|select|save|load|fetch|store|persist|get|set|add|remove)\b",
46    },
47    SemanticPattern {
48        name: "validation",
49        pattern: r"\b(valid|validate|check|verify|assert|ensure|sanitize|normalize|parse|format)\b",
50    },
51    SemanticPattern {
52        name: "transform",
53        pattern: r"\b(convert|transform|map|reduce|filter|sort|merge|split|join|serialize|deserialize)\b",
54    },
55    // Control flow patterns
56    SemanticPattern {
57        name: "error_handling",
58        pattern: r"\b(try|catch|except|raise|throw|error|exception|fail|panic)\b",
59    },
60    SemanticPattern {
61        name: "async_ops",
62        pattern: r"\b(async|await|promise|future|callback|then|concurrent|parallel|thread)\b",
63    },
64    SemanticPattern {
65        name: "iteration",
66        pattern: r"\b(for|while|loop|iterate|each|map|reduce|filter)\b",
67    },
68    // Architecture patterns
69    SemanticPattern {
70        name: "api_endpoint",
71        pattern: r"\b(route|endpoint|handler|controller|get|post|put|delete|patch|request|response)\b",
72    },
73    SemanticPattern {
74        name: "database",
75        pattern: r"\b(query|sql|select|insert|update|delete|table|schema|migration|model|entity)\b",
76    },
77    SemanticPattern {
78        name: "auth",
79        pattern: r"\b(auth|login|logout|session|token|jwt|oauth|permission|role|access)\b",
80    },
81    SemanticPattern {
82        name: "cache",
83        pattern: r"\b(cache|memoize|memo|store|redis|memcache|ttl|expire|invalidate)\b",
84    },
85    // Code quality
86    SemanticPattern {
87        name: "test",
88        pattern: r"\b(test|spec|mock|stub|assert|expect|should|describe|it)\b",
89    },
90    SemanticPattern {
91        name: "logging",
92        pattern: r"\b(log|logger|debug|info|warn|error|trace|print|console)\b",
93    },
94    SemanticPattern {
95        name: "config",
96        pattern: r"\b(config|setting|option|env|environment|parameter|argument)\b",
97    },
98];
99
100// =============================================================================
101// Core Types
102// =============================================================================
103
104/// Kind of code unit for embedding.
105#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
106#[serde(rename_all = "lowercase")]
107pub enum UnitKind {
108    /// Top-level function
109    Function,
110    /// Class method
111    Method,
112    /// Class or struct definition
113    Class,
114    /// Module-level code or file summary
115    Module,
116    /// Chunk of an oversized unit
117    Chunk,
118}
119
120impl UnitKind {
121    /// Convert to string representation.
122    #[must_use]
123    pub fn as_str(&self) -> &'static str {
124        match self {
125            Self::Function => "function",
126            Self::Method => "method",
127            Self::Class => "class",
128            Self::Module => "module",
129            Self::Chunk => "chunk",
130        }
131    }
132}
133
134impl std::fmt::Display for UnitKind {
135    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
136        write!(f, "{}", self.as_str())
137    }
138}
139
140/// Code complexity metrics (heuristic analysis without full AST parsing).
141#[derive(Debug, Clone, Default, Serialize, Deserialize)]
142pub struct CodeComplexity {
143    /// Maximum nesting depth
144    pub depth: usize,
145    /// Number of branch statements (if, elif, else, case, switch, match)
146    pub branches: usize,
147    /// Number of loop statements (for, while, loop)
148    pub loops: usize,
149}
150
151impl CodeComplexity {
152    /// Create empty complexity metrics.
153    #[must_use]
154    pub fn empty() -> Self {
155        Self::default()
156    }
157
158    /// Check if the code has notable complexity.
159    #[must_use]
160    pub fn is_complex(&self) -> bool {
161        self.depth > 3 || self.branches > 5 || self.loops > 2
162    }
163
164    /// Generate a natural language description of complexity.
165    #[must_use]
166    pub fn describe(&self) -> Option<String> {
167        let mut parts = Vec::new();
168        if self.depth > 3 {
169            parts.push("deep nesting");
170        }
171        if self.branches > 5 {
172            parts.push("many branches");
173        }
174        if self.loops > 2 {
175            parts.push("multiple loops");
176        }
177        if parts.is_empty() {
178            None
179        } else {
180            Some(parts.join(", "))
181        }
182    }
183}
184
185/// A code unit for semantic embedding.
186///
187/// Contains information from all 5 brrr analysis layers plus semantic enrichment.
188/// This is the primary unit of indexing and retrieval in semantic search.
189#[derive(Debug, Clone, Serialize, Deserialize)]
190pub struct EmbeddingUnit {
191    /// Unique identifier (typically file::qualified_name or file::name#chunkN)
192    pub id: String,
193
194    /// Source file path (relative to project root)
195    pub file: String,
196
197    /// Simple name (function/class/method name)
198    pub name: String,
199
200    /// Kind of code unit
201    pub kind: UnitKind,
202
203    /// Full code content (may be truncated for large units)
204    pub code: String,
205
206    /// Function/method signature or class declaration
207    pub signature: String,
208
209    /// Docstring or documentation comment
210    pub docstring: Option<String>,
211
212    /// Starting line number (1-indexed)
213    pub start_line: usize,
214
215    /// Ending line number (1-indexed)
216    pub end_line: usize,
217
218    /// Token count for this unit's code
219    pub token_count: usize,
220
221    /// Semantic tags detected from code patterns
222    pub semantic_tags: Vec<String>,
223
224    /// Parent unit name (for chunks and methods)
225    pub parent: Option<String>,
226
227    // ==========================================================================
228    // Extended metadata from brrr layers
229    // ==========================================================================
230    /// L1: Programming language
231    pub language: String,
232
233    /// L2: Functions this unit calls
234    #[serde(default, skip_serializing_if = "Vec::is_empty")]
235    pub calls: Vec<String>,
236
237    /// L2: Functions that call this unit
238    #[serde(default, skip_serializing_if = "Vec::is_empty")]
239    pub called_by: Vec<String>,
240
241    /// L3: CFG summary (complexity, block count)
242    #[serde(default, skip_serializing_if = "String::is_empty")]
243    pub cfg_summary: String,
244
245    /// L4: DFG summary (variable count, def-use chains)
246    #[serde(default, skip_serializing_if = "String::is_empty")]
247    pub dfg_summary: String,
248
249    /// L5: Dependencies (imported modules)
250    #[serde(default, skip_serializing_if = "String::is_empty")]
251    pub dependencies: String,
252
253    /// Code complexity metrics
254    #[serde(default)]
255    pub complexity: CodeComplexity,
256
257    /// Chunk index (0-indexed, for chunked units)
258    #[serde(default)]
259    pub chunk_index: usize,
260
261    /// Total number of chunks (1 for non-chunked units)
262    #[serde(default = "default_chunk_total")]
263    pub chunk_total: usize,
264}
265
/// Serde default for `EmbeddingUnit::chunk_total`: one chunk, i.e. not chunked.
fn default_chunk_total() -> usize {
    1_usize
}
269
270impl EmbeddingUnit {
271    /// Create a new embedding unit with minimal required fields.
272    #[must_use]
273    pub fn new(
274        file: impl Into<String>,
275        name: impl Into<String>,
276        kind: UnitKind,
277        code: impl Into<String>,
278        start_line: usize,
279        language: impl Into<String>,
280    ) -> Self {
281        let name = name.into();
282        let file = file.into();
283        let code = code.into();
284
285        Self {
286            id: format!("{}::{}", file, name),
287            file,
288            name,
289            kind,
290            code,
291            signature: String::new(),
292            docstring: None,
293            start_line,
294            end_line: start_line,
295            token_count: 0,
296            semantic_tags: Vec::new(),
297            parent: None,
298            language: language.into(),
299            calls: Vec::new(),
300            called_by: Vec::new(),
301            cfg_summary: String::new(),
302            dfg_summary: String::new(),
303            dependencies: String::new(),
304            complexity: CodeComplexity::default(),
305            chunk_index: 0,
306            chunk_total: 1,
307        }
308    }
309
310    /// Check if this unit is a chunk of a larger unit.
311    #[must_use]
312    pub fn is_chunk(&self) -> bool {
313        self.chunk_total > 1
314    }
315
316    /// Check if this unit needs to be split into chunks based on token count.
317    #[must_use]
318    pub fn needs_chunking(&self) -> bool {
319        self.token_count > MAX_EMBEDDING_TOKENS
320    }
321
322    /// Get the qualified name (file::name or file::parent.name for methods).
323    #[must_use]
324    pub fn qualified_name(&self) -> String {
325        match &self.parent {
326            Some(parent) if self.kind == UnitKind::Method => {
327                format!("{}::{}.{}", self.file, parent, self.name)
328            }
329            _ => format!("{}::{}", self.file, self.name),
330        }
331    }
332
333    /// Convert to a HashMap for JSON serialization.
334    #[must_use]
335    pub fn to_map(&self) -> HashMap<String, serde_json::Value> {
336        serde_json::to_value(self)
337            .ok()
338            .and_then(|v| v.as_object().cloned())
339            .map(|m| m.into_iter().collect())
340            .unwrap_or_default()
341    }
342}
343
344/// Result of a semantic search query.
345#[derive(Debug, Clone, Serialize, Deserialize)]
346pub struct SearchResult {
347    /// The matching code unit
348    pub unit: EmbeddingUnit,
349
350    /// Similarity score (0.0 to 1.0, higher is better)
351    pub score: f32,
352
353    /// Highlighted portions of code that matched (optional)
354    #[serde(default, skip_serializing_if = "Vec::is_empty")]
355    pub highlights: Vec<String>,
356}
357
358impl SearchResult {
359    /// Create a new search result.
360    #[must_use]
361    pub fn new(unit: EmbeddingUnit, score: f32) -> Self {
362        Self {
363            unit,
364            score,
365            highlights: Vec::new(),
366        }
367    }
368
369    /// Create a search result with highlights.
370    #[must_use]
371    pub fn with_highlights(unit: EmbeddingUnit, score: f32, highlights: Vec<String>) -> Self {
372        Self {
373            unit,
374            score,
375            highlights,
376        }
377    }
378}
379
/// Describes one chunk that was split off an oversized unit.
#[derive(Debug, Clone)]
pub struct ChunkInfo {
    /// Text content of the chunk.
    pub text: String,
    /// Character offset in the original code where the chunk starts.
    pub start_char: usize,
    /// Character offset in the original code where the chunk ends.
    pub end_char: usize,
}

impl ChunkInfo {
    /// Builds a chunk description from its text and its start/end offsets.
    ///
    /// The values are stored as given; no validation is performed.
    #[must_use]
    pub fn new(text: String, start_char: usize, end_char: usize) -> Self {
        ChunkInfo {
            end_char,
            start_char,
            text,
        }
    }
}
402
403// =============================================================================
404// Content-Hashed Index for Deduplication
405// =============================================================================
406
/// Where a code unit lives: its file, its name and its line.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CodeLocation {
    /// Path of the source file.
    pub file: String,
    /// Name of the function or code unit.
    pub name: String,
    /// Line number (1-indexed).
    pub line: usize,
}

impl CodeLocation {
    /// Builds a location, converting `file` and `name` into owned strings.
    #[must_use]
    pub fn new(file: impl Into<String>, name: impl Into<String>, line: usize) -> Self {
        let file: String = file.into();
        let name: String = name.into();
        CodeLocation { file, name, line }
    }
}
429
430/// Content-hashed index for deduplication of code units.
431///
432/// Used to avoid indexing identical code multiple times. The index normalizes
433/// whitespace before hashing, so code with different formatting but identical
434/// content will be detected as duplicates.
435///
436/// # Examples
437///
438/// ```
439/// use go_brrr::semantic::ContentHashedIndex;
440///
441/// let mut index = ContentHashedIndex::new();
442///
443/// // First occurrence is added
444/// assert!(index.add("def foo(): pass", "src/a.py", "foo", 10));
445///
446/// // Identical content is detected as duplicate
447/// assert!(!index.add("def foo(): pass", "src/b.py", "foo", 20));
448///
449/// // Check stats
450/// let (unique, duplicates) = index.stats();
451/// assert_eq!(unique, 1);
452/// assert_eq!(duplicates, 1);
453/// ```
454#[derive(Debug, Clone, Default)]
455pub struct ContentHashedIndex {
456    /// Hash -> original location (file, function_name, line)
457    seen: HashMap<u64, CodeLocation>,
458    /// Number of duplicate items detected
459    pub duplicates_found: usize,
460    /// Number of unique items indexed
461    pub unique_items: usize,
462}
463
464impl ContentHashedIndex {
465    /// Create a new empty content-hashed index.
466    #[must_use]
467    pub fn new() -> Self {
468        Self::default()
469    }
470
471    /// Compute content hash for code with whitespace normalization.
472    ///
473    /// Normalizes code by:
474    /// - Trimming each line
475    /// - Removing empty lines
476    /// - Joining with single newlines
477    ///
478    /// This ensures code with different indentation or blank lines
479    /// but identical content produces the same hash.
480    fn hash_content(content: &str) -> u64 {
481        let mut hasher = DefaultHasher::new();
482        let normalized: String = content
483            .lines()
484            .map(|l| l.trim())
485            .filter(|l| !l.is_empty())
486            .collect::<Vec<_>>()
487            .join("\n");
488        normalized.hash(&mut hasher);
489        hasher.finish()
490    }
491
492    /// Check if content is a duplicate, returning the original location if so.
493    ///
494    /// # Arguments
495    ///
496    /// * `content` - Code content to check
497    ///
498    /// # Returns
499    ///
500    /// `Some(&CodeLocation)` if this content was already seen, `None` otherwise.
501    #[must_use]
502    pub fn check_duplicate(&self, content: &str) -> Option<&CodeLocation> {
503        let hash = Self::hash_content(content);
504        self.seen.get(&hash)
505    }
506
507    /// Add content to the index.
508    ///
509    /// # Arguments
510    ///
511    /// * `content` - Code content to add
512    /// * `file` - Source file path
513    /// * `function_name` - Name of the function or code unit
514    /// * `line` - Line number (1-indexed)
515    ///
516    /// # Returns
517    ///
518    /// `true` if this is new content (was added), `false` if duplicate (was not added).
519    pub fn add(
520        &mut self,
521        content: &str,
522        file: &str,
523        function_name: &str,
524        line: usize,
525    ) -> bool {
526        let hash = Self::hash_content(content);
527
528        if self.seen.contains_key(&hash) {
529            self.duplicates_found += 1;
530            false
531        } else {
532            self.seen.insert(
533                hash,
534                CodeLocation::new(file, function_name, line),
535            );
536            self.unique_items += 1;
537            true
538        }
539    }
540
541    /// Get deduplication statistics.
542    ///
543    /// # Returns
544    ///
545    /// Tuple of (unique_items, duplicates_found).
546    #[must_use]
547    pub fn stats(&self) -> (usize, usize) {
548        (self.unique_items, self.duplicates_found)
549    }
550
551    /// Get the number of unique items in the index.
552    #[must_use]
553    pub fn len(&self) -> usize {
554        self.seen.len()
555    }
556
557    /// Check if the index is empty.
558    #[must_use]
559    pub fn is_empty(&self) -> bool {
560        self.seen.is_empty()
561    }
562
563    /// Clear the index and reset statistics.
564    pub fn clear(&mut self) {
565        self.seen.clear();
566        self.duplicates_found = 0;
567        self.unique_items = 0;
568    }
569
570    /// Get the deduplication ratio (duplicates / total).
571    ///
572    /// Returns 0.0 if no items have been processed.
573    #[must_use]
574    pub fn dedup_ratio(&self) -> f64 {
575        let total = self.unique_items + self.duplicates_found;
576        if total == 0 {
577            0.0
578        } else {
579            self.duplicates_found as f64 / total as f64
580        }
581    }
582}
583
584// =============================================================================
585// Tests
586// =============================================================================
587
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for a Python `EmbeddingUnit`, so each test spells out only what varies.
    fn python_unit(
        file: &str,
        name: &str,
        kind: UnitKind,
        code: &str,
        start_line: usize,
    ) -> EmbeddingUnit {
        EmbeddingUnit::new(file, name, kind, code, start_line, "python")
    }

    /// Index fed five entries: three distinct bodies plus repeats of `code1` and `code2`.
    fn index_with_two_repeats() -> ContentHashedIndex {
        let entries = [
            ("code1", "f1.py", "fn1", 1),
            ("code2", "f2.py", "fn2", 2),
            ("code1", "f3.py", "fn1", 3), // duplicate
            ("code3", "f4.py", "fn3", 4),
            ("code2", "f5.py", "fn2", 5), // duplicate
        ];

        let mut index = ContentHashedIndex::new();
        for &(content, file, name, line) in entries.iter() {
            index.add(content, file, name, line);
        }
        index
    }

    #[test]
    fn test_unit_kind_as_str() {
        let expected = [
            (UnitKind::Function, "function"),
            (UnitKind::Method, "method"),
            (UnitKind::Class, "class"),
            (UnitKind::Module, "module"),
            (UnitKind::Chunk, "chunk"),
        ];
        for &(kind, text) in expected.iter() {
            assert_eq!(kind.as_str(), text);
        }
    }

    #[test]
    fn test_unit_kind_display() {
        assert_eq!(format!("{}", UnitKind::Function), "function");
    }

    #[test]
    fn test_code_complexity_describe() {
        // Every metric below its threshold: nothing to describe.
        let simple = CodeComplexity {
            depth: 2,
            branches: 3,
            loops: 1,
        };
        assert!(simple.describe().is_none());

        // Every metric above its threshold: all three labels appear.
        let complex = CodeComplexity {
            depth: 5,
            branches: 10,
            loops: 4,
        };
        let desc = complex.describe().unwrap();
        for label in ["deep nesting", "many branches", "multiple loops"].iter() {
            assert!(desc.contains(label));
        }
    }

    #[test]
    fn test_embedding_unit_new() {
        let unit = python_unit(
            "src/main.py",
            "process_data",
            UnitKind::Function,
            "def process_data(): pass",
            10,
        );

        assert_eq!(unit.id, "src/main.py::process_data");
        assert_eq!(unit.file, "src/main.py");
        assert_eq!(unit.name, "process_data");
        assert_eq!(unit.kind, UnitKind::Function);
        assert_eq!(unit.start_line, 10);
        assert_eq!(unit.language, "python");
        assert!(!unit.is_chunk());
    }

    #[test]
    fn test_embedding_unit_qualified_name() {
        let mut unit = python_unit(
            "src/model.py",
            "save",
            UnitKind::Method,
            "def save(self): pass",
            20,
        );
        unit.parent = Some(String::from("User"));

        assert_eq!(unit.qualified_name(), "src/model.py::User.save");
    }

    #[test]
    fn test_embedding_unit_is_chunk() {
        let mut unit = python_unit(
            "src/large.py",
            "big_function[1/3]",
            UnitKind::Chunk,
            "# chunk 1",
            1,
        );
        unit.chunk_index = 0;
        unit.chunk_total = 3;

        assert!(unit.is_chunk());
    }

    #[test]
    fn test_search_result() {
        let unit = python_unit(
            "test.py",
            "test_fn",
            UnitKind::Function,
            "def test_fn(): pass",
            1,
        );

        let plain = SearchResult::new(unit.clone(), 0.95);
        assert_eq!(plain.score, 0.95);
        assert!(plain.highlights.is_empty());

        let highlighted =
            SearchResult::with_highlights(unit, 0.95, vec![String::from("highlighted text")]);
        assert_eq!(highlighted.highlights.len(), 1);
    }

    #[test]
    fn test_semantic_patterns_defined() {
        assert!(!SEMANTIC_PATTERNS.is_empty());

        // A few key categories must be present.
        for required in ["crud", "validation", "error_handling", "async_ops"].iter() {
            assert!(SEMANTIC_PATTERNS.iter().any(|p| p.name == *required));
        }
    }

    #[test]
    fn test_constants() {
        assert!(MAX_EMBEDDING_TOKENS > 0);
        assert!(MAX_CODE_PREVIEW_TOKENS < MAX_EMBEDDING_TOKENS);
        assert!(CHUNK_OVERLAP_TOKENS < MAX_CODE_PREVIEW_TOKENS);
    }

    #[test]
    fn test_code_location_new() {
        let CodeLocation { file, name, line } = CodeLocation::new("src/main.py", "process", 42);
        assert_eq!(file, "src/main.py");
        assert_eq!(name, "process");
        assert_eq!(line, 42);
    }

    #[test]
    fn test_content_hashed_index_new() {
        let index = ContentHashedIndex::new();
        assert!(index.is_empty());
        assert_eq!(index.len(), 0);
        assert_eq!(index.unique_items, 0);
        assert_eq!(index.duplicates_found, 0);
    }

    #[test]
    fn test_content_hashed_index_add_unique() {
        let mut index = ContentHashedIndex::new();

        // The first item is new.
        assert!(index.add("def foo(): pass", "src/a.py", "foo", 10));
        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 0);
        assert_eq!(index.len(), 1);

        // Different content is new as well.
        assert!(index.add("def bar(): return 1", "src/b.py", "bar", 20));
        assert_eq!(index.unique_items, 2);
        assert_eq!(index.duplicates_found, 0);
        assert_eq!(index.len(), 2);
    }

    #[test]
    fn test_content_hashed_index_detect_duplicate() {
        let mut index = ContentHashedIndex::new();
        let body = "def foo(): pass";

        // First occurrence is recorded.
        assert!(index.add(body, "src/a.py", "foo", 10));

        // The same content in another file counts as a duplicate.
        assert!(!index.add(body, "src/b.py", "foo", 20));
        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 1);
    }

    #[test]
    fn test_content_hashed_index_whitespace_normalization() {
        let mut index = ContentHashedIndex::new();

        // Baseline formatting is new content.
        assert!(index.add("def foo():\n    return 1", "src/a.py", "foo", 10));

        // A re-indented copy with trailing spaces is a duplicate.
        assert!(!index.add("  def foo():\n        return 1  ", "src/b.py", "foo", 20));

        // A copy with extra blank lines is a duplicate too.
        assert!(!index.add("def foo():\n\n    return 1\n\n", "src/c.py", "foo", 30));

        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 2);
    }

    #[test]
    fn test_content_hashed_index_check_duplicate() {
        let mut index = ContentHashedIndex::new();
        let body = "def foo(): pass";

        // Nothing has been added yet, so nothing is a duplicate.
        assert!(index.check_duplicate(body).is_none());

        index.add(body, "src/a.py", "foo", 10);

        // Now the lookup reports where the content was first seen.
        let loc = index.check_duplicate(body).unwrap();
        assert_eq!(loc.file, "src/a.py");
        assert_eq!(loc.name, "foo");
        assert_eq!(loc.line, 10);
    }

    #[test]
    fn test_content_hashed_index_stats() {
        let (unique, dups) = index_with_two_repeats().stats();
        assert_eq!(unique, 3);
        assert_eq!(dups, 2);
    }

    #[test]
    fn test_content_hashed_index_dedup_ratio() {
        // An empty index has a ratio of 0.
        assert_eq!(ContentHashedIndex::new().dedup_ratio(), 0.0);

        // 3 unique, 2 duplicates = 2/5 = 0.4
        let index = index_with_two_repeats();
        assert!((index.dedup_ratio() - 0.4).abs() < 0.001);
    }

    #[test]
    fn test_content_hashed_index_clear() {
        let mut index = ContentHashedIndex::new();
        index.add("code1", "f1.py", "fn1", 1);
        index.add("code1", "f2.py", "fn1", 2);

        assert!(!index.is_empty());
        assert_eq!(index.unique_items, 1);
        assert_eq!(index.duplicates_found, 1);

        index.clear();

        assert!(index.is_empty());
        assert_eq!(index.unique_items, 0);
        assert_eq!(index.duplicates_found, 0);

        // Once cleared, previously seen content is unique again.
        assert!(index.add("code1", "f1.py", "fn1", 1));
    }
}