codeprism_core/content/index.rs

//! Content indexing for fast search and retrieval
//!
//! This module provides efficient indexing of content chunks with support for
//! full-text search, pattern matching, and content type filtering.
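//!
//! # Example
//!
//! A minimal usage sketch (illustrative only; the crate paths and the
//! `SearchQuery` fields shown here are assumed from this module's structure
//! and tests):
//!
//! ```ignore
//! use codeprism_core::content::index::ContentIndex;
//! use codeprism_core::content::SearchQuery;
//!
//! let index = ContentIndex::new();
//! // Populate the index by calling `index.add_node(node)` for each parsed file.
//! let query = SearchQuery {
//!     query: "document".to_string(),
//!     max_results: 10,
//!     ..Default::default()
//! };
//! let results = index.search(&query)?;
//! ```
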
use super::{
    ChunkId, ContentChunk, ContentNode, ContentStats, ContentType, ContentUpdate,
    ContentUpdateKind, SearchMatch, SearchQuery, SearchResult,
};

use anyhow::Result;
use dashmap::DashMap;
use regex::Regex;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock};
use std::time::SystemTime;

19/// Content index for fast search and retrieval
20pub struct ContentIndex {
21    /// Content nodes indexed by file path
22    nodes: DashMap<PathBuf, ContentNode>,
23    /// Content chunks indexed by chunk ID
24    chunks: DashMap<ChunkId, ContentChunk>,
25    /// Token index for full-text search
26    token_index: DashMap<String, HashSet<ChunkId>>,
27    /// File pattern index for file discovery
28    file_index: DashMap<String, HashSet<PathBuf>>,
29    /// Content type index for filtering
30    type_index: DashMap<String, HashSet<ChunkId>>,
31    /// Statistics cache
32    stats_cache: Arc<RwLock<Option<ContentStats>>>,
33    /// Update listeners
34    update_listeners: Arc<RwLock<Vec<Box<dyn ContentUpdateListener>>>>,
35}
36
37impl ContentIndex {
38    /// Create a new content index
39    pub fn new() -> Self {
40        Self {
41            nodes: DashMap::new(),
42            chunks: DashMap::new(),
43            token_index: DashMap::new(),
44            file_index: DashMap::new(),
45            type_index: DashMap::new(),
46            stats_cache: Arc::new(RwLock::new(None)),
47            update_listeners: Arc::new(RwLock::new(Vec::new())),
48        }
49    }
50
51    /// Add a content node to the index
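    ///
    /// Replaces any existing node (and its indexed chunks) for the same file
    /// path, then notifies registered update listeners of a `Modified` event.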
52    pub fn add_node(&self, node: ContentNode) -> Result<()> {
53        let file_path = node.file_path.clone();
54
55        // Remove existing node and its chunks
56        if let Some(old_node) = self.nodes.get(&file_path) {
57            for chunk in &old_node.chunks {
58                self.remove_chunk_from_indexes(&chunk.id);
59            }
60        }
61
62        // Index all chunks in the node
63        for chunk in &node.chunks {
64            self.add_chunk_to_indexes(chunk.clone())?;
65        }
66
67        // Index the file pattern
68        self.index_file_pattern(&file_path);
69
70        // Store the node
71        self.nodes.insert(file_path.clone(), node);
72
73        // Invalidate stats cache
74        *self.stats_cache.write().unwrap() = None;
75
76        // Notify listeners
77        self.notify_update(ContentUpdate {
78            file_path,
79            update_kind: ContentUpdateKind::Modified,
80            timestamp: SystemTime::now(),
81        });
82
83        Ok(())
84    }
85
86    /// Remove a content node from the index
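    ///
    /// Drops the node's chunks from all secondary indexes, removes its
    /// file-pattern entries, and notifies listeners of a `Deleted` event; a
    /// path that is not indexed is silently ignored.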
87    pub fn remove_node(&self, file_path: &Path) -> Result<()> {
88        if let Some((_, node)) = self.nodes.remove(file_path) {
89            // Remove all chunks from indexes
90            for chunk in &node.chunks {
91                self.remove_chunk_from_indexes(&chunk.id);
92            }
93
94            // Remove file pattern
95            self.remove_file_pattern(file_path);
96
97            // Invalidate stats cache
98            *self.stats_cache.write().unwrap() = None;
99
100            // Notify listeners
101            self.notify_update(ContentUpdate {
102                file_path: file_path.to_path_buf(),
103                update_kind: ContentUpdateKind::Deleted,
104                timestamp: SystemTime::now(),
105            });
106        }
107
108        Ok(())
109    }
110
111    /// Get a content node by file path
112    pub fn get_node(&self, file_path: &Path) -> Option<ContentNode> {
113        self.nodes.get(file_path).map(|entry| entry.value().clone())
114    }
115
116    /// Get a content chunk by ID
117    pub fn get_chunk(&self, chunk_id: &ChunkId) -> Option<ContentChunk> {
118        self.chunks.get(chunk_id).map(|entry| entry.value().clone())
119    }
120
121    /// Search for content
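    ///
    /// Results are sorted by relevance score (highest first) and the result
    /// list is capped at `query.max_results`. A minimal sketch (illustrative
    /// only; the `SearchQuery` fields shown are the ones exercised by this
    /// module's tests):
    ///
    /// ```ignore
    /// // Given a populated `ContentIndex` named `index`:
    /// let query = SearchQuery {
    ///     query: "content".to_string(),
    ///     use_regex: false,
    ///     max_results: 10,
    ///     ..Default::default()
    /// };
    /// let results = index.search(&query)?;
    /// for result in &results {
    ///     println!("{} (score {:.2})", result.chunk.file_path.display(), result.score);
    /// }
    /// ```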
122    pub fn search(&self, query: &SearchQuery) -> Result<Vec<SearchResult>> {
123        let mut results = Vec::new();
124        let mut seen_chunks = HashSet::new();
125
126        // Prepare search regex if needed
127        let search_regex = if query.use_regex {
128            Some(Regex::new(&query.query)?)
129        } else {
130            None
131        };
132
133        // Get candidate chunks based on search strategy
134        let candidate_chunks = if query.use_regex {
135            self.search_by_regex(search_regex.as_ref().unwrap(), query)?
136        } else {
137            self.search_by_tokens(&query.query, query)?
138        };
139
140        // Process candidates and create results
141        for chunk_id in candidate_chunks {
142            if seen_chunks.contains(&chunk_id) {
143                continue;
144            }
145            seen_chunks.insert(chunk_id);
146
147            if let Some(chunk) = self.get_chunk(&chunk_id) {
148                // Filter by content type
149                if !query.content_types.is_empty()
150                    && !self.matches_content_type(&chunk.content_type, &query.content_types)
151                {
152                    continue;
153                }
154
155                // Filter by file patterns
156                if !self.matches_file_patterns(
157                    &chunk.file_path,
158                    &query.file_patterns,
159                    &query.exclude_patterns,
160                )? {
161                    continue;
162                }
163
164                // Find matches within the chunk
165                let matches = self.find_matches_in_chunk(&chunk, query, &search_regex)?;
166                if !matches.is_empty() {
167                    let score = self.calculate_relevance_score(&chunk, &matches, query);
168                    results.push(SearchResult {
169                        chunk: chunk.clone(),
170                        score,
171                        matches,
172                        related_nodes: chunk.related_nodes.clone(),
173                    });
174                }
175            }
176
177            if results.len() >= query.max_results {
178                break;
179            }
180        }
181
182        // Sort by relevance score
183        results.sort_by(|a, b| {
184            b.score
185                .partial_cmp(&a.score)
186                .unwrap_or(std::cmp::Ordering::Equal)
187        });
188
189        Ok(results)
190    }
191
    /// Find files whose paths match the given regular expression pattern
193    pub fn find_files(&self, pattern: &str) -> Result<Vec<PathBuf>> {
194        let pattern_regex = Regex::new(pattern)?;
195        let mut matching_files = Vec::new();
196
197        for entry in self.nodes.iter() {
198            let file_path = entry.key();
199            if pattern_regex.is_match(&file_path.to_string_lossy()) {
200                matching_files.push(file_path.clone());
201            }
202        }
203
204        Ok(matching_files)
205    }
206
207    /// Get content statistics
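    ///
    /// Statistics are computed lazily and cached; the cache is invalidated
    /// whenever a node is added or removed, or when the index is cleared.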
208    pub fn get_stats(&self) -> ContentStats {
209        // Try to use cached stats
210        if let Ok(cache) = self.stats_cache.read() {
211            if let Some(stats) = cache.as_ref() {
212                return stats.clone();
213            }
214        }
215
216        // Compute fresh stats
217        let stats = self.compute_stats();
218
219        // Cache the stats
220        if let Ok(mut cache) = self.stats_cache.write() {
221            *cache = Some(stats.clone());
222        }
223
224        stats
225    }
226
227    /// Add content update listener
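    ///
    /// Registered listeners are invoked synchronously whenever a node is added
    /// or removed from the index.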
228    pub fn add_update_listener(&self, listener: Box<dyn ContentUpdateListener>) {
229        if let Ok(mut listeners) = self.update_listeners.write() {
230            listeners.push(listener);
231        }
232    }
233
234    /// Clear all content from the index
235    pub fn clear(&self) {
236        self.nodes.clear();
237        self.chunks.clear();
238        self.token_index.clear();
239        self.file_index.clear();
240        self.type_index.clear();
241        *self.stats_cache.write().unwrap() = None;
242    }
243
244    // Private helper methods
245
246    /// Add a chunk to all relevant indexes
247    fn add_chunk_to_indexes(&self, chunk: ContentChunk) -> Result<()> {
248        let chunk_id = chunk.id;
249
250        // Add to token index
251        for token in &chunk.tokens {
252            self.token_index
253                .entry(token.clone())
254                .or_default()
255                .insert(chunk_id);
256        }
257
258        // Add to content type index
259        let type_key = self.content_type_to_string(&chunk.content_type);
260        self.type_index
261            .entry(type_key)
262            .or_default()
263            .insert(chunk_id);
264
265        // Store the chunk
266        self.chunks.insert(chunk_id, chunk);
267
268        Ok(())
269    }
270
271    /// Remove a chunk from all indexes
272    fn remove_chunk_from_indexes(&self, chunk_id: &ChunkId) {
273        // Remove from chunk storage
274        if let Some((_, chunk)) = self.chunks.remove(chunk_id) {
275            // Remove from token index
276            for token in &chunk.tokens {
277                if let Some(mut token_set) = self.token_index.get_mut(token) {
278                    token_set.remove(chunk_id);
279                    if token_set.is_empty() {
280                        drop(token_set);
281                        self.token_index.remove(token);
282                    }
283                }
284            }
285
286            // Remove from content type index
287            let type_key = self.content_type_to_string(&chunk.content_type);
288            if let Some(mut type_set) = self.type_index.get_mut(&type_key) {
289                type_set.remove(chunk_id);
290                if type_set.is_empty() {
291                    drop(type_set);
292                    self.type_index.remove(&type_key);
293                }
294            }
295        }
296    }
297
298    /// Index file pattern for discovery
299    fn index_file_pattern(&self, file_path: &Path) {
300        let file_name = file_path
301            .file_name()
302            .and_then(|name| name.to_str())
303            .unwrap_or("");
304
305        let extension = file_path
306            .extension()
307            .and_then(|ext| ext.to_str())
308            .unwrap_or("");
309
310        // Index by filename
311        self.file_index
312            .entry(file_name.to_lowercase())
313            .or_default()
314            .insert(file_path.to_path_buf());
315
316        // Index by extension
317        if !extension.is_empty() {
318            self.file_index
319                .entry(format!("*.{}", extension.to_lowercase()))
320                .or_default()
321                .insert(file_path.to_path_buf());
322        }
323
324        // Index by full path components
325        for component in file_path.components() {
326            if let Some(component_str) = component.as_os_str().to_str() {
327                self.file_index
328                    .entry(component_str.to_lowercase())
329                    .or_default()
330                    .insert(file_path.to_path_buf());
331            }
332        }
333    }
334
335    /// Remove file pattern from index
336    fn remove_file_pattern(&self, file_path: &Path) {
337        let file_name = file_path
338            .file_name()
339            .and_then(|name| name.to_str())
340            .unwrap_or("");
341
342        let extension = file_path
343            .extension()
344            .and_then(|ext| ext.to_str())
345            .unwrap_or("");
346
347        // Remove from filename index
348        if let Some(mut file_set) = self.file_index.get_mut(&file_name.to_lowercase()) {
349            file_set.remove(file_path);
350            if file_set.is_empty() {
351                drop(file_set);
352                self.file_index.remove(&file_name.to_lowercase());
353            }
354        }
355
356        // Remove from extension index
357        if !extension.is_empty() {
358            let ext_key = format!("*.{}", extension.to_lowercase());
359            if let Some(mut ext_set) = self.file_index.get_mut(&ext_key) {
360                ext_set.remove(file_path);
361                if ext_set.is_empty() {
362                    drop(ext_set);
363                    self.file_index.remove(&ext_key);
364                }
365            }
366        }
367    }
368
369    /// Search by token matching
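    ///
    /// Returns the IDs of chunks containing every whitespace-separated,
    /// lowercased token of `query` (an index intersection); any unknown token
    /// short-circuits to an empty result.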
370    fn search_by_tokens(&self, query: &str, _search_query: &SearchQuery) -> Result<Vec<ChunkId>> {
371        let query_tokens: Vec<String> = query
372            .to_lowercase()
373            .split_whitespace()
374            .map(|s| s.to_string())
375            .collect();
376
377        if query_tokens.is_empty() {
378            return Ok(Vec::new());
379        }
380
381        let mut result_chunks: Option<HashSet<ChunkId>> = None;
382
383        // Find intersection of chunks containing all query tokens
384        for token in &query_tokens {
385            if let Some(chunk_set) = self.token_index.get(token) {
386                let chunk_ids: HashSet<ChunkId> = chunk_set.iter().copied().collect();
387                result_chunks = Some(match result_chunks {
388                    None => chunk_ids,
389                    Some(existing) => existing.intersection(&chunk_ids).copied().collect(),
390                });
391            } else {
392                // If any token is not found, no results
393                return Ok(Vec::new());
394            }
395        }
396
397        Ok(result_chunks.unwrap_or_default().into_iter().collect())
398    }
399
400    /// Search by regex pattern
401    fn search_by_regex(&self, regex: &Regex, search_query: &SearchQuery) -> Result<Vec<ChunkId>> {
402        let mut matching_chunks = Vec::new();
403
404        for entry in self.chunks.iter() {
405            let chunk = entry.value();
            // Apply the regex in each branch so the lowercased copy is not
            // borrowed past the end of its statement.
            let matched = if search_query.case_sensitive {
                regex.is_match(&chunk.content)
            } else {
                regex.is_match(&chunk.content.to_lowercase())
            };

            if matched {
                matching_chunks.push(chunk.id);
            }
415        }
416
417        Ok(matching_chunks)
418    }
419
420    /// Find matches within a chunk
421    fn find_matches_in_chunk(
422        &self,
423        chunk: &ContentChunk,
424        query: &SearchQuery,
425        regex: &Option<Regex>,
426    ) -> Result<Vec<SearchMatch>> {
427        let mut matches = Vec::new();
428        let content = if query.case_sensitive {
429            chunk.content.clone()
430        } else {
431            chunk.content.to_lowercase()
432        };
433
434        let search_term = if query.case_sensitive {
435            query.query.clone()
436        } else {
437            query.query.to_lowercase()
438        };
439
440        if let Some(regex) = regex {
441            // Regex search
442            for regex_match in regex.find_iter(&content) {
443                let line_info = self.calculate_line_info(&content, regex_match.start());
444                let search_match = SearchMatch {
445                    text: regex_match.as_str().to_string(),
446                    position: regex_match.start(),
447                    line_number: line_info.0,
448                    column_number: line_info.1,
449                    context_before: if query.include_context {
450                        self.get_context_before(&content, regex_match.start(), query.context_lines)
451                    } else {
452                        None
453                    },
454                    context_after: if query.include_context {
455                        self.get_context_after(&content, regex_match.end(), query.context_lines)
456                    } else {
457                        None
458                    },
459                };
460                matches.push(search_match);
461            }
462        } else {
463            // Simple text search
464            let mut start = 0;
465            while let Some(pos) = content[start..].find(&search_term) {
466                let absolute_pos = start + pos;
467                let line_info = self.calculate_line_info(&content, absolute_pos);
468                let search_match = SearchMatch {
469                    text: search_term.clone(),
470                    position: absolute_pos,
471                    line_number: line_info.0,
472                    column_number: line_info.1,
473                    context_before: if query.include_context {
474                        self.get_context_before(&content, absolute_pos, query.context_lines)
475                    } else {
476                        None
477                    },
478                    context_after: if query.include_context {
479                        self.get_context_after(
480                            &content,
481                            absolute_pos + search_term.len(),
482                            query.context_lines,
483                        )
484                    } else {
485                        None
486                    },
487                };
488                matches.push(search_match);
489                start = absolute_pos + 1;
490            }
491        }
492
493        Ok(matches)
494    }
495
    /// Calculate the 1-indexed (line, column) pair for a byte position in `content`
497    fn calculate_line_info(&self, content: &str, position: usize) -> (usize, usize) {
498        let before_position = &content[..position.min(content.len())];
499        let line_number = before_position.lines().count();
500        let column_number = before_position
501            .lines()
502            .last()
503            .map(|line| line.len() + 1)
504            .unwrap_or(1);
505        (line_number, column_number)
506    }
507
508    /// Get context lines before a position
509    fn get_context_before(
510        &self,
511        content: &str,
512        position: usize,
513        context_lines: usize,
514    ) -> Option<String> {
515        if context_lines == 0 {
516            return None;
517        }
518
519        let lines: Vec<&str> = content.lines().collect();
520        let (line_number, _) = self.calculate_line_info(content, position);
521
522        if line_number == 0 {
523            return None;
524        }
525
526        let start_line = line_number.saturating_sub(context_lines + 1);
527        let end_line = line_number.saturating_sub(1);
528
529        if start_line >= lines.len() || end_line >= lines.len() || start_line > end_line {
530            return None;
531        }
532
533        Some(lines[start_line..=end_line].join("\n"))
534    }
535
536    /// Get context lines after a position
537    fn get_context_after(
538        &self,
539        content: &str,
540        position: usize,
541        context_lines: usize,
542    ) -> Option<String> {
543        if context_lines == 0 {
544            return None;
545        }
546
547        let lines: Vec<&str> = content.lines().collect();
548        let (line_number, _) = self.calculate_line_info(content, position);
549
550        let start_line = line_number;
551        let end_line = (start_line + context_lines).min(lines.len().saturating_sub(1));
552
553        if start_line >= lines.len() || start_line > end_line {
554            return None;
555        }
556
557        Some(lines[start_line..=end_line].join("\n"))
558    }
559
560    /// Calculate relevance score for a search result
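    ///
    /// Combines a content-type base score (0.2-0.8) with a 0.1 bonus per match,
    /// capped at 1.0.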
561    fn calculate_relevance_score(
562        &self,
563        chunk: &ContentChunk,
564        matches: &[SearchMatch],
565        _query: &SearchQuery,
566    ) -> f32 {
567        if matches.is_empty() {
568            return 0.0;
569        }
570
571        // Base score from content type relevance (0.2-0.8)
572        let type_score = match &chunk.content_type {
573            ContentType::Documentation { .. } => 0.8,
574            ContentType::Comment { context, .. } => match context {
575                super::CommentContext::Documentation => 0.7,
576                super::CommentContext::Function { .. } => 0.6,
577                super::CommentContext::Class { .. } => 0.6,
578                _ => 0.4,
579            },
580            ContentType::Code { .. } => 0.5,
581            ContentType::Configuration { .. } => 0.4,
582            ContentType::PlainText => 0.2,
583        };
584
585        // Match frequency bonus (0.1 per match)
586        let match_bonus = matches.len() as f32 * 0.1;
587
588        // Calculate final score and normalize to 0.0-1.0 range
589        (type_score + match_bonus).min(1.0)
590    }
591
592    /// Check if content type matches query filters
593    fn matches_content_type(
594        &self,
595        content_type: &ContentType,
596        allowed_types: &[ContentType],
597    ) -> bool {
598        allowed_types
599            .iter()
600            .any(|allowed| std::mem::discriminant(content_type) == std::mem::discriminant(allowed))
601    }
602
603    /// Check if file path matches include/exclude patterns
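    ///
    /// Exclude patterns take precedence over include patterns; when no include
    /// patterns are given, every non-excluded path matches.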
604    fn matches_file_patterns(
605        &self,
606        file_path: &Path,
607        include_patterns: &[String],
608        exclude_patterns: &[String],
609    ) -> Result<bool> {
610        let path_str = file_path.to_string_lossy();
611
612        // Check exclude patterns first
613        for pattern in exclude_patterns {
614            let regex_pattern = self.glob_to_regex(pattern);
615            let regex = Regex::new(&regex_pattern)?;
616            if regex.is_match(&path_str) {
617                return Ok(false);
618            }
619        }
620
621        // If no include patterns, include by default
622        if include_patterns.is_empty() {
623            return Ok(true);
624        }
625
626        // Check include patterns
627        for pattern in include_patterns {
628            let regex_pattern = self.glob_to_regex(pattern);
629            let regex = Regex::new(&regex_pattern)?;
630            if regex.is_match(&path_str) {
631                return Ok(true);
632            }
633        }
634
635        Ok(false)
636    }
637
    /// Convert a glob pattern into an anchored regex pattern
    ///
    /// `*` becomes `.*` and `?` becomes `.`, while regex metacharacters are
    /// escaped; e.g. `*.md` becomes `^.*\.md$`. Note that, unlike shell globs,
    /// `*` therefore also matches path separators.
639    fn glob_to_regex(&self, glob: &str) -> String {
640        let mut regex = String::new();
641        regex.push('^');
642
643        for ch in glob.chars() {
644            match ch {
645                '*' => regex.push_str(".*"),
646                '?' => regex.push('.'),
647                '.' => regex.push_str("\\."),
648                '+' => regex.push_str("\\+"),
649                '^' => regex.push_str("\\^"),
650                '$' => regex.push_str("\\$"),
651                '(' => regex.push_str("\\("),
652                ')' => regex.push_str("\\)"),
653                '[' => regex.push_str("\\["),
654                ']' => regex.push_str("\\]"),
655                '{' => regex.push_str("\\{"),
656                '}' => regex.push_str("\\}"),
657                '|' => regex.push_str("\\|"),
658                '\\' => regex.push_str("\\\\"),
659                c => regex.push(c),
660            }
661        }
662
663        regex.push('$');
664        regex
665    }
666
667    /// Convert content type to string for indexing
668    fn content_type_to_string(&self, content_type: &ContentType) -> String {
669        match content_type {
670            ContentType::Code { language } => format!("code:{language:?}"),
671            ContentType::Documentation { format } => format!("doc:{format:?}"),
672            ContentType::Configuration { format } => format!("config:{format:?}"),
673            ContentType::Comment { language, context } => {
674                format!("comment:{language:?}:{context:?}")
675            }
676            ContentType::PlainText => "text".to_string(),
677        }
678    }
679
680    /// Compute fresh statistics
681    fn compute_stats(&self) -> ContentStats {
682        let mut stats = ContentStats::new();
683
684        stats.total_files = self.nodes.len();
685        stats.total_chunks = self.chunks.len();
686
687        // Count unique tokens
688        stats.total_tokens = self.token_index.len();
689
690        // Count content by type
691        for entry in self.type_index.iter() {
692            let type_name = entry.key().clone();
693            let chunk_count = entry.value().len();
694            stats.content_by_type.insert(type_name, chunk_count);
695        }
696
697        // File size distribution
698        for entry in self.nodes.iter() {
699            let node = entry.value();
700            let size_bucket = match node.file_size {
701                0..=1024 => "small (0-1KB)",
702                1025..=10240 => "medium (1-10KB)",
703                10241..=102400 => "large (10-100KB)",
704                _ => "very_large (>100KB)",
705            };
706            *stats
707                .size_distribution
708                .entry(size_bucket.to_string())
709                .or_insert(0) += 1;
710        }
711
712        stats.computed_at = SystemTime::now();
713        stats
714    }
715
716    /// Notify update listeners
717    fn notify_update(&self, update: ContentUpdate) {
718        if let Ok(listeners) = self.update_listeners.read() {
719            for listener in listeners.iter() {
720                listener.on_content_update(&update);
721            }
722        }
723    }
724}
725
726impl Default for ContentIndex {
727    fn default() -> Self {
728        Self::new()
729    }
730}
731
732/// Trait for content update listeners
733pub trait ContentUpdateListener: Send + Sync {
734    /// Called when content is updated
735    fn on_content_update(&self, update: &ContentUpdate);
736}
737
738/// Simple logging update listener
739pub struct LoggingUpdateListener;
740
741impl ContentUpdateListener for LoggingUpdateListener {
742    fn on_content_update(&self, update: &ContentUpdate) {
743        eprintln!(
744            "Content updated: {:?} at {:?}",
745            update.file_path, update.timestamp
746        );
747    }
748}
749
750#[cfg(test)]
751mod tests {
752    use super::*;
753    use crate::ast::Span;
754    use crate::content::ChunkId;
755    use crate::{ConfigFormat, DocumentFormat};
756    use std::path::Path;
757
758    fn create_test_chunk(
759        file_path: &Path,
760        content: &str,
761        content_type: ContentType,
762        chunk_index: usize,
763    ) -> ContentChunk {
764        let span = Span::new(0, content.len(), 1, 1, 1, content.len());
765        ContentChunk::new(
766            file_path.to_path_buf(),
767            content_type,
768            content.to_string(),
769            span,
770            chunk_index,
771        )
772    }
773
774    fn create_test_node(file_path: &Path, chunks: Vec<ContentChunk>) -> ContentNode {
775        let mut node = ContentNode::new(file_path.to_path_buf(), chunks[0].content_type.clone());
776        for chunk in chunks {
777            node.add_chunk(chunk);
778        }
779        node.file_size = 1000; // Dummy size
780        node
781    }
782
783    #[test]
784    fn test_content_index_creation() {
785        let index = ContentIndex::new();
786
787        // Test default implementation
788        let _index_default = ContentIndex::default();
789
790        // Initially empty
791        let stats = index.get_stats();
792        assert_eq!(stats.total_files, 0);
793        assert_eq!(stats.total_chunks, 0);
794    }
795
796    #[test]
797    fn test_add_and_get_node() {
798        let index = ContentIndex::new();
799        let file_path = Path::new("test.md");
800
801        // Create test content
802        let chunk = create_test_chunk(
803            file_path,
804            "# Test Document\n\nThis is a test.",
805            ContentType::Documentation {
806                format: DocumentFormat::Markdown,
807            },
808            0,
809        );
810        let node = create_test_node(file_path, vec![chunk]);
811
812        // Add node to index
813        let result = index.add_node(node.clone());
814        assert!(result.is_ok(), "Adding valid content node should succeed");
815
816        // Retrieve the node and verify its content
817        let retrieved_node = index.get_node(file_path);
818        assert!(
819            retrieved_node.is_some(),
820            "Should be able to retrieve added node"
821        );
822        let retrieved_node = retrieved_node.unwrap();
823        assert_eq!(
824            retrieved_node.file_path, file_path,
825            "Retrieved node should have correct file path"
826        );
827        assert_eq!(
828            retrieved_node.chunks.len(),
829            1,
830            "Retrieved node should have 1 chunk"
831        );
832
833        // Verify chunk content was preserved
834        assert_eq!(
835            retrieved_node.chunks[0].content, "# Test Document\n\nThis is a test.",
836            "Chunk content should be preserved"
837        );
838        assert!(
839            matches!(
840                retrieved_node.chunks[0].content_type,
841                ContentType::Documentation { .. }
842            ),
843            "Content type should be preserved"
844        );
845
846        // Verify index statistics updated
847        let stats = index.get_stats();
848        assert_eq!(stats.total_files, 1, "Stats should show 1 file");
849        assert_eq!(stats.total_chunks, 1, "Stats should show 1 chunk");
850    }
851
852    #[test]
853    fn test_add_node_replaces_existing() {
854        let index = ContentIndex::new();
855        let file_path = Path::new("test.md");
856
857        // Add first version
858        let chunk1 = create_test_chunk(
859            file_path,
860            "Original content",
861            ContentType::Documentation {
862                format: DocumentFormat::Markdown,
863            },
864            0,
865        );
866        let node1 = create_test_node(file_path, vec![chunk1]);
867        let _ = index.add_node(node1);
868
869        // Add updated version
870        let chunk2 = create_test_chunk(
871            file_path,
872            "Updated content",
873            ContentType::Documentation {
874                format: DocumentFormat::Markdown,
875            },
876            1,
877        );
878        let node2 = create_test_node(file_path, vec![chunk2]);
879        let _ = index.add_node(node2);
880
881        // Should have the updated content
882        let retrieved_node = index.get_node(file_path).unwrap();
883        assert_eq!(retrieved_node.chunks[0].content, "Updated content");
884    }
885
886    #[test]
887    fn test_remove_node() {
888        let index = ContentIndex::new();
889        let file_path = Path::new("test.md");
890
891        // Add a node
892        let chunk = create_test_chunk(
893            file_path,
894            "Test content",
895            ContentType::Documentation {
896                format: DocumentFormat::Markdown,
897            },
898            0,
899        );
900        let node = create_test_node(file_path, vec![chunk]);
901        let _ = index.add_node(node);
902
903        // Verify it exists
904        assert!(
905            index.get_node(file_path).is_some(),
906            "Node should exist after adding"
907        );
908        let retrieved_node = index.get_node(file_path).unwrap();
909        assert_eq!(
910            retrieved_node.file_path, file_path,
911            "Retrieved node should have correct path"
912        );
913        assert!(
914            !retrieved_node.chunks.is_empty(),
915            "Retrieved node should have chunks"
916        );
917
918        // Remove it
919        let result = index.remove_node(file_path);
920        assert!(result.is_ok(), "Operation should succeed");
921
922        // Verify it's gone
923        assert!(index.get_node(file_path).is_none());
924    }
925
926    #[test]
927    fn test_get_chunk() {
928        let index = ContentIndex::new();
929        let file_path = Path::new("test.md");
930
931        let chunk = create_test_chunk(
932            file_path,
933            "Test content",
934            ContentType::Documentation {
935                format: DocumentFormat::Markdown,
936            },
937            42,
938        );
939        let chunk_id = chunk.id;
940        let node = create_test_node(file_path, vec![chunk]);
941
942        let _ = index.add_node(node);
943
944        // Should be able to retrieve chunk by ID
945        let retrieved_chunk = index.get_chunk(&chunk_id);
946        assert!(retrieved_chunk.is_some(), "Should have value");
947        assert_eq!(retrieved_chunk.unwrap().content, "Test content");
948
949        // Non-existent chunk should return None
950        let fake_chunk_id = ChunkId::new(Path::new("nonexistent.md"), 9999, &[0u8; 32]);
951        let non_existent = index.get_chunk(&fake_chunk_id);
952        assert!(non_existent.is_none(), "Should be none");
953    }
954
955    #[test]
956    fn test_simple_text_search() {
957        let index = ContentIndex::new();
958
959        // Add some test content
960        let file1 = Path::new("doc1.md");
961        let chunk1 = create_test_chunk(
962            file1,
963            "This is a test document about programming",
964            ContentType::Documentation {
965                format: DocumentFormat::Markdown,
966            },
967            1,
968        );
969        let node1 = create_test_node(file1, vec![chunk1]);
970        let _ = index.add_node(node1);
971
972        let file2 = Path::new("doc2.md");
973        let chunk2 = create_test_chunk(
974            file2,
975            "Another document for testing purposes",
976            ContentType::Documentation {
977                format: DocumentFormat::Markdown,
978            },
979            2,
980        );
981        let node2 = create_test_node(file2, vec![chunk2]);
982        let _ = index.add_node(node2);
983
984        // Search for "document" (which should be in both)
985        let search_query = SearchQuery {
986            query: "document".to_string(),
987            max_results: 10,
988            ..Default::default()
989        };
990
991        let results = index.search(&search_query).unwrap();
992        assert!(!results.is_empty(), "Should not be empty");
993
994        // Should find matches in both documents
995        let result_contents: Vec<_> = results.iter().map(|r| &r.chunk.content).collect();
996        assert!(result_contents
997            .iter()
998            .any(|content| content.contains("programming")));
999        assert!(result_contents
1000            .iter()
1001            .any(|content| content.contains("testing")));
1002    }
1003
1004    #[test]
1005    fn test_regex_search() {
1006        let index = ContentIndex::new();
1007
1008        // Add content with email addresses
1009        let file_path = Path::new("contacts.md");
1010        let chunk = create_test_chunk(
1011            file_path,
1012            "Contact John at john@example.com or Mary at mary@test.org",
1013            ContentType::Documentation {
1014                format: DocumentFormat::Markdown,
1015            },
1016            1,
1017        );
1018        let node = create_test_node(file_path, vec![chunk]);
1019        let _ = index.add_node(node);
1020
1021        // Search with regex pattern
1022        let search_query = SearchQuery {
1023            query: r"\b\w+@\w+\.\w+\b".to_string(),
1024            use_regex: true,
1025            max_results: 10,
1026            ..Default::default()
1027        };
1028
1029        let results = index.search(&search_query).unwrap();
1030        assert!(!results.is_empty(), "Should not be empty");
1031
1032        // Should find email matches
1033        let result = &results[0];
1034        assert!(!result.matches.is_empty(), "Should not be empty");
1035    }
1036
1037    #[test]
1038    fn test_search_with_content_type_filter() {
1039        let index = ContentIndex::new();
1040
1041        // Add different content types
1042        let md_file = Path::new("doc.md");
1043        let md_chunk = create_test_chunk(
1044            md_file,
1045            "Documentation content",
1046            ContentType::Documentation {
1047                format: DocumentFormat::Markdown,
1048            },
1049            1,
1050        );
1051        let md_node = create_test_node(md_file, vec![md_chunk]);
1052        let _ = index.add_node(md_node);
1053
1054        let json_file = Path::new("config.json");
1055        let json_chunk = create_test_chunk(
1056            json_file,
1057            r#"{"config": "content"}"#,
1058            ContentType::Configuration {
1059                format: ConfigFormat::Json,
1060            },
1061            2,
1062        );
1063        let json_node = create_test_node(json_file, vec![json_chunk]);
1064        let _ = index.add_node(json_node);
1065
1066        // Search only in documentation
1067        let search_query = SearchQuery {
1068            query: "content".to_string(),
1069            content_types: vec![ContentType::Documentation {
1070                format: DocumentFormat::Markdown,
1071            }],
1072            max_results: 10,
1073            ..Default::default()
1074        };
1075
1076        let results = index.search(&search_query).unwrap();
        assert_eq!(results.len(), 1, "Should have exactly 1 result");
1078        assert!(results[0].chunk.content.contains("Documentation"));
1079    }
1080
1081    #[test]
1082    fn test_search_with_file_patterns() {
1083        let index = ContentIndex::new();
1084
1085        // Add files with different extensions
1086        let md_file = Path::new("test.md");
1087        let md_chunk = create_test_chunk(
1088            md_file,
1089            "Markdown content",
1090            ContentType::Documentation {
1091                format: DocumentFormat::Markdown,
1092            },
1093            1,
1094        );
1095        let md_node = create_test_node(md_file, vec![md_chunk]);
1096        let _ = index.add_node(md_node);
1097
1098        let txt_file = Path::new("test.txt");
1099        let txt_chunk = create_test_chunk(
1100            txt_file,
1101            "Text content",
1102            ContentType::Documentation {
1103                format: DocumentFormat::PlainText,
1104            },
1105            2,
1106        );
1107        let txt_node = create_test_node(txt_file, vec![txt_chunk]);
1108        let _ = index.add_node(txt_node);
1109
1110        // Search only in .md files
1111        let search_query = SearchQuery {
1112            query: "content".to_string(),
1113            file_patterns: vec!["*.md".to_string()],
1114            max_results: 10,
1115            ..Default::default()
1116        };
1117
1118        let results = index.search(&search_query).unwrap();
        assert_eq!(results.len(), 1, "Should have exactly 1 result");
1120        assert!(results[0].chunk.content.contains("Markdown"));
1121    }
1122
1123    #[test]
1124    fn test_search_with_exclude_patterns() {
1125        let index = ContentIndex::new();
1126
1127        // Add test files
1128        let md_file = Path::new("test.md");
1129        let md_chunk = create_test_chunk(
1130            md_file,
1131            "Markdown content",
1132            ContentType::Documentation {
1133                format: DocumentFormat::Markdown,
1134            },
1135            1,
1136        );
1137        let md_node = create_test_node(md_file, vec![md_chunk]);
1138        let _ = index.add_node(md_node);
1139
1140        let tmp_file = Path::new("temp.tmp");
1141        let tmp_chunk = create_test_chunk(
1142            tmp_file,
1143            "Temporary content",
1144            ContentType::Documentation {
1145                format: DocumentFormat::PlainText,
1146            },
1147            2,
1148        );
1149        let tmp_node = create_test_node(tmp_file, vec![tmp_chunk]);
1150        let _ = index.add_node(tmp_node);
1151
1152        // Search excluding .tmp files
1153        let search_query = SearchQuery {
1154            query: "content".to_string(),
1155            exclude_patterns: vec!["*.tmp".to_string()],
1156            max_results: 10,
1157            ..Default::default()
1158        };
1159
1160        let results = index.search(&search_query).unwrap();
        assert_eq!(results.len(), 1, "Should have exactly 1 result");
1162        assert!(results[0].chunk.content.contains("Markdown"));
1163    }
1164
1165    #[test]
1166    fn test_search_with_context() {
1167        let index = ContentIndex::new();
1168
1169        let file_path = Path::new("test.md");
1170        let content = "Line 1\nLine 2 with target\nLine 3\nLine 4";
1171        let chunk = create_test_chunk(
1172            file_path,
1173            content,
1174            ContentType::Documentation {
1175                format: DocumentFormat::Markdown,
1176            },
1177            1,
1178        );
1179        let node = create_test_node(file_path, vec![chunk]);
1180        let _ = index.add_node(node);
1181
1182        // Search with context
1183        let search_query = SearchQuery {
1184            query: "target".to_string(),
1185            include_context: true,
1186            context_lines: 1,
1187            max_results: 10,
1188            ..Default::default()
1189        };
1190
1191        let results = index.search(&search_query).unwrap();
1192        assert!(!results.is_empty(), "Should not be empty");
1193
1194        let result = &results[0];
1195        assert!(!result.matches.is_empty(), "Should not be empty");
1196
1197        // Should have context before and after
1198        let search_match = &result.matches[0];
1199        assert!(search_match.context_before.is_some(), "Should have value");
1200        assert!(search_match.context_after.is_some(), "Should have value");
1201    }
1202
1203    #[test]
1204    fn test_search_case_sensitive() {
1205        let index = ContentIndex::new();
1206
1207        let file_path = Path::new("test.md");
1208        let chunk = create_test_chunk(
1209            file_path,
1210            "Test with UPPERCASE and lowercase",
1211            ContentType::Documentation {
1212                format: DocumentFormat::Markdown,
1213            },
1214            1,
1215        );
1216        let node = create_test_node(file_path, vec![chunk]);
1217        let _ = index.add_node(node);
1218
1219        // Case sensitive search
1220        let search_query = SearchQuery {
1221            query: "UPPERCASE".to_string(),
1222            case_sensitive: true,
1223            max_results: 10,
1224            ..Default::default()
1225        };
1226
1227        let results = index.search(&search_query).unwrap();
1228        assert!(!results.is_empty(), "Should not be empty");
1229
1230        // Should not match lowercase
1231        let search_query_lower = SearchQuery {
1232            query: "uppercase".to_string(),
1233            case_sensitive: true,
1234            max_results: 10,
1235            ..Default::default()
1236        };
1237
1238        let results_lower = index.search(&search_query_lower).unwrap();
1239        assert!(
1240            results_lower.is_empty(),
1241            "Should be empty for case mismatch"
1242        );
1243    }
1244
1245    #[test]
1246    fn test_search_max_results() {
1247        let index = ContentIndex::new();
1248
1249        // Add multiple documents with the same term
1250        for i in 0..10 {
1251            let file_path = PathBuf::from(format!("doc{i}.md"));
1252            let chunk = create_test_chunk(
1253                &file_path,
1254                &format!("Document {i} contains the search term"),
1255                ContentType::Documentation {
1256                    format: DocumentFormat::Markdown,
1257                },
1258                i,
1259            );
1260            let node = create_test_node(&file_path, vec![chunk]);
1261            let _ = index.add_node(node);
1262        }
1263
1264        // Search with max results limit
1265        let search_query = SearchQuery {
1266            query: "search".to_string(),
1267            max_results: 3,
1268            ..Default::default()
1269        };
1270
1271        let results = index.search(&search_query).unwrap();
1272        assert_eq!(results.len(), 3, "Should have 3 items");
1273    }
1274
1275    #[test]
1276    fn test_find_files() {
1277        let index = ContentIndex::new();
1278
1279        // Add files with different patterns
1280        let files = ["test_one.md", "test_two.md", "other.txt", "config.json"];
1281        for (i, file_name) in files.iter().enumerate() {
1282            let file_path = Path::new(file_name);
1283            let chunk = create_test_chunk(
1284                file_path,
1285                &format!("Content {i}"),
1286                ContentType::Documentation {
1287                    format: DocumentFormat::Markdown,
1288                },
1289                i,
1290            );
1291            let node = create_test_node(file_path, vec![chunk]);
1292            let _ = index.add_node(node);
1293        }
1294
1295        // Find markdown files
1296        let md_files = index.find_files(r"\.md$").unwrap();
1297        assert_eq!(md_files.len(), 2, "Should have 2 items");
1298
1299        // Find test files
1300        let test_files = index.find_files(r"test_").unwrap();
1301        assert_eq!(test_files.len(), 2, "Should have 2 items");
1302
1303        // Find all files
1304        let all_files = index.find_files(r".*").unwrap();
1305        assert_eq!(all_files.len(), 4, "Should have 4 items");
1306    }
1307
1308    #[test]
1309    fn test_content_stats() {
1310        let index = ContentIndex::new();
1311
1312        // Initially empty
1313        let stats = index.get_stats();
1314        assert_eq!(stats.total_files, 0);
1315        assert_eq!(stats.total_chunks, 0);
1316
1317        // Add some content
1318        let file1 = Path::new("doc1.md");
1319        let chunk1 = create_test_chunk(
1320            file1,
1321            "First document",
1322            ContentType::Documentation {
1323                format: DocumentFormat::Markdown,
1324            },
1325            1,
1326        );
1327        let node1 = create_test_node(file1, vec![chunk1]);
1328        let _ = index.add_node(node1);
1329
1330        let file2 = Path::new("doc2.md");
1331        let chunk2a = create_test_chunk(
1332            file2,
1333            "Second document first chunk",
1334            ContentType::Documentation {
1335                format: DocumentFormat::Markdown,
1336            },
1337            2,
1338        );
1339        let chunk2b = create_test_chunk(
1340            file2,
1341            "Second document second chunk",
1342            ContentType::Documentation {
1343                format: DocumentFormat::Markdown,
1344            },
1345            3,
1346        );
1347        let node2 = create_test_node(file2, vec![chunk2a, chunk2b]);
1348        let _ = index.add_node(node2);
1349
1350        // Check updated stats
1351        let stats = index.get_stats();
1352        assert_eq!(stats.total_files, 2);
1353        assert_eq!(stats.total_chunks, 3);
1354    }
1355
1356    #[test]
1357    fn test_content_update_listeners() {
1358        struct TestListener {
1359            updates: Arc<std::sync::Mutex<Vec<ContentUpdate>>>,
1360        }
1361
1362        impl ContentUpdateListener for TestListener {
1363            fn on_content_update(&self, update: &ContentUpdate) {
1364                self.updates.lock().unwrap().push(update.clone());
1365            }
1366        }
1367
1368        let index = ContentIndex::new();
1369        let updates = Arc::new(std::sync::Mutex::new(Vec::new()));
1370        let listener = TestListener {
1371            updates: updates.clone(),
1372        };
1373
1374        index.add_update_listener(Box::new(listener));
1375
1376        // Add a node
1377        let file_path = Path::new("test.md");
1378        let chunk = create_test_chunk(
1379            file_path,
1380            "Test content",
1381            ContentType::Documentation {
1382                format: DocumentFormat::Markdown,
1383            },
1384            1,
1385        );
1386        let node = create_test_node(file_path, vec![chunk]);
1387        let _ = index.add_node(node);
1388
1389        // Should have received update notification
1390        let updates = updates.lock().unwrap();
        assert_eq!(updates.len(), 1, "Should have received exactly 1 update");
1392        assert_eq!(updates[0].file_path, file_path);
1393        assert!(matches!(
1394            updates[0].update_kind,
1395            ContentUpdateKind::Modified
1396        ));
1397    }
1398
1399    #[test]
1400    fn test_clear() {
1401        let index = ContentIndex::new();
1402
1403        // Add some content
1404        let file_path = Path::new("test.md");
1405        let chunk = create_test_chunk(
1406            file_path,
1407            "Test content",
1408            ContentType::Documentation {
1409                format: DocumentFormat::Markdown,
1410            },
1411            1,
1412        );
1413        let node = create_test_node(file_path, vec![chunk]);
1414        let _ = index.add_node(node);
1415
1416        // Verify content exists and validate its properties
1417        assert!(
1418            index.get_node(file_path).is_some(),
1419            "Node should exist after adding"
1420        );
1421        let retrieved_node = index.get_node(file_path).unwrap();
1422        assert_eq!(
1423            retrieved_node.file_path, file_path,
1424            "Retrieved node should have correct file path"
1425        );
1426        assert!(
1427            !retrieved_node.chunks.is_empty(),
1428            "Retrieved node should have chunks"
1429        );
1430        assert_eq!(
1431            retrieved_node.chunks[0].content, "Test content",
1432            "Chunk should have correct content"
1433        );
1434
1435        let stats = index.get_stats();
1436        assert!(
1437            stats.total_files > 0,
1438            "Stats should show files after adding content"
1439        );
1440        assert_eq!(stats.total_files, 1, "Should have exactly 1 file");
1441
1442        // Clear all content
1443        index.clear();
1444
1445        // Verify content is gone
1446        assert!(index.get_node(file_path).is_none());
1447        let stats = index.get_stats();
1448        assert_eq!(stats.total_files, 0);
1449        assert_eq!(stats.total_chunks, 0);
1450    }
1451
1452    #[test]
1453    fn test_invalid_regex_search() {
1454        let index = ContentIndex::new();
1455
1456        // Add some content
1457        let file_path = Path::new("test.md");
1458        let chunk = create_test_chunk(
1459            file_path,
1460            "Test content",
1461            ContentType::Documentation {
1462                format: DocumentFormat::Markdown,
1463            },
1464            1,
1465        );
1466        let node = create_test_node(file_path, vec![chunk]);
1467        let _ = index.add_node(node);
1468
1469        // Try search with invalid regex
1470        let search_query = SearchQuery {
1471            query: "[invalid".to_string(),
1472            use_regex: true,
1473            max_results: 10,
1474            ..Default::default()
1475        };
1476
1477        let result = index.search(&search_query);
1478        assert!(result.is_err());
1479    }
1480
1481    #[test]
1482    fn test_logging_update_listener() {
1483        let listener = LoggingUpdateListener;
1484        let update = ContentUpdate {
1485            file_path: PathBuf::from("test.md"),
1486            update_kind: ContentUpdateKind::Modified,
1487            timestamp: SystemTime::now(),
1488        };
1489
1490        // Should not panic
1491        listener.on_content_update(&update);
1492    }
1493
1494    #[test]
1495    fn test_line_info_calculation() {
1496        let index = ContentIndex::new();
1497
1498        let content = "Line 1\nLine 2\nLine 3 with text\nLine 4";
1499        let position = content.find("text").unwrap();
1500
1501        let (line, column) = index.calculate_line_info(content, position);
1502        assert_eq!(line, 3); // Line number (1-indexed)
1503        assert!(column > 1); // Column position
1504    }
1505
1506    #[test]
1507    fn test_context_extraction() {
1508        let index = ContentIndex::new();
1509
1510        let content = "Line 1\nLine 2\nLine 3 target\nLine 4\nLine 5";
1511        let position = content.find("target").unwrap();
1512
1513        // Test context before
1514        let context_before = index.get_context_before(content, position, 1);
1515        assert!(context_before.is_some(), "Should have value");
1516        assert!(context_before.unwrap().contains("Line 2"));
1517
1518        // Test context after
1519        let context_after = index.get_context_after(content, position + 6, 1);
1520        assert!(context_after.is_some(), "Should have value");
1521        assert!(context_after.unwrap().contains("Line 4"));
1522
1523        // Test with zero context lines
1524        let no_context = index.get_context_before(content, position, 0);
1525        assert!(no_context.is_none(), "Should be none");
1526    }
1527
1528    #[test]
1529    fn test_relevance_score_calculation() {
1530        let index = ContentIndex::new();
1531
1532        let file_path = Path::new("test.md");
1533        let chunk = create_test_chunk(
1534            file_path,
1535            "Test document with multiple test occurrences",
1536            ContentType::Documentation {
1537                format: DocumentFormat::Markdown,
1538            },
1539            1,
1540        );
1541
1542        let matches = vec![
1543            SearchMatch {
1544                text: "test".to_string(),
1545                position: 0,
1546                line_number: 1,
1547                column_number: 1,
1548                context_before: None,
1549                context_after: None,
1550            },
1551            SearchMatch {
1552                text: "test".to_string(),
1553                position: 30,
1554                line_number: 1,
1555                column_number: 31,
1556                context_before: None,
1557                context_after: None,
1558            },
1559        ];
1560
1561        let query = SearchQuery {
1562            query: "test".to_string(),
1563            ..Default::default()
1564        };
1565
1566        let score = index.calculate_relevance_score(&chunk, &matches, &query);
1567        assert!(score > 0.0);
1568
1569        // More matches should give higher score
1570        let single_match = vec![matches[0].clone()];
1571        let single_score = index.calculate_relevance_score(&chunk, &single_match, &query);
1572        assert!(score > single_score);
1573    }
1574}