codeprism_core/content/
index.rs

1//! Content indexing for fast search and retrieval
2//!
3//! This module provides efficient indexing of content chunks with support for
4//! full-text search, pattern matching, and content type filtering.
5
6use super::{
7    ChunkId, ContentChunk, ContentNode, ContentStats, ContentType, ContentUpdate,
8    ContentUpdateKind, SearchMatch, SearchQuery, SearchResult,
9};
10
11use anyhow::Result;
12use dashmap::DashMap;
13use regex::Regex;
14use std::collections::HashSet;
15use std::path::{Path, PathBuf};
16use std::sync::{Arc, RwLock};
17use std::time::SystemTime;
18
/// Content index for fast search and retrieval
///
/// All state lives in concurrent maps or behind `RwLock`s, so every method
/// takes `&self`.
pub struct ContentIndex {
    /// Content nodes indexed by file path (the node's own `file_path`)
    nodes: DashMap<PathBuf, ContentNode>,
    /// Content chunks indexed by chunk ID
    chunks: DashMap<ChunkId, ContentChunk>,
    /// Token index for full-text search: each token found in a chunk's
    /// `tokens` maps to the IDs of the chunks that contain it
    token_index: DashMap<String, HashSet<ChunkId>>,
    /// File pattern index for file discovery; keys are the lowercased file
    /// name, `*.<extension>` and each lowercased path component
    file_index: DashMap<String, HashSet<PathBuf>>,
    /// Content type index for filtering; keys are produced by
    /// `content_type_to_string`
    type_index: DashMap<String, HashSet<ChunkId>>,
    /// Statistics cache; `None` means the statistics must be recomputed
    stats_cache: Arc<RwLock<Option<ContentStats>>>,
    /// Update listeners, called in registration order by `notify_update`
    update_listeners: Arc<RwLock<Vec<Box<dyn ContentUpdateListener>>>>,
}
36
37impl ContentIndex {
38    /// Create a new content index
39    pub fn new() -> Self {
40        Self {
41            nodes: DashMap::new(),
42            chunks: DashMap::new(),
43            token_index: DashMap::new(),
44            file_index: DashMap::new(),
45            type_index: DashMap::new(),
46            stats_cache: Arc::new(RwLock::new(None)),
47            update_listeners: Arc::new(RwLock::new(Vec::new())),
48        }
49    }
50
51    /// Add a content node to the index
52    pub fn add_node(&self, node: ContentNode) -> Result<()> {
53        let file_path = node.file_path.clone();
54
55        // Remove existing node and its chunks
56        if let Some(old_node) = self.nodes.get(&file_path) {
57            for chunk in &old_node.chunks {
58                self.remove_chunk_from_indexes(&chunk.id);
59            }
60        }
61
62        // Index all chunks in the node
63        for chunk in &node.chunks {
64            self.add_chunk_to_indexes(chunk.clone())?;
65        }
66
67        // Index the file pattern
68        self.index_file_pattern(&file_path);
69
70        // Store the node
71        self.nodes.insert(file_path.clone(), node);
72
73        // Invalidate stats cache
74        *self.stats_cache.write().unwrap() = None;
75
76        // Notify listeners
77        self.notify_update(ContentUpdate {
78            file_path,
79            update_kind: ContentUpdateKind::Modified,
80            timestamp: SystemTime::now(),
81        });
82
83        Ok(())
84    }
85
86    /// Remove a content node from the index
87    pub fn remove_node(&self, file_path: &Path) -> Result<()> {
88        if let Some((_, node)) = self.nodes.remove(file_path) {
89            // Remove all chunks from indexes
90            for chunk in &node.chunks {
91                self.remove_chunk_from_indexes(&chunk.id);
92            }
93
94            // Remove file pattern
95            self.remove_file_pattern(file_path);
96
97            // Invalidate stats cache
98            *self.stats_cache.write().unwrap() = None;
99
100            // Notify listeners
101            self.notify_update(ContentUpdate {
102                file_path: file_path.to_path_buf(),
103                update_kind: ContentUpdateKind::Deleted,
104                timestamp: SystemTime::now(),
105            });
106        }
107
108        Ok(())
109    }
110
111    /// Get a content node by file path
112    pub fn get_node(&self, file_path: &Path) -> Option<ContentNode> {
113        self.nodes.get(file_path).map(|entry| entry.value().clone())
114    }
115
116    /// Get a content chunk by ID
117    pub fn get_chunk(&self, chunk_id: &ChunkId) -> Option<ContentChunk> {
118        self.chunks.get(chunk_id).map(|entry| entry.value().clone())
119    }
120
121    /// Search for content
122    pub fn search(&self, query: &SearchQuery) -> Result<Vec<SearchResult>> {
123        let mut results = Vec::new();
124        let mut seen_chunks = HashSet::new();
125
126        // Prepare search regex if needed
127        let search_regex = if query.use_regex {
128            Some(Regex::new(&query.query)?)
129        } else {
130            None
131        };
132
133        // Get candidate chunks based on search strategy
134        let candidate_chunks = if query.use_regex {
135            self.search_by_regex(search_regex.as_ref().unwrap(), query)?
136        } else {
137            self.search_by_tokens(&query.query, query)?
138        };
139
140        // Process candidates and create results
141        for chunk_id in candidate_chunks {
142            if seen_chunks.contains(&chunk_id) {
143                continue;
144            }
145            seen_chunks.insert(chunk_id);
146
147            if let Some(chunk) = self.get_chunk(&chunk_id) {
148                // Filter by content type
149                if !query.content_types.is_empty()
150                    && !self.matches_content_type(&chunk.content_type, &query.content_types)
151                {
152                    continue;
153                }
154
155                // Filter by file patterns
156                if !self.matches_file_patterns(
157                    &chunk.file_path,
158                    &query.file_patterns,
159                    &query.exclude_patterns,
160                )? {
161                    continue;
162                }
163
164                // Find matches within the chunk
165                let matches = self.find_matches_in_chunk(&chunk, query, &search_regex)?;
166                if !matches.is_empty() {
167                    let score = self.calculate_relevance_score(&chunk, &matches, query);
168                    results.push(SearchResult {
169                        chunk: chunk.clone(),
170                        score,
171                        matches,
172                        related_nodes: chunk.related_nodes.clone(),
173                    });
174                }
175            }
176
177            if results.len() >= query.max_results {
178                break;
179            }
180        }
181
182        // Sort by relevance score
183        results.sort_by(|a, b| {
184            b.score
185                .partial_cmp(&a.score)
186                .unwrap_or(std::cmp::Ordering::Equal)
187        });
188
189        Ok(results)
190    }
191
192    /// Find files by pattern
193    pub fn find_files(&self, pattern: &str) -> Result<Vec<PathBuf>> {
194        let pattern_regex = Regex::new(pattern)?;
195        let mut matching_files = Vec::new();
196
197        for entry in self.nodes.iter() {
198            let file_path = entry.key();
199            if pattern_regex.is_match(&file_path.to_string_lossy()) {
200                matching_files.push(file_path.clone());
201            }
202        }
203
204        Ok(matching_files)
205    }
206
207    /// Get content statistics
208    pub fn get_stats(&self) -> ContentStats {
209        // Try to use cached stats
210        if let Ok(cache) = self.stats_cache.read() {
211            if let Some(stats) = cache.as_ref() {
212                return stats.clone();
213            }
214        }
215
216        // Compute fresh stats
217        let stats = self.compute_stats();
218
219        // Cache the stats
220        if let Ok(mut cache) = self.stats_cache.write() {
221            *cache = Some(stats.clone());
222        }
223
224        stats
225    }
226
227    /// Add content update listener
228    pub fn add_update_listener(&self, listener: Box<dyn ContentUpdateListener>) {
229        if let Ok(mut listeners) = self.update_listeners.write() {
230            listeners.push(listener);
231        }
232    }
233
234    /// Clear all content from the index
235    pub fn clear(&self) {
236        self.nodes.clear();
237        self.chunks.clear();
238        self.token_index.clear();
239        self.file_index.clear();
240        self.type_index.clear();
241        *self.stats_cache.write().unwrap() = None;
242    }
243
244    // Private helper methods
245
246    /// Add a chunk to all relevant indexes
247    fn add_chunk_to_indexes(&self, chunk: ContentChunk) -> Result<()> {
248        let chunk_id = chunk.id;
249
250        // Add to token index
251        for token in &chunk.tokens {
252            self.token_index
253                .entry(token.clone())
254                .or_default()
255                .insert(chunk_id);
256        }
257
258        // Add to content type index
259        let type_key = self.content_type_to_string(&chunk.content_type);
260        self.type_index
261            .entry(type_key)
262            .or_default()
263            .insert(chunk_id);
264
265        // Store the chunk
266        self.chunks.insert(chunk_id, chunk);
267
268        Ok(())
269    }
270
271    /// Remove a chunk from all indexes
272    fn remove_chunk_from_indexes(&self, chunk_id: &ChunkId) {
273        // Remove from chunk storage
274        if let Some((_, chunk)) = self.chunks.remove(chunk_id) {
275            // Remove from token index
276            for token in &chunk.tokens {
277                if let Some(mut token_set) = self.token_index.get_mut(token) {
278                    token_set.remove(chunk_id);
279                    if token_set.is_empty() {
280                        drop(token_set);
281                        self.token_index.remove(token);
282                    }
283                }
284            }
285
286            // Remove from content type index
287            let type_key = self.content_type_to_string(&chunk.content_type);
288            if let Some(mut type_set) = self.type_index.get_mut(&type_key) {
289                type_set.remove(chunk_id);
290                if type_set.is_empty() {
291                    drop(type_set);
292                    self.type_index.remove(&type_key);
293                }
294            }
295        }
296    }
297
298    /// Index file pattern for discovery
299    fn index_file_pattern(&self, file_path: &Path) {
300        let file_name = file_path
301            .file_name()
302            .and_then(|name| name.to_str())
303            .unwrap_or("");
304
305        let extension = file_path
306            .extension()
307            .and_then(|ext| ext.to_str())
308            .unwrap_or("");
309
310        // Index by filename
311        self.file_index
312            .entry(file_name.to_lowercase())
313            .or_default()
314            .insert(file_path.to_path_buf());
315
316        // Index by extension
317        if !extension.is_empty() {
318            self.file_index
319                .entry(format!("*.{}", extension.to_lowercase()))
320                .or_default()
321                .insert(file_path.to_path_buf());
322        }
323
324        // Index by full path components
325        for component in file_path.components() {
326            if let Some(component_str) = component.as_os_str().to_str() {
327                self.file_index
328                    .entry(component_str.to_lowercase())
329                    .or_default()
330                    .insert(file_path.to_path_buf());
331            }
332        }
333    }
334
335    /// Remove file pattern from index
336    fn remove_file_pattern(&self, file_path: &Path) {
337        let file_name = file_path
338            .file_name()
339            .and_then(|name| name.to_str())
340            .unwrap_or("");
341
342        let extension = file_path
343            .extension()
344            .and_then(|ext| ext.to_str())
345            .unwrap_or("");
346
347        // Remove from filename index
348        if let Some(mut file_set) = self.file_index.get_mut(&file_name.to_lowercase()) {
349            file_set.remove(file_path);
350            if file_set.is_empty() {
351                drop(file_set);
352                self.file_index.remove(&file_name.to_lowercase());
353            }
354        }
355
356        // Remove from extension index
357        if !extension.is_empty() {
358            let ext_key = format!("*.{}", extension.to_lowercase());
359            if let Some(mut ext_set) = self.file_index.get_mut(&ext_key) {
360                ext_set.remove(file_path);
361                if ext_set.is_empty() {
362                    drop(ext_set);
363                    self.file_index.remove(&ext_key);
364                }
365            }
366        }
367    }
368
369    /// Search by token matching
370    fn search_by_tokens(&self, query: &str, _search_query: &SearchQuery) -> Result<Vec<ChunkId>> {
371        let query_tokens: Vec<String> = query
372            .to_lowercase()
373            .split_whitespace()
374            .map(|s| s.to_string())
375            .collect();
376
377        if query_tokens.is_empty() {
378            return Ok(Vec::new());
379        }
380
381        let mut result_chunks: Option<HashSet<ChunkId>> = None;
382
383        // Find intersection of chunks containing all query tokens
384        for token in &query_tokens {
385            if let Some(chunk_set) = self.token_index.get(token) {
386                let chunk_ids: HashSet<ChunkId> = chunk_set.iter().copied().collect();
387                result_chunks = Some(match result_chunks {
388                    None => chunk_ids,
389                    Some(existing) => existing.intersection(&chunk_ids).copied().collect(),
390                });
391            } else {
392                // If any token is not found, no results
393                return Ok(Vec::new());
394            }
395        }
396
397        Ok(result_chunks.unwrap_or_default().into_iter().collect())
398    }
399
400    /// Search by regex pattern
401    fn search_by_regex(&self, regex: &Regex, search_query: &SearchQuery) -> Result<Vec<ChunkId>> {
402        let mut matching_chunks = Vec::new();
403
404        for entry in self.chunks.iter() {
405            let chunk = entry.value();
406            let content = if search_query.case_sensitive {
407                &chunk.content
408            } else {
409                &chunk.content.to_lowercase()
410            };
411
412            if regex.is_match(content) {
413                matching_chunks.push(chunk.id);
414            }
415        }
416
417        Ok(matching_chunks)
418    }
419
420    /// Find matches within a chunk
421    fn find_matches_in_chunk(
422        &self,
423        chunk: &ContentChunk,
424        query: &SearchQuery,
425        regex: &Option<Regex>,
426    ) -> Result<Vec<SearchMatch>> {
427        let mut matches = Vec::new();
428        let content = if query.case_sensitive {
429            chunk.content.clone()
430        } else {
431            chunk.content.to_lowercase()
432        };
433
434        let search_term = if query.case_sensitive {
435            query.query.clone()
436        } else {
437            query.query.to_lowercase()
438        };
439
440        if let Some(regex) = regex {
441            // Regex search
442            for regex_match in regex.find_iter(&content) {
443                let line_info = self.calculate_line_info(&content, regex_match.start());
444                let search_match = SearchMatch {
445                    text: regex_match.as_str().to_string(),
446                    position: regex_match.start(),
447                    line_number: line_info.0,
448                    column_number: line_info.1,
449                    context_before: if query.include_context {
450                        self.get_context_before(&content, regex_match.start(), query.context_lines)
451                    } else {
452                        None
453                    },
454                    context_after: if query.include_context {
455                        self.get_context_after(&content, regex_match.end(), query.context_lines)
456                    } else {
457                        None
458                    },
459                };
460                matches.push(search_match);
461            }
462        } else {
463            // Simple text search
464            let mut start = 0;
465            while let Some(pos) = content[start..].find(&search_term) {
466                let absolute_pos = start + pos;
467                let line_info = self.calculate_line_info(&content, absolute_pos);
468                let search_match = SearchMatch {
469                    text: search_term.clone(),
470                    position: absolute_pos,
471                    line_number: line_info.0,
472                    column_number: line_info.1,
473                    context_before: if query.include_context {
474                        self.get_context_before(&content, absolute_pos, query.context_lines)
475                    } else {
476                        None
477                    },
478                    context_after: if query.include_context {
479                        self.get_context_after(
480                            &content,
481                            absolute_pos + search_term.len(),
482                            query.context_lines,
483                        )
484                    } else {
485                        None
486                    },
487                };
488                matches.push(search_match);
489                start = absolute_pos + 1;
490            }
491        }
492
493        Ok(matches)
494    }
495
496    /// Calculate line and column information for a position
497    fn calculate_line_info(&self, content: &str, position: usize) -> (usize, usize) {
498        let before_position = &content[..position.min(content.len())];
499        let line_number = before_position.lines().count();
500        let column_number = before_position
501            .lines()
502            .last()
503            .map(|line| line.len() + 1)
504            .unwrap_or(1);
505        (line_number, column_number)
506    }
507
508    /// Get context lines before a position
509    fn get_context_before(
510        &self,
511        content: &str,
512        position: usize,
513        context_lines: usize,
514    ) -> Option<String> {
515        if context_lines == 0 {
516            return None;
517        }
518
519        let lines: Vec<&str> = content.lines().collect();
520        let (line_number, _) = self.calculate_line_info(content, position);
521
522        if line_number == 0 {
523            return None;
524        }
525
526        let start_line = line_number.saturating_sub(context_lines + 1);
527        let end_line = line_number.saturating_sub(1);
528
529        if start_line >= lines.len() || end_line >= lines.len() || start_line > end_line {
530            return None;
531        }
532
533        Some(lines[start_line..=end_line].join("\n"))
534    }
535
536    /// Get context lines after a position
537    fn get_context_after(
538        &self,
539        content: &str,
540        position: usize,
541        context_lines: usize,
542    ) -> Option<String> {
543        if context_lines == 0 {
544            return None;
545        }
546
547        let lines: Vec<&str> = content.lines().collect();
548        let (line_number, _) = self.calculate_line_info(content, position);
549
550        let start_line = line_number;
551        let end_line = (start_line + context_lines).min(lines.len().saturating_sub(1));
552
553        if start_line >= lines.len() || start_line > end_line {
554            return None;
555        }
556
557        Some(lines[start_line..=end_line].join("\n"))
558    }
559
560    /// Calculate relevance score for a search result
561    fn calculate_relevance_score(
562        &self,
563        chunk: &ContentChunk,
564        matches: &[SearchMatch],
565        _query: &SearchQuery,
566    ) -> f32 {
567        if matches.is_empty() {
568            return 0.0;
569        }
570
571        // Base score from content type relevance (0.2-0.8)
572        let type_score = match &chunk.content_type {
573            ContentType::Documentation { .. } => 0.8,
574            ContentType::Comment { context, .. } => match context {
575                super::CommentContext::Documentation => 0.7,
576                super::CommentContext::Function { .. } => 0.6,
577                super::CommentContext::Class { .. } => 0.6,
578                _ => 0.4,
579            },
580            ContentType::Code { .. } => 0.5,
581            ContentType::Configuration { .. } => 0.4,
582            ContentType::PlainText => 0.2,
583        };
584
585        // Match frequency bonus (0.1 per match)
586        let match_bonus = matches.len() as f32 * 0.1;
587
588        // Calculate final score and normalize to 0.0-1.0 range
589        (type_score + match_bonus).min(1.0)
590    }
591
592    /// Check if content type matches query filters
593    fn matches_content_type(
594        &self,
595        content_type: &ContentType,
596        allowed_types: &[ContentType],
597    ) -> bool {
598        allowed_types
599            .iter()
600            .any(|allowed| std::mem::discriminant(content_type) == std::mem::discriminant(allowed))
601    }
602
603    /// Check if file path matches include/exclude patterns
604    fn matches_file_patterns(
605        &self,
606        file_path: &Path,
607        include_patterns: &[String],
608        exclude_patterns: &[String],
609    ) -> Result<bool> {
610        let path_str = file_path.to_string_lossy();
611
612        // Check exclude patterns first
613        for pattern in exclude_patterns {
614            let regex_pattern = self.glob_to_regex(pattern);
615            let regex = Regex::new(&regex_pattern)?;
616            if regex.is_match(&path_str) {
617                return Ok(false);
618            }
619        }
620
621        // If no include patterns, include by default
622        if include_patterns.is_empty() {
623            return Ok(true);
624        }
625
626        // Check include patterns
627        for pattern in include_patterns {
628            let regex_pattern = self.glob_to_regex(pattern);
629            let regex = Regex::new(&regex_pattern)?;
630            if regex.is_match(&path_str) {
631                return Ok(true);
632            }
633        }
634
635        Ok(false)
636    }
637
638    /// Convert glob pattern to regex pattern
639    fn glob_to_regex(&self, glob: &str) -> String {
640        let mut regex = String::new();
641        regex.push('^');
642
643        for ch in glob.chars() {
644            match ch {
645                '*' => regex.push_str(".*"),
646                '?' => regex.push('.'),
647                '.' => regex.push_str("\\."),
648                '+' => regex.push_str("\\+"),
649                '^' => regex.push_str("\\^"),
650                '$' => regex.push_str("\\$"),
651                '(' => regex.push_str("\\("),
652                ')' => regex.push_str("\\)"),
653                '[' => regex.push_str("\\["),
654                ']' => regex.push_str("\\]"),
655                '{' => regex.push_str("\\{"),
656                '}' => regex.push_str("\\}"),
657                '|' => regex.push_str("\\|"),
658                '\\' => regex.push_str("\\\\"),
659                c => regex.push(c),
660            }
661        }
662
663        regex.push('$');
664        regex
665    }
666
667    /// Convert content type to string for indexing
668    fn content_type_to_string(&self, content_type: &ContentType) -> String {
669        match content_type {
670            ContentType::Code { language } => format!("code:{:?}", language),
671            ContentType::Documentation { format } => format!("doc:{:?}", format),
672            ContentType::Configuration { format } => format!("config:{:?}", format),
673            ContentType::Comment { language, context } => {
674                format!("comment:{:?}:{:?}", language, context)
675            }
676            ContentType::PlainText => "text".to_string(),
677        }
678    }
679
680    /// Compute fresh statistics
681    fn compute_stats(&self) -> ContentStats {
682        let mut stats = ContentStats::new();
683
684        stats.total_files = self.nodes.len();
685        stats.total_chunks = self.chunks.len();
686
687        // Count unique tokens
688        stats.total_tokens = self.token_index.len();
689
690        // Count content by type
691        for entry in self.type_index.iter() {
692            let type_name = entry.key().clone();
693            let chunk_count = entry.value().len();
694            stats.content_by_type.insert(type_name, chunk_count);
695        }
696
697        // File size distribution
698        for entry in self.nodes.iter() {
699            let node = entry.value();
700            let size_bucket = match node.file_size {
701                0..=1024 => "small (0-1KB)",
702                1025..=10240 => "medium (1-10KB)",
703                10241..=102400 => "large (10-100KB)",
704                _ => "very_large (>100KB)",
705            };
706            *stats
707                .size_distribution
708                .entry(size_bucket.to_string())
709                .or_insert(0) += 1;
710        }
711
712        stats.computed_at = SystemTime::now();
713        stats
714    }
715
716    /// Notify update listeners
717    fn notify_update(&self, update: ContentUpdate) {
718        if let Ok(listeners) = self.update_listeners.read() {
719            for listener in listeners.iter() {
720                listener.on_content_update(&update);
721            }
722        }
723    }
724}
725
726impl Default for ContentIndex {
727    fn default() -> Self {
728        Self::new()
729    }
730}
731
/// Trait for content update listeners
///
/// Listeners are registered with `ContentIndex::add_update_listener` and must
/// be `Send + Sync`.
pub trait ContentUpdateListener: Send + Sync {
    /// Called when content is updated
    ///
    /// `ContentIndex` calls this after a node has been added or removed,
    /// while it holds a read lock on its listener list.
    // NOTE(review): registering another listener on the same index from
    // inside this callback would need the write lock on that list and
    // presumably blocks - confirm before relying on it.
    fn on_content_update(&self, update: &ContentUpdate);
}
737
738/// Simple logging update listener
739pub struct LoggingUpdateListener;
740
741impl ContentUpdateListener for LoggingUpdateListener {
742    fn on_content_update(&self, update: &ContentUpdate) {
743        eprintln!(
744            "Content updated: {:?} at {:?}",
745            update.file_path, update.timestamp
746        );
747    }
748}
749
750#[cfg(test)]
751mod tests {
752    use super::*;
753    use crate::ast::Span;
754    use crate::content::ChunkId;
755    use crate::{ConfigFormat, DocumentFormat};
756    use std::path::Path;
757
758    fn create_test_chunk(
759        file_path: &Path,
760        content: &str,
761        content_type: ContentType,
762        chunk_index: usize,
763    ) -> ContentChunk {
764        let span = Span::new(0, content.len(), 1, 1, 1, content.len());
765        ContentChunk::new(
766            file_path.to_path_buf(),
767            content_type,
768            content.to_string(),
769            span,
770            chunk_index,
771        )
772    }
773
774    fn create_test_node(file_path: &Path, chunks: Vec<ContentChunk>) -> ContentNode {
775        let mut node = ContentNode::new(file_path.to_path_buf(), chunks[0].content_type.clone());
776        for chunk in chunks {
777            node.add_chunk(chunk);
778        }
779        node.file_size = 1000; // Dummy size
780        node
781    }
782
783    #[test]
784    fn test_content_index_creation() {
785        let index = ContentIndex::new();
786
787        // Test default implementation
788        let _index_default = ContentIndex::default();
789
790        // Initially empty
791        let stats = index.get_stats();
792        assert_eq!(stats.total_files, 0);
793        assert_eq!(stats.total_chunks, 0);
794    }
795
796    #[test]
797    fn test_add_and_get_node() {
798        let index = ContentIndex::new();
799        let file_path = Path::new("test.md");
800
801        // Create test content
802        let chunk = create_test_chunk(
803            file_path,
804            "# Test Document\n\nThis is a test.",
805            ContentType::Documentation {
806                format: DocumentFormat::Markdown,
807            },
808            0,
809        );
810        let node = create_test_node(file_path, vec![chunk]);
811
812        // Add node to index
813        let result = index.add_node(node.clone());
814        assert!(result.is_ok());
815
816        // Retrieve the node
817        let retrieved_node = index.get_node(file_path);
818        assert!(retrieved_node.is_some());
819        let retrieved_node = retrieved_node.unwrap();
820        assert_eq!(retrieved_node.file_path, file_path);
821        assert_eq!(retrieved_node.chunks.len(), 1);
822    }
823
824    #[test]
825    fn test_add_node_replaces_existing() {
826        let index = ContentIndex::new();
827        let file_path = Path::new("test.md");
828
829        // Add first version
830        let chunk1 = create_test_chunk(
831            file_path,
832            "Original content",
833            ContentType::Documentation {
834                format: DocumentFormat::Markdown,
835            },
836            0,
837        );
838        let node1 = create_test_node(file_path, vec![chunk1]);
839        let _ = index.add_node(node1);
840
841        // Add updated version
842        let chunk2 = create_test_chunk(
843            file_path,
844            "Updated content",
845            ContentType::Documentation {
846                format: DocumentFormat::Markdown,
847            },
848            1,
849        );
850        let node2 = create_test_node(file_path, vec![chunk2]);
851        let _ = index.add_node(node2);
852
853        // Should have the updated content
854        let retrieved_node = index.get_node(file_path).unwrap();
855        assert_eq!(retrieved_node.chunks[0].content, "Updated content");
856    }
857
858    #[test]
859    fn test_remove_node() {
860        let index = ContentIndex::new();
861        let file_path = Path::new("test.md");
862
863        // Add a node
864        let chunk = create_test_chunk(
865            file_path,
866            "Test content",
867            ContentType::Documentation {
868                format: DocumentFormat::Markdown,
869            },
870            0,
871        );
872        let node = create_test_node(file_path, vec![chunk]);
873        let _ = index.add_node(node);
874
875        // Verify it exists
876        assert!(index.get_node(file_path).is_some());
877
878        // Remove it
879        let result = index.remove_node(file_path);
880        assert!(result.is_ok());
881
882        // Verify it's gone
883        assert!(index.get_node(file_path).is_none());
884    }
885
886    #[test]
887    fn test_get_chunk() {
888        let index = ContentIndex::new();
889        let file_path = Path::new("test.md");
890
891        let chunk = create_test_chunk(
892            file_path,
893            "Test content",
894            ContentType::Documentation {
895                format: DocumentFormat::Markdown,
896            },
897            42,
898        );
899        let chunk_id = chunk.id;
900        let node = create_test_node(file_path, vec![chunk]);
901
902        let _ = index.add_node(node);
903
904        // Should be able to retrieve chunk by ID
905        let retrieved_chunk = index.get_chunk(&chunk_id);
906        assert!(retrieved_chunk.is_some());
907        assert_eq!(retrieved_chunk.unwrap().content, "Test content");
908
909        // Non-existent chunk should return None
910        let fake_chunk_id = ChunkId::new(Path::new("nonexistent.md"), 9999, &[0u8; 32]);
911        let non_existent = index.get_chunk(&fake_chunk_id);
912        assert!(non_existent.is_none());
913    }
914
915    #[test]
916    fn test_simple_text_search() {
917        let index = ContentIndex::new();
918
919        // Add some test content
920        let file1 = Path::new("doc1.md");
921        let chunk1 = create_test_chunk(
922            file1,
923            "This is a test document about programming",
924            ContentType::Documentation {
925                format: DocumentFormat::Markdown,
926            },
927            1,
928        );
929        let node1 = create_test_node(file1, vec![chunk1]);
930        let _ = index.add_node(node1);
931
932        let file2 = Path::new("doc2.md");
933        let chunk2 = create_test_chunk(
934            file2,
935            "Another document for testing purposes",
936            ContentType::Documentation {
937                format: DocumentFormat::Markdown,
938            },
939            2,
940        );
941        let node2 = create_test_node(file2, vec![chunk2]);
942        let _ = index.add_node(node2);
943
944        // Search for "document" (which should be in both)
945        let search_query = SearchQuery {
946            query: "document".to_string(),
947            max_results: 10,
948            ..Default::default()
949        };
950
951        let results = index.search(&search_query).unwrap();
952        assert!(!results.is_empty());
953
954        // Should find matches in both documents
955        let result_contents: Vec<_> = results.iter().map(|r| &r.chunk.content).collect();
956        assert!(result_contents
957            .iter()
958            .any(|content| content.contains("programming")));
959        assert!(result_contents
960            .iter()
961            .any(|content| content.contains("testing")));
962    }
963
964    #[test]
965    fn test_regex_search() {
966        let index = ContentIndex::new();
967
968        // Add content with email addresses
969        let file_path = Path::new("contacts.md");
970        let chunk = create_test_chunk(
971            file_path,
972            "Contact John at john@example.com or Mary at mary@test.org",
973            ContentType::Documentation {
974                format: DocumentFormat::Markdown,
975            },
976            1,
977        );
978        let node = create_test_node(file_path, vec![chunk]);
979        let _ = index.add_node(node);
980
981        // Search with regex pattern
982        let search_query = SearchQuery {
983            query: r"\b\w+@\w+\.\w+\b".to_string(),
984            use_regex: true,
985            max_results: 10,
986            ..Default::default()
987        };
988
989        let results = index.search(&search_query).unwrap();
990        assert!(!results.is_empty());
991
992        // Should find email matches
993        let result = &results[0];
994        assert!(!result.matches.is_empty());
995    }
996
997    #[test]
998    fn test_search_with_content_type_filter() {
999        let index = ContentIndex::new();
1000
1001        // Add different content types
1002        let md_file = Path::new("doc.md");
1003        let md_chunk = create_test_chunk(
1004            md_file,
1005            "Documentation content",
1006            ContentType::Documentation {
1007                format: DocumentFormat::Markdown,
1008            },
1009            1,
1010        );
1011        let md_node = create_test_node(md_file, vec![md_chunk]);
1012        let _ = index.add_node(md_node);
1013
1014        let json_file = Path::new("config.json");
1015        let json_chunk = create_test_chunk(
1016            json_file,
1017            r#"{"config": "content"}"#,
1018            ContentType::Configuration {
1019                format: ConfigFormat::Json,
1020            },
1021            2,
1022        );
1023        let json_node = create_test_node(json_file, vec![json_chunk]);
1024        let _ = index.add_node(json_node);
1025
1026        // Search only in documentation
1027        let search_query = SearchQuery {
1028            query: "content".to_string(),
1029            content_types: vec![ContentType::Documentation {
1030                format: DocumentFormat::Markdown,
1031            }],
1032            max_results: 10,
1033            ..Default::default()
1034        };
1035
1036        let results = index.search(&search_query).unwrap();
1037        assert_eq!(results.len(), 1);
1038        assert!(results[0].chunk.content.contains("Documentation"));
1039    }
1040
1041    #[test]
1042    fn test_search_with_file_patterns() {
1043        let index = ContentIndex::new();
1044
1045        // Add files with different extensions
1046        let md_file = Path::new("test.md");
1047        let md_chunk = create_test_chunk(
1048            md_file,
1049            "Markdown content",
1050            ContentType::Documentation {
1051                format: DocumentFormat::Markdown,
1052            },
1053            1,
1054        );
1055        let md_node = create_test_node(md_file, vec![md_chunk]);
1056        let _ = index.add_node(md_node);
1057
1058        let txt_file = Path::new("test.txt");
1059        let txt_chunk = create_test_chunk(
1060            txt_file,
1061            "Text content",
1062            ContentType::Documentation {
1063                format: DocumentFormat::PlainText,
1064            },
1065            2,
1066        );
1067        let txt_node = create_test_node(txt_file, vec![txt_chunk]);
1068        let _ = index.add_node(txt_node);
1069
1070        // Search only in .md files
1071        let search_query = SearchQuery {
1072            query: "content".to_string(),
1073            file_patterns: vec!["*.md".to_string()],
1074            max_results: 10,
1075            ..Default::default()
1076        };
1077
1078        let results = index.search(&search_query).unwrap();
1079        assert_eq!(results.len(), 1);
1080        assert!(results[0].chunk.content.contains("Markdown"));
1081    }
1082
1083    #[test]
1084    fn test_search_with_exclude_patterns() {
1085        let index = ContentIndex::new();
1086
1087        // Add test files
1088        let md_file = Path::new("test.md");
1089        let md_chunk = create_test_chunk(
1090            md_file,
1091            "Markdown content",
1092            ContentType::Documentation {
1093                format: DocumentFormat::Markdown,
1094            },
1095            1,
1096        );
1097        let md_node = create_test_node(md_file, vec![md_chunk]);
1098        let _ = index.add_node(md_node);
1099
1100        let tmp_file = Path::new("temp.tmp");
1101        let tmp_chunk = create_test_chunk(
1102            tmp_file,
1103            "Temporary content",
1104            ContentType::Documentation {
1105                format: DocumentFormat::PlainText,
1106            },
1107            2,
1108        );
1109        let tmp_node = create_test_node(tmp_file, vec![tmp_chunk]);
1110        let _ = index.add_node(tmp_node);
1111
1112        // Search excluding .tmp files
1113        let search_query = SearchQuery {
1114            query: "content".to_string(),
1115            exclude_patterns: vec!["*.tmp".to_string()],
1116            max_results: 10,
1117            ..Default::default()
1118        };
1119
1120        let results = index.search(&search_query).unwrap();
1121        assert_eq!(results.len(), 1);
1122        assert!(results[0].chunk.content.contains("Markdown"));
1123    }
1124
1125    #[test]
1126    fn test_search_with_context() {
1127        let index = ContentIndex::new();
1128
1129        let file_path = Path::new("test.md");
1130        let content = "Line 1\nLine 2 with target\nLine 3\nLine 4";
1131        let chunk = create_test_chunk(
1132            file_path,
1133            content,
1134            ContentType::Documentation {
1135                format: DocumentFormat::Markdown,
1136            },
1137            1,
1138        );
1139        let node = create_test_node(file_path, vec![chunk]);
1140        let _ = index.add_node(node);
1141
1142        // Search with context
1143        let search_query = SearchQuery {
1144            query: "target".to_string(),
1145            include_context: true,
1146            context_lines: 1,
1147            max_results: 10,
1148            ..Default::default()
1149        };
1150
1151        let results = index.search(&search_query).unwrap();
1152        assert!(!results.is_empty());
1153
1154        let result = &results[0];
1155        assert!(!result.matches.is_empty());
1156
1157        // Should have context before and after
1158        let search_match = &result.matches[0];
1159        assert!(search_match.context_before.is_some());
1160        assert!(search_match.context_after.is_some());
1161    }
1162
1163    #[test]
1164    fn test_search_case_sensitive() {
1165        let index = ContentIndex::new();
1166
1167        let file_path = Path::new("test.md");
1168        let chunk = create_test_chunk(
1169            file_path,
1170            "Test with UPPERCASE and lowercase",
1171            ContentType::Documentation {
1172                format: DocumentFormat::Markdown,
1173            },
1174            1,
1175        );
1176        let node = create_test_node(file_path, vec![chunk]);
1177        let _ = index.add_node(node);
1178
1179        // Case sensitive search
1180        let search_query = SearchQuery {
1181            query: "UPPERCASE".to_string(),
1182            case_sensitive: true,
1183            max_results: 10,
1184            ..Default::default()
1185        };
1186
1187        let results = index.search(&search_query).unwrap();
1188        assert!(!results.is_empty());
1189
1190        // Should not match lowercase
1191        let search_query_lower = SearchQuery {
1192            query: "uppercase".to_string(),
1193            case_sensitive: true,
1194            max_results: 10,
1195            ..Default::default()
1196        };
1197
1198        let results_lower = index.search(&search_query_lower).unwrap();
1199        assert!(results_lower.is_empty());
1200    }
1201
1202    #[test]
1203    fn test_search_max_results() {
1204        let index = ContentIndex::new();
1205
1206        // Add multiple documents with the same term
1207        for i in 0..10 {
1208            let file_path = PathBuf::from(format!("doc{}.md", i));
1209            let chunk = create_test_chunk(
1210                &file_path,
1211                &format!("Document {} contains the search term", i),
1212                ContentType::Documentation {
1213                    format: DocumentFormat::Markdown,
1214                },
1215                i,
1216            );
1217            let node = create_test_node(&file_path, vec![chunk]);
1218            let _ = index.add_node(node);
1219        }
1220
1221        // Search with max results limit
1222        let search_query = SearchQuery {
1223            query: "search".to_string(),
1224            max_results: 3,
1225            ..Default::default()
1226        };
1227
1228        let results = index.search(&search_query).unwrap();
1229        assert_eq!(results.len(), 3);
1230    }
1231
1232    #[test]
1233    fn test_find_files() {
1234        let index = ContentIndex::new();
1235
1236        // Add files with different patterns
1237        let files = ["test_one.md", "test_two.md", "other.txt", "config.json"];
1238        for (i, file_name) in files.iter().enumerate() {
1239            let file_path = Path::new(file_name);
1240            let chunk = create_test_chunk(
1241                file_path,
1242                &format!("Content {}", i),
1243                ContentType::Documentation {
1244                    format: DocumentFormat::Markdown,
1245                },
1246                i,
1247            );
1248            let node = create_test_node(file_path, vec![chunk]);
1249            let _ = index.add_node(node);
1250        }
1251
1252        // Find markdown files
1253        let md_files = index.find_files(r"\.md$").unwrap();
1254        assert_eq!(md_files.len(), 2);
1255
1256        // Find test files
1257        let test_files = index.find_files(r"test_").unwrap();
1258        assert_eq!(test_files.len(), 2);
1259
1260        // Find all files
1261        let all_files = index.find_files(r".*").unwrap();
1262        assert_eq!(all_files.len(), 4);
1263    }
1264
1265    #[test]
1266    fn test_content_stats() {
1267        let index = ContentIndex::new();
1268
1269        // Initially empty
1270        let stats = index.get_stats();
1271        assert_eq!(stats.total_files, 0);
1272        assert_eq!(stats.total_chunks, 0);
1273
1274        // Add some content
1275        let file1 = Path::new("doc1.md");
1276        let chunk1 = create_test_chunk(
1277            file1,
1278            "First document",
1279            ContentType::Documentation {
1280                format: DocumentFormat::Markdown,
1281            },
1282            1,
1283        );
1284        let node1 = create_test_node(file1, vec![chunk1]);
1285        let _ = index.add_node(node1);
1286
1287        let file2 = Path::new("doc2.md");
1288        let chunk2a = create_test_chunk(
1289            file2,
1290            "Second document first chunk",
1291            ContentType::Documentation {
1292                format: DocumentFormat::Markdown,
1293            },
1294            2,
1295        );
1296        let chunk2b = create_test_chunk(
1297            file2,
1298            "Second document second chunk",
1299            ContentType::Documentation {
1300                format: DocumentFormat::Markdown,
1301            },
1302            3,
1303        );
1304        let node2 = create_test_node(file2, vec![chunk2a, chunk2b]);
1305        let _ = index.add_node(node2);
1306
1307        // Check updated stats
1308        let stats = index.get_stats();
1309        assert_eq!(stats.total_files, 2);
1310        assert_eq!(stats.total_chunks, 3);
1311    }
1312
1313    #[test]
1314    fn test_content_update_listeners() {
1315        struct TestListener {
1316            updates: Arc<std::sync::Mutex<Vec<ContentUpdate>>>,
1317        }
1318
1319        impl ContentUpdateListener for TestListener {
1320            fn on_content_update(&self, update: &ContentUpdate) {
1321                self.updates.lock().unwrap().push(update.clone());
1322            }
1323        }
1324
1325        let index = ContentIndex::new();
1326        let updates = Arc::new(std::sync::Mutex::new(Vec::new()));
1327        let listener = TestListener {
1328            updates: updates.clone(),
1329        };
1330
1331        index.add_update_listener(Box::new(listener));
1332
1333        // Add a node
1334        let file_path = Path::new("test.md");
1335        let chunk = create_test_chunk(
1336            file_path,
1337            "Test content",
1338            ContentType::Documentation {
1339                format: DocumentFormat::Markdown,
1340            },
1341            1,
1342        );
1343        let node = create_test_node(file_path, vec![chunk]);
1344        let _ = index.add_node(node);
1345
1346        // Should have received update notification
1347        let updates = updates.lock().unwrap();
1348        assert_eq!(updates.len(), 1);
1349        assert_eq!(updates[0].file_path, file_path);
1350        assert!(matches!(
1351            updates[0].update_kind,
1352            ContentUpdateKind::Modified
1353        ));
1354    }
1355
1356    #[test]
1357    fn test_clear() {
1358        let index = ContentIndex::new();
1359
1360        // Add some content
1361        let file_path = Path::new("test.md");
1362        let chunk = create_test_chunk(
1363            file_path,
1364            "Test content",
1365            ContentType::Documentation {
1366                format: DocumentFormat::Markdown,
1367            },
1368            1,
1369        );
1370        let node = create_test_node(file_path, vec![chunk]);
1371        let _ = index.add_node(node);
1372
1373        // Verify content exists
1374        assert!(index.get_node(file_path).is_some());
1375        let stats = index.get_stats();
1376        assert!(stats.total_files > 0);
1377
1378        // Clear all content
1379        index.clear();
1380
1381        // Verify content is gone
1382        assert!(index.get_node(file_path).is_none());
1383        let stats = index.get_stats();
1384        assert_eq!(stats.total_files, 0);
1385        assert_eq!(stats.total_chunks, 0);
1386    }
1387
1388    #[test]
1389    fn test_invalid_regex_search() {
1390        let index = ContentIndex::new();
1391
1392        // Add some content
1393        let file_path = Path::new("test.md");
1394        let chunk = create_test_chunk(
1395            file_path,
1396            "Test content",
1397            ContentType::Documentation {
1398                format: DocumentFormat::Markdown,
1399            },
1400            1,
1401        );
1402        let node = create_test_node(file_path, vec![chunk]);
1403        let _ = index.add_node(node);
1404
1405        // Try search with invalid regex
1406        let search_query = SearchQuery {
1407            query: "[invalid".to_string(),
1408            use_regex: true,
1409            max_results: 10,
1410            ..Default::default()
1411        };
1412
1413        let result = index.search(&search_query);
1414        assert!(result.is_err());
1415    }
1416
1417    #[test]
1418    fn test_logging_update_listener() {
1419        let listener = LoggingUpdateListener;
1420        let update = ContentUpdate {
1421            file_path: PathBuf::from("test.md"),
1422            update_kind: ContentUpdateKind::Modified,
1423            timestamp: SystemTime::now(),
1424        };
1425
1426        // Should not panic
1427        listener.on_content_update(&update);
1428    }
1429
1430    #[test]
1431    fn test_line_info_calculation() {
1432        let index = ContentIndex::new();
1433
1434        let content = "Line 1\nLine 2\nLine 3 with text\nLine 4";
1435        let position = content.find("text").unwrap();
1436
1437        let (line, column) = index.calculate_line_info(content, position);
1438        assert_eq!(line, 3); // Line number (1-indexed)
1439        assert!(column > 1); // Column position
1440    }
1441
1442    #[test]
1443    fn test_context_extraction() {
1444        let index = ContentIndex::new();
1445
1446        let content = "Line 1\nLine 2\nLine 3 target\nLine 4\nLine 5";
1447        let position = content.find("target").unwrap();
1448
1449        // Test context before
1450        let context_before = index.get_context_before(content, position, 1);
1451        assert!(context_before.is_some());
1452        assert!(context_before.unwrap().contains("Line 2"));
1453
1454        // Test context after
1455        let context_after = index.get_context_after(content, position + 6, 1);
1456        assert!(context_after.is_some());
1457        assert!(context_after.unwrap().contains("Line 4"));
1458
1459        // Test with zero context lines
1460        let no_context = index.get_context_before(content, position, 0);
1461        assert!(no_context.is_none());
1462    }
1463
1464    #[test]
1465    fn test_relevance_score_calculation() {
1466        let index = ContentIndex::new();
1467
1468        let file_path = Path::new("test.md");
1469        let chunk = create_test_chunk(
1470            file_path,
1471            "Test document with multiple test occurrences",
1472            ContentType::Documentation {
1473                format: DocumentFormat::Markdown,
1474            },
1475            1,
1476        );
1477
1478        let matches = vec![
1479            SearchMatch {
1480                text: "test".to_string(),
1481                position: 0,
1482                line_number: 1,
1483                column_number: 1,
1484                context_before: None,
1485                context_after: None,
1486            },
1487            SearchMatch {
1488                text: "test".to_string(),
1489                position: 30,
1490                line_number: 1,
1491                column_number: 31,
1492                context_before: None,
1493                context_after: None,
1494            },
1495        ];
1496
1497        let query = SearchQuery {
1498            query: "test".to_string(),
1499            ..Default::default()
1500        };
1501
1502        let score = index.calculate_relevance_score(&chunk, &matches, &query);
1503        assert!(score > 0.0);
1504
1505        // More matches should give higher score
1506        let single_match = vec![matches[0].clone()];
1507        let single_score = index.calculate_relevance_score(&chunk, &single_match, &query);
1508        assert!(score > single_score);
1509    }
1510}
codeprism_core/content/index.rs

codeprism_core/content/
index.rs