1use crate::ast::{Language, NodeId, Span};
8use blake3::Hasher;
9use regex::Regex;
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12use std::path::{Path, PathBuf};
13use std::time::SystemTime;
14
15pub mod extractors;
16pub mod index;
17pub mod parsers;
18pub mod search;
19
/// Stable 128-bit identifier for a [`ContentChunk`].
///
/// Derived (see [`ChunkId::new`]) from the file path, the chunk's index
/// within the file, and a hash of the chunk content, so the same chunk
/// always gets the same id across runs.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ChunkId([u8; 16]);
23
24impl ChunkId {
25 pub fn new(file_path: &Path, chunk_index: usize, content_hash: &[u8; 32]) -> Self {
27 let mut hasher = Hasher::new();
28 hasher.update(file_path.to_string_lossy().as_bytes());
29 hasher.update(&chunk_index.to_le_bytes());
30 hasher.update(content_hash);
31
32 let hash = hasher.finalize();
33 let mut id = [0u8; 16];
34 id.copy_from_slice(&hash.as_bytes()[..16]);
35 Self(id)
36 }
37
38 pub fn to_hex(&self) -> String {
40 hex::encode(self.0)
41 }
42}
43
/// Classifies the kind of text a chunk holds; serialized in snake_case.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ContentType {
    /// Source code in a specific programming language.
    Code {
        language: Language,
    },
    /// Human-readable documentation in a known markup format.
    Documentation {
        format: DocumentFormat,
    },
    /// Configuration data in a known config format.
    Configuration {
        format: ConfigFormat,
    },
    /// A comment extracted from source code, with its surrounding context.
    Comment {
        language: Language,
        context: CommentContext,
    },
    /// Unclassified plain text.
    PlainText,
}
73
/// Markup formats recognized for documentation content.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DocumentFormat {
    Markdown,
    RestructuredText,
    AsciiDoc,
    PlainText,
    Html,
}
89
/// Configuration file formats recognized for configuration content.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConfigFormat {
    Json,
    Yaml,
    Toml,
    Ini,
    Properties,
    Env,
    Xml,
}
109
/// Where a source-code comment sits relative to the surrounding code.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CommentContext {
    /// Comment attached to a named function.
    Function {
        function_name: String,
    },
    /// Comment attached to a named class/type.
    Class {
        class_name: String,
    },
    /// Module- or file-level comment.
    Module,
    /// Comment on the same line as code.
    Inline,
    /// Free-standing block comment.
    Block,
    /// Documentation comment (doc comment / docstring).
    Documentation,
}
133
/// A contiguous piece of indexed text extracted from a single file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentChunk {
    /// Stable id derived from path, chunk index, and content hash.
    pub id: ChunkId,
    /// What kind of content this chunk holds.
    pub content_type: ContentType,
    /// The raw chunk text.
    pub content: String,
    /// Location of the chunk within its source file.
    pub span: Span,
    /// File the chunk was extracted from.
    pub file_path: PathBuf,
    /// Lowercased word tokens (length > 1) extracted from `content`.
    pub tokens: Vec<String>,
    /// AST nodes associated with this chunk (kept free of duplicates).
    pub related_nodes: Vec<NodeId>,
    /// Timestamp taken when the chunk value was constructed.
    pub last_modified: SystemTime,
    /// Free-form extra data; `serde_json::Value::Null` by default.
    pub metadata: serde_json::Value,
}
156
157impl ContentChunk {
158 pub fn new(
160 file_path: PathBuf,
161 content_type: ContentType,
162 content: String,
163 span: Span,
164 chunk_index: usize,
165 ) -> Self {
166 let content_bytes = blake3::hash(content.as_bytes());
167 let id = ChunkId::new(&file_path, chunk_index, content_bytes.as_bytes());
168
169 Self {
170 id,
171 content_type,
172 content: content.clone(),
173 span,
174 file_path,
175 tokens: Self::tokenize_content(&content),
176 related_nodes: Vec::new(),
177 last_modified: SystemTime::now(),
178 metadata: serde_json::Value::Null,
179 }
180 }
181
182 fn tokenize_content(content: &str) -> Vec<String> {
184 let re = Regex::new(r"[^\w]+").unwrap();
186 re.split(content)
187 .filter(|s| !s.is_empty() && s.len() > 1) .map(|s| s.to_lowercase())
189 .collect()
190 }
191
192 pub fn add_related_node(&mut self, node_id: NodeId) {
194 if !self.related_nodes.contains(&node_id) {
195 self.related_nodes.push(node_id);
196 }
197 }
198
199 pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
201 self.metadata = metadata;
202 self
203 }
204}
205
/// Per-file index entry: the file's extracted chunks plus bookkeeping.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentNode {
    /// Path of the indexed file.
    pub file_path: PathBuf,
    /// Dominant content type of the file.
    pub content_type: ContentType,
    /// Chunks extracted from the file, in insertion order.
    pub chunks: Vec<ContentChunk>,
    /// AST nodes associated with the file (kept free of duplicates).
    pub ast_nodes: Vec<NodeId>,
    /// Timestamp taken when the node was constructed.
    pub last_indexed: SystemTime,
    /// File size in bytes. NOTE(review): initialized to 0 and never
    /// written in this module — presumably set by the indexer; confirm.
    pub file_size: usize,
    /// Whether the file is watched for changes; `true` on construction.
    pub is_monitored: bool,
}
224
225impl ContentNode {
226 pub fn new(file_path: PathBuf, content_type: ContentType) -> Self {
228 Self {
229 file_path,
230 content_type,
231 chunks: Vec::new(),
232 ast_nodes: Vec::new(),
233 last_indexed: SystemTime::now(),
234 file_size: 0,
235 is_monitored: true,
236 }
237 }
238
239 pub fn add_chunk(&mut self, chunk: ContentChunk) {
241 self.chunks.push(chunk);
242 }
243
244 pub fn add_ast_node(&mut self, node_id: NodeId) {
246 if !self.ast_nodes.contains(&node_id) {
247 self.ast_nodes.push(node_id);
248 }
249 }
250
251 pub fn get_all_tokens(&self) -> Vec<String> {
253 let mut all_tokens = Vec::new();
254 for chunk in &self.chunks {
255 all_tokens.extend(chunk.tokens.clone());
256 }
257 all_tokens.sort();
258 all_tokens.dedup();
259 all_tokens
260 }
261
262 pub fn search(&self, query: &str, case_sensitive: bool) -> Vec<&ContentChunk> {
264 let search_query = if case_sensitive {
265 query.to_string()
266 } else {
267 query.to_lowercase()
268 };
269
270 self.chunks
271 .iter()
272 .filter(|chunk| {
273 let content = if case_sensitive {
274 &chunk.content
275 } else {
276 &chunk.content.to_lowercase()
277 };
278 content.contains(&search_query)
279 })
280 .collect()
281 }
282}
283
/// Aggregate statistics over the indexed content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentStats {
    /// Number of indexed files.
    pub total_files: usize,
    /// Number of chunks across all files.
    pub total_chunks: usize,
    /// Number of tokens across all chunks.
    pub total_tokens: usize,
    /// Chunk counts keyed by a content-type label (e.g. "code:python").
    pub content_by_type: HashMap<String, usize>,
    /// Counts keyed by a size-bucket label (e.g. "small").
    pub size_distribution: HashMap<String, usize>,
    /// When these stats were computed.
    pub computed_at: SystemTime,
}
300
301impl ContentStats {
302 pub fn new() -> Self {
304 Self {
305 total_files: 0,
306 total_chunks: 0,
307 total_tokens: 0,
308 content_by_type: HashMap::new(),
309 size_distribution: HashMap::new(),
310 computed_at: SystemTime::now(),
311 }
312 }
313}
314
impl Default for ContentStats {
    /// Equivalent to [`ContentStats::new`]: all counters zero,
    /// `computed_at` stamped with the current time.
    fn default() -> Self {
        Self::new()
    }
}
320
/// Parameters controlling a content search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchQuery {
    /// The text to search for (treated as a regex when `use_regex`).
    pub query: String,
    /// Content types to include in the search.
    pub content_types: Vec<ContentType>,
    /// File patterns to include (e.g. "*.py").
    pub file_patterns: Vec<String>,
    /// File patterns to exclude (e.g. "test_*.py").
    pub exclude_patterns: Vec<String>,
    /// Maximum number of results to return.
    pub max_results: usize,
    /// Whether matching is case sensitive.
    pub case_sensitive: bool,
    /// Treat `query` as a regular expression.
    pub use_regex: bool,
    /// Whether to attach surrounding context to matches.
    pub include_context: bool,
    /// Number of context lines around each match.
    pub context_lines: usize,
}
343
344impl Default for SearchQuery {
345 fn default() -> Self {
346 Self {
347 query: String::new(),
348 content_types: vec![
349 ContentType::Code {
350 language: Language::Unknown,
351 },
352 ContentType::Documentation {
353 format: DocumentFormat::Markdown,
354 },
355 ContentType::Comment {
356 language: Language::Unknown,
357 context: CommentContext::Block,
358 },
359 ],
360 file_patterns: Vec::new(),
361 exclude_patterns: Vec::new(),
362 max_results: 100,
363 case_sensitive: false,
364 use_regex: false,
365 include_context: true,
366 context_lines: 2,
367 }
368 }
369}
370
/// One chunk matched by a search, with its score and match locations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// The matching chunk (owned copy).
    pub chunk: ContentChunk,
    /// Relevance score for ranking results.
    pub score: f32,
    /// Individual occurrences of the query inside the chunk.
    pub matches: Vec<SearchMatch>,
    /// AST nodes related to this result.
    pub related_nodes: Vec<NodeId>,
}
383
/// A single occurrence of the query inside a chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMatch {
    /// The matched text.
    pub text: String,
    /// Offset of the match within the chunk content.
    /// NOTE(review): byte vs. char offset is not pinned down here — confirm
    /// against the search implementation.
    pub position: usize,
    /// Line number of the match (1-based in the tests).
    pub line_number: usize,
    /// Column number of the match (1-based in the tests).
    pub column_number: usize,
    /// Text immediately before the match, when context was requested.
    pub context_before: Option<String>,
    /// Text immediately after the match, when context was requested.
    pub context_after: Option<String>,
}
400
/// A file-system change notification for indexed content.
#[derive(Debug, Clone)]
pub struct ContentUpdate {
    /// File affected by the change (the new path for renames).
    pub file_path: PathBuf,
    /// What happened to the file.
    pub update_kind: ContentUpdateKind,
    /// When the change was observed.
    pub timestamp: SystemTime,
}
411
/// The kind of change observed on a file.
#[derive(Debug, Clone)]
pub enum ContentUpdateKind {
    /// File newly created.
    Created,
    /// File contents changed.
    Modified,
    /// File removed.
    Deleted,
    /// File moved; `old_path` is its previous location.
    Renamed {
        old_path: PathBuf,
    },
}
427
#[cfg(test)]
mod tests {
    //! Unit tests for chunk ids, content typing, tokenization, per-node
    //! search, and the serde round trips of every public enum.
    use super::*;
    use crate::ast::NodeKind;

    // ChunkId must be deterministic in its inputs, sensitive to the chunk
    // index, and render as 32 hex digits (16 bytes).
    #[test]
    fn test_chunk_id_generation() {
        let file_path = PathBuf::from("test.md");
        let content_hash = [0u8; 32];

        let id1 = ChunkId::new(&file_path, 0, &content_hash);
        let id2 = ChunkId::new(&file_path, 0, &content_hash);
        let id3 = ChunkId::new(&file_path, 1, &content_hash);

        assert_eq!(id1, id2, "Same inputs should generate same ID");
        assert_ne!(
            id1, id3,
            "Different chunk index should generate different ID"
        );

        let hex = id1.to_hex();
        assert_eq!(hex.len(), 32, "Hex string should be 32 characters");
        assert!(
            hex.chars().all(|c| c.is_ascii_hexdigit()),
            "Should be valid hex"
        );
    }

    // Every ContentType variant must survive a serde JSON round trip.
    // Only discriminants are compared here; payload equality is covered by
    // the per-enum tests below.
    #[test]
    fn test_content_types_serialization() {
        let test_cases = vec![
            ContentType::Code {
                language: Language::Python,
            },
            ContentType::Documentation {
                format: DocumentFormat::Markdown,
            },
            ContentType::Configuration {
                format: ConfigFormat::Json,
            },
            ContentType::Comment {
                language: Language::JavaScript,
                context: CommentContext::Function {
                    function_name: "test".to_string(),
                },
            },
            ContentType::PlainText,
        ];

        for content_type in test_cases {
            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();
            assert_eq!(
                std::mem::discriminant(&content_type),
                std::mem::discriminant(&deserialized),
                "Serialization roundtrip failed for: {content_type:?}"
            );
        }
    }

    // Constructing a chunk should tokenize its content into lowercased
    // words.
    #[test]
    fn test_content_chunk_creation() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let content = "# Test Header\nSome content here.".to_string();
        let span = Span::new(0, content.len(), 1, 2, 1, 19);

        let chunk = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            content.clone(),
            span,
            0,
        );

        assert_eq!(chunk.file_path, file_path);
        assert_eq!(chunk.content, content);
        assert!(
            !chunk.tokens.is_empty(),
            "Should extract tokens from content"
        );
        assert!(
            chunk.tokens.contains(&"test".to_string()),
            "Should extract 'test' token"
        );
        assert!(
            chunk.tokens.contains(&"header".to_string()),
            "Should extract 'header' token"
        );
        assert!(
            chunk.tokens.contains(&"content".to_string()),
            "Should extract 'content' token"
        );
    }

    // Tokenization splits identifiers on underscores and drops
    // single-character tokens such as punctuation.
    #[test]
    fn test_content_chunk_tokenization() {
        let file_path = PathBuf::from("test.py");
        let content_type = ContentType::Code {
            language: Language::Python,
        };
        let content = "def hello_world():\n    print('Hello, World!')".to_string();
        let span = Span::new(0, content.len(), 1, 2, 1, 26);

        let chunk = ContentChunk::new(file_path, content_type, content, span, 0);

        assert!(chunk.tokens.contains(&"def".to_string()));
        assert!(chunk.tokens.contains(&"hello".to_string()));
        assert!(chunk.tokens.contains(&"world".to_string()));
        assert!(chunk.tokens.contains(&"print".to_string()));
        assert!(
            !chunk.tokens.contains(&"(".to_string()),
            "Should filter out single chars"
        );
    }

    // Adding chunks and AST nodes, and aggregating tokens across chunks.
    #[test]
    fn test_content_node_operations() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let mut node = ContentNode::new(file_path.clone(), content_type.clone());

        assert_eq!(node.file_path, file_path);
        assert_eq!(node.chunks.len(), 0, "Should have 0 items");
        assert_eq!(node.ast_nodes.len(), 0, "Should have 0 items");

        let chunk = ContentChunk::new(
            file_path.clone(),
            content_type,
            "Test content".to_string(),
            Span::new(0, 12, 1, 1, 1, 13),
            0,
        );
        node.add_chunk(chunk);

        assert_eq!(node.chunks.len(), 1, "Should have 1 items");

        let node_id = NodeId::new(
            "test",
            &file_path,
            &Span::new(0, 5, 1, 1, 1, 6),
            &NodeKind::Function,
        );
        node.add_ast_node(node_id);

        assert_eq!(node.ast_nodes.len(), 1, "Should have 1 items");
        assert_eq!(node.ast_nodes[0], node_id);

        let tokens = node.get_all_tokens();
        assert!(tokens.contains(&"test".to_string()));
        assert!(tokens.contains(&"content".to_string()));
    }

    // Substring search across a node's chunks, in both case modes.
    #[test]
    fn test_content_node_search() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let mut node = ContentNode::new(file_path.clone(), content_type.clone());

        let chunk1 = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            "First test content".to_string(),
            Span::new(0, 18, 1, 1, 1, 19),
            0,
        );
        let chunk2 = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            "Second example content".to_string(),
            Span::new(19, 41, 2, 2, 1, 23),
            1,
        );
        node.add_chunk(chunk1);
        node.add_chunk(chunk2);

        let results = node.search("TEST", false);
        assert_eq!(results.len(), 1, "Should find 'test' case-insensitively");

        let results = node.search("content", false);
        assert_eq!(results.len(), 2, "Should find 'content' in both chunks");

        let results = node.search("TEST", true);
        assert_eq!(results.len(), 0, "Should not find 'TEST' case-sensitively");

        let results = node.search("First", true);
        assert_eq!(results.len(), 1, "Should find exact case match");
    }

    // Pin the documented defaults of SearchQuery.
    #[test]
    fn test_search_query_default() {
        let query = SearchQuery::default();

        assert_eq!(query.query, "");
        assert_eq!(query.max_results, 100);
        assert!(!query.case_sensitive);
        assert!(!query.use_regex);
        assert!(query.include_context);
        assert_eq!(query.context_lines, 2);
        assert_eq!(query.content_types.len(), 3, "Should have 3 items");
    }

    // A fully-specified query keeps every field as provided.
    #[test]
    fn test_search_query_builder() {
        let query = SearchQuery {
            query: "test query".to_string(),
            content_types: vec![ContentType::Code {
                language: Language::Python,
            }],
            file_patterns: vec!["*.py".to_string()],
            exclude_patterns: vec!["test_*.py".to_string()],
            max_results: 25,
            case_sensitive: true,
            use_regex: true,
            include_context: false,
            context_lines: 5,
        };

        assert_eq!(query.query, "test query");
        assert_eq!(query.max_results, 25);
        assert!(query.case_sensitive);
        assert!(query.use_regex);
        assert!(!query.include_context);
        assert_eq!(query.context_lines, 5);
        assert_eq!(query.file_patterns, vec!["*.py"]);
        assert_eq!(query.exclude_patterns, vec!["test_*.py"]);
    }

    // Stats start zeroed/empty and accept direct field updates.
    #[test]
    fn test_content_stats_creation() {
        let mut stats = ContentStats::new();

        assert_eq!(stats.total_files, 0);
        assert_eq!(stats.total_chunks, 0);
        assert_eq!(stats.total_tokens, 0);
        assert!(
            stats.content_by_type.is_empty(),
            "Should be empty initially"
        );
        assert!(
            stats.size_distribution.is_empty(),
            "Should be empty initially"
        );

        stats.total_files = 10;
        stats.total_chunks = 50;
        stats.total_tokens = 1000;
        stats.content_by_type.insert("code:python".to_string(), 15);
        stats.size_distribution.insert("small".to_string(), 8);

        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.total_chunks, 50);
        assert_eq!(stats.total_tokens, 1000);
    }

    // SearchResult/SearchMatch carry the chunk, score, and positions.
    #[test]
    fn test_search_result_structure() {
        let file_path = PathBuf::from("test.md");
        let chunk = ContentChunk::new(
            file_path,
            ContentType::Documentation {
                format: DocumentFormat::Markdown,
            },
            "Test content with query match".to_string(),
            Span::new(0, 29, 1, 1, 1, 30),
            0,
        );

        let search_match = SearchMatch {
            text: "query".to_string(),
            position: 18,
            line_number: 1,
            column_number: 19,
            context_before: Some("Test content with ".to_string()),
            context_after: Some(" match".to_string()),
        };

        let result = SearchResult {
            chunk: chunk.clone(),
            score: 0.85,
            matches: vec![search_match.clone()],
            related_nodes: vec![],
        };

        assert_eq!(result.score, 0.85);
        assert_eq!(result.matches.len(), 1, "Should have 1 items");
        assert_eq!(result.matches[0].text, "query");
        assert_eq!(result.matches[0].position, 18);
        assert_eq!(result.chunk.content, chunk.content);
    }

    // Each CommentContext variant must survive a serde round trip inside
    // ContentType::Comment (discriminant comparison only).
    #[test]
    fn test_comment_context_variants() {
        let contexts = vec![
            CommentContext::Function {
                function_name: "test_func".to_string(),
            },
            CommentContext::Class {
                class_name: "TestClass".to_string(),
            },
            CommentContext::Module,
            CommentContext::Inline,
            CommentContext::Block,
            CommentContext::Documentation,
        ];

        for context in contexts {
            let content_type = ContentType::Comment {
                language: Language::Python,
                context: context.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Comment {
                context: deserialized_context,
                ..
            } = deserialized
            {
                assert_eq!(
                    std::mem::discriminant(&context),
                    std::mem::discriminant(&deserialized_context),
                    "Context variant should match after serialization"
                );
            } else {
                panic!("Expected Comment content type");
            }
        }
    }

    // Each DocumentFormat variant must round-trip with full equality.
    #[test]
    fn test_document_format_variants() {
        let formats = vec![
            DocumentFormat::Markdown,
            DocumentFormat::RestructuredText,
            DocumentFormat::AsciiDoc,
            DocumentFormat::PlainText,
            DocumentFormat::Html,
        ];

        for format in formats {
            let content_type = ContentType::Documentation {
                format: format.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Documentation {
                format: deserialized_format,
            } = deserialized
            {
                assert_eq!(
                    format, deserialized_format,
                    "Format should match after serialization"
                );
            } else {
                panic!("Expected Documentation content type");
            }
        }
    }

    // Each ConfigFormat variant must round-trip with full equality.
    #[test]
    fn test_config_format_variants() {
        let formats = vec![
            ConfigFormat::Json,
            ConfigFormat::Yaml,
            ConfigFormat::Toml,
            ConfigFormat::Ini,
            ConfigFormat::Properties,
            ConfigFormat::Env,
            ConfigFormat::Xml,
        ];

        for format in formats {
            let content_type = ContentType::Configuration {
                format: format.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Configuration {
                format: deserialized_format,
            } = deserialized
            {
                assert_eq!(
                    format, deserialized_format,
                    "Format should match after serialization"
                );
            } else {
                panic!("Expected Configuration content type");
            }
        }
    }

    // All four update kinds can be constructed; Renamed keeps the old path.
    #[test]
    fn test_content_update_kinds() {
        let file_path = PathBuf::from("test.md");
        let old_path = PathBuf::from("old_test.md");

        let updates = vec![
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Created,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Modified,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Deleted,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Renamed {
                    old_path: old_path.clone(),
                },
                timestamp: SystemTime::now(),
            },
        ];

        for update in updates {
            assert_eq!(update.file_path, file_path);
            assert!(update.timestamp <= SystemTime::now());

            match &update.update_kind {
                ContentUpdateKind::Created => {}
                ContentUpdateKind::Modified => {}
                ContentUpdateKind::Deleted => {}
                ContentUpdateKind::Renamed {
                    old_path: renamed_old_path,
                } => {
                    assert_eq!(renamed_old_path, &old_path);
                }
            }
        }
    }
}