1use crate::ast::{Language, NodeId, Span};
8use blake3::Hasher;
9use regex::Regex;
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12use std::path::{Path, PathBuf};
13use std::time::SystemTime;
14
15pub mod extractors;
16pub mod index;
17pub mod parsers;
18pub mod search;
19
/// Stable 128-bit identifier for a content chunk.
///
/// Built from the owning file path, the chunk's index within that file, and
/// a hash of the chunk content (see `ChunkId::new`), so the same chunk
/// always receives the same id across runs.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ChunkId([u8; 16]);
23
24impl ChunkId {
25 pub fn new(file_path: &Path, chunk_index: usize, content_hash: &[u8; 32]) -> Self {
27 let mut hasher = Hasher::new();
28 hasher.update(file_path.to_string_lossy().as_bytes());
29 hasher.update(&chunk_index.to_le_bytes());
30 hasher.update(content_hash);
31
32 let hash = hasher.finalize();
33 let mut id = [0u8; 16];
34 id.copy_from_slice(&hash.as_bytes()[..16]);
35 Self(id)
36 }
37
38 pub fn to_hex(&self) -> String {
40 hex::encode(self.0)
41 }
42}
43
/// Classifies what kind of text a chunk holds, with per-kind detail.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ContentType {
    /// Source code in a specific programming language.
    Code {
        language: Language,
    },
    /// Documentation text in a given markup format.
    Documentation {
        format: DocumentFormat,
    },
    /// Configuration data in a given file format.
    Configuration {
        format: ConfigFormat,
    },
    /// A code comment, tagged with its language and surrounding context.
    Comment {
        language: Language,
        context: CommentContext,
    },
    /// Unstructured plain text.
    PlainText,
}
73
/// Markup format of a documentation file.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DocumentFormat {
    Markdown,
    RestructuredText,
    AsciiDoc,
    PlainText,
    Html,
}
89
/// File format of a configuration file.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConfigFormat {
    Json,
    Yaml,
    Toml,
    Ini,
    Properties,
    Env,
    Xml,
}
109
/// Where a comment sits relative to the surrounding code.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CommentContext {
    /// Comment attached to a named function.
    Function {
        function_name: String,
    },
    /// Comment attached to a named class.
    Class {
        class_name: String,
    },
    /// Module-level comment.
    Module,
    /// Inline comment (e.g. end-of-line).
    Inline,
    /// Block comment.
    Block,
    /// Documentation comment (doc-string / doc-comment syntax).
    Documentation,
}
133
/// A single indexed span of text extracted from a file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentChunk {
    /// Stable id derived from path, chunk index, and content hash.
    pub id: ChunkId,
    /// What kind of text this chunk holds.
    pub content_type: ContentType,
    /// The raw chunk text.
    pub content: String,
    /// Location of the chunk within its source file.
    pub span: Span,
    /// File the chunk was extracted from.
    pub file_path: PathBuf,
    /// Lowercased word tokens (length > 1) extracted from `content`.
    pub tokens: Vec<String>,
    /// AST nodes associated with this chunk (deduplicated on insert).
    pub related_nodes: Vec<NodeId>,
    /// When this chunk was created or last refreshed.
    pub last_modified: SystemTime,
    /// Free-form extra data; `Null` when unset.
    pub metadata: serde_json::Value,
}
156
157impl ContentChunk {
158 pub fn new(
160 file_path: PathBuf,
161 content_type: ContentType,
162 content: String,
163 span: Span,
164 chunk_index: usize,
165 ) -> Self {
166 let content_bytes = blake3::hash(content.as_bytes());
167 let id = ChunkId::new(&file_path, chunk_index, content_bytes.as_bytes());
168
169 Self {
170 id,
171 content_type,
172 content: content.clone(),
173 span,
174 file_path,
175 tokens: Self::tokenize_content(&content),
176 related_nodes: Vec::new(),
177 last_modified: SystemTime::now(),
178 metadata: serde_json::Value::Null,
179 }
180 }
181
182 fn tokenize_content(content: &str) -> Vec<String> {
184 let re = Regex::new(r"[^\w]+").unwrap();
186 re.split(content)
187 .filter(|s| !s.is_empty() && s.len() > 1) .map(|s| s.to_lowercase())
189 .collect()
190 }
191
192 pub fn add_related_node(&mut self, node_id: NodeId) {
194 if !self.related_nodes.contains(&node_id) {
195 self.related_nodes.push(node_id);
196 }
197 }
198
199 pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
201 self.metadata = metadata;
202 self
203 }
204}
205
/// Per-file aggregation of indexed chunks and associated AST nodes.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentNode {
    /// File this node describes.
    pub file_path: PathBuf,
    /// Content type assigned to the file as a whole.
    pub content_type: ContentType,
    /// Chunks extracted from the file, in insertion order.
    pub chunks: Vec<ContentChunk>,
    /// AST nodes linked to the file (deduplicated on insert).
    pub ast_nodes: Vec<NodeId>,
    /// When the file was last indexed.
    pub last_indexed: SystemTime,
    /// File size in bytes; starts at 0 until set by the indexer.
    pub file_size: usize,
    /// Whether the file is watched for changes; defaults to true.
    pub is_monitored: bool,
}
224
225impl ContentNode {
226 pub fn new(file_path: PathBuf, content_type: ContentType) -> Self {
228 Self {
229 file_path,
230 content_type,
231 chunks: Vec::new(),
232 ast_nodes: Vec::new(),
233 last_indexed: SystemTime::now(),
234 file_size: 0,
235 is_monitored: true,
236 }
237 }
238
239 pub fn add_chunk(&mut self, chunk: ContentChunk) {
241 self.chunks.push(chunk);
242 }
243
244 pub fn add_ast_node(&mut self, node_id: NodeId) {
246 if !self.ast_nodes.contains(&node_id) {
247 self.ast_nodes.push(node_id);
248 }
249 }
250
251 pub fn get_all_tokens(&self) -> Vec<String> {
253 let mut all_tokens = Vec::new();
254 for chunk in &self.chunks {
255 all_tokens.extend(chunk.tokens.clone());
256 }
257 all_tokens.sort();
258 all_tokens.dedup();
259 all_tokens
260 }
261
262 pub fn search(&self, query: &str, case_sensitive: bool) -> Vec<&ContentChunk> {
264 let search_query = if case_sensitive {
265 query.to_string()
266 } else {
267 query.to_lowercase()
268 };
269
270 self.chunks
271 .iter()
272 .filter(|chunk| {
273 let content = if case_sensitive {
274 &chunk.content
275 } else {
276 &chunk.content.to_lowercase()
277 };
278 content.contains(&search_query)
279 })
280 .collect()
281 }
282}
283
284#[derive(Debug, Clone, Serialize, Deserialize)]
/// Aggregate counters describing an index snapshot.
pub struct ContentStats {
    /// Number of files indexed.
    pub total_files: usize,
    /// Number of chunks across all files.
    pub total_chunks: usize,
    /// Number of tokens across all chunks.
    pub total_tokens: usize,
    /// Chunk counts keyed by a content-type label (e.g. "code:python").
    pub content_by_type: HashMap<String, usize>,
    /// Counts keyed by a size-bucket label (e.g. "small").
    pub size_distribution: HashMap<String, usize>,
    /// When these statistics were computed.
    pub computed_at: SystemTime,
}
300
301impl ContentStats {
302 pub fn new() -> Self {
304 Self {
305 total_files: 0,
306 total_chunks: 0,
307 total_tokens: 0,
308 content_by_type: HashMap::new(),
309 size_distribution: HashMap::new(),
310 computed_at: SystemTime::now(),
311 }
312 }
313}
314
315impl Default for ContentStats {
316 fn default() -> Self {
317 Self::new()
318 }
319}
320
/// Parameters controlling a content search.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchQuery {
    /// The search text (treated as a regex when `use_regex` is set).
    pub query: String,
    /// Restrict results to these content types.
    pub content_types: Vec<ContentType>,
    /// Patterns files must match (e.g. "*.py").
    pub file_patterns: Vec<String>,
    /// Patterns that exclude files (e.g. "test_*.py").
    pub exclude_patterns: Vec<String>,
    /// Maximum number of results to return.
    pub max_results: usize,
    /// Match case exactly when true.
    pub case_sensitive: bool,
    /// Treat `query` as a regular expression when true.
    pub use_regex: bool,
    /// Attach surrounding context to matches when true.
    pub include_context: bool,
    /// Number of context lines around each match.
    pub context_lines: usize,
}
343
344impl Default for SearchQuery {
345 fn default() -> Self {
346 Self {
347 query: String::new(),
348 content_types: vec![
349 ContentType::Code {
350 language: Language::Unknown,
351 },
352 ContentType::Documentation {
353 format: DocumentFormat::Markdown,
354 },
355 ContentType::Comment {
356 language: Language::Unknown,
357 context: CommentContext::Block,
358 },
359 ],
360 file_patterns: Vec::new(),
361 exclude_patterns: Vec::new(),
362 max_results: 100,
363 case_sensitive: false,
364 use_regex: false,
365 include_context: true,
366 context_lines: 2,
367 }
368 }
369}
370
/// One search hit: the matching chunk plus scoring and match details.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// The chunk that matched.
    pub chunk: ContentChunk,
    /// Relevance score for ranking.
    pub score: f32,
    /// Individual match locations within the chunk.
    pub matches: Vec<SearchMatch>,
    /// AST nodes related to the match.
    pub related_nodes: Vec<NodeId>,
}
383
/// A single match location inside a chunk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMatch {
    /// The matched text.
    pub text: String,
    /// Offset of the match within the chunk content.
    pub position: usize,
    /// 1-based line number of the match.
    pub line_number: usize,
    /// 1-based column number of the match.
    pub column_number: usize,
    /// Text immediately preceding the match, if context was requested.
    pub context_before: Option<String>,
    /// Text immediately following the match, if context was requested.
    pub context_after: Option<String>,
}
400
/// A filesystem change notification for an indexed file.
#[derive(Debug, Clone)]
pub struct ContentUpdate {
    /// File the change applies to.
    pub file_path: PathBuf,
    /// What happened to the file.
    pub update_kind: ContentUpdateKind,
    /// When the change was observed.
    pub timestamp: SystemTime,
}
411
/// The kind of filesystem change that occurred.
#[derive(Debug, Clone)]
pub enum ContentUpdateKind {
    /// File was newly created.
    Created,
    /// File contents changed.
    Modified,
    /// File was removed.
    Deleted,
    /// File moved; `old_path` is its previous location.
    Renamed {
        old_path: PathBuf,
    },
}
427
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ast::NodeKind;

    // ChunkId must be deterministic in its inputs, distinct across chunk
    // indices, and render as the full 16 bytes (32 hex chars).
    #[test]
    fn test_chunk_id_generation() {
        let file_path = PathBuf::from("test.md");
        let content_hash = [0u8; 32];

        let id1 = ChunkId::new(&file_path, 0, &content_hash);
        let id2 = ChunkId::new(&file_path, 0, &content_hash);
        let id3 = ChunkId::new(&file_path, 1, &content_hash);

        assert_eq!(id1, id2, "Same inputs should generate same ID");
        assert_ne!(
            id1, id3,
            "Different chunk index should generate different ID"
        );

        let hex = id1.to_hex();
        assert_eq!(hex.len(), 32, "Hex string should be 32 characters");
        assert!(
            hex.chars().all(|c| c.is_ascii_hexdigit()),
            "Should be valid hex"
        );
    }

    // Every ContentType variant must survive a serde JSON round trip with
    // its discriminant intact.
    #[test]
    fn test_content_types_serialization() {
        let test_cases = vec![
            ContentType::Code {
                language: Language::Python,
            },
            ContentType::Documentation {
                format: DocumentFormat::Markdown,
            },
            ContentType::Configuration {
                format: ConfigFormat::Json,
            },
            ContentType::Comment {
                language: Language::JavaScript,
                context: CommentContext::Function {
                    function_name: "test".to_string(),
                },
            },
            ContentType::PlainText,
        ];

        for content_type in test_cases {
            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();
            // Discriminant comparison checks the variant only, not the
            // payload fields.
            assert_eq!(
                std::mem::discriminant(&content_type),
                std::mem::discriminant(&deserialized),
                "Serialization roundtrip failed for: {:?}",
                content_type
            );
        }
    }

    // ContentChunk::new must store the inputs and eagerly extract
    // lowercased word tokens from the content.
    #[test]
    fn test_content_chunk_creation() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let content = "# Test Header\nSome content here.".to_string();
        let span = Span::new(0, content.len(), 1, 2, 1, 19);

        let chunk = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            content.clone(),
            span,
            0,
        );

        assert_eq!(chunk.file_path, file_path);
        assert_eq!(chunk.content, content);
        assert!(
            !chunk.tokens.is_empty(),
            "Should extract tokens from content"
        );
        assert!(
            chunk.tokens.contains(&"test".to_string()),
            "Should extract 'test' token"
        );
        assert!(
            chunk.tokens.contains(&"header".to_string()),
            "Should extract 'header' token"
        );
        assert!(
            chunk.tokens.contains(&"content".to_string()),
            "Should extract 'content' token"
        );
    }

    // Tokenization splits on non-word characters, lowercases, and drops
    // single-character tokens (so punctuation never appears).
    #[test]
    fn test_content_chunk_tokenization() {
        let file_path = PathBuf::from("test.py");
        let content_type = ContentType::Code {
            language: Language::Python,
        };
        let content = "def hello_world():\n    print('Hello, World!')".to_string();
        let span = Span::new(0, content.len(), 1, 2, 1, 26);

        let chunk = ContentChunk::new(file_path, content_type, content, span, 0);

        assert!(chunk.tokens.contains(&"def".to_string()));
        assert!(chunk.tokens.contains(&"hello".to_string()));
        assert!(chunk.tokens.contains(&"world".to_string()));
        assert!(chunk.tokens.contains(&"print".to_string()));
        assert!(
            !chunk.tokens.contains(&"(".to_string()),
            "Should filter out single chars"
        );
    }

    // Exercises the ContentNode accessors: chunk/AST-node insertion and
    // the token union.
    #[test]
    fn test_content_node_operations() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let mut node = ContentNode::new(file_path.clone(), content_type.clone());

        // Freshly created node starts empty.
        assert_eq!(node.file_path, file_path);
        assert_eq!(node.chunks.len(), 0);
        assert_eq!(node.ast_nodes.len(), 0);

        let chunk = ContentChunk::new(
            file_path.clone(),
            content_type,
            "Test content".to_string(),
            Span::new(0, 12, 1, 1, 1, 13),
            0,
        );
        node.add_chunk(chunk);

        assert_eq!(node.chunks.len(), 1);

        let node_id = NodeId::new(
            "test",
            &file_path,
            &Span::new(0, 5, 1, 1, 1, 6),
            &NodeKind::Function,
        );
        node.add_ast_node(node_id);

        assert_eq!(node.ast_nodes.len(), 1);
        assert_eq!(node.ast_nodes[0], node_id);

        // Tokens come from the single chunk added above.
        let tokens = node.get_all_tokens();
        assert!(tokens.contains(&"test".to_string()));
        assert!(tokens.contains(&"content".to_string()));
    }

    // Substring search over chunks, in both case-sensitive and
    // case-insensitive modes.
    #[test]
    fn test_content_node_search() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let mut node = ContentNode::new(file_path.clone(), content_type.clone());

        let chunk1 = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            "First test content".to_string(),
            Span::new(0, 18, 1, 1, 1, 19),
            0,
        );
        let chunk2 = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            "Second example content".to_string(),
            Span::new(19, 41, 2, 2, 1, 23),
            1,
        );
        node.add_chunk(chunk1);
        node.add_chunk(chunk2);

        // "TEST" only matches chunk1, and only when case is ignored.
        let results = node.search("TEST", false);
        assert_eq!(results.len(), 1, "Should find 'test' case-insensitively");

        let results = node.search("content", false);
        assert_eq!(results.len(), 2, "Should find 'content' in both chunks");

        let results = node.search("TEST", true);
        assert_eq!(results.len(), 0, "Should not find 'TEST' case-sensitively");

        let results = node.search("First", true);
        assert_eq!(results.len(), 1, "Should find exact case match");
    }

    // Pins the documented defaults of SearchQuery::default().
    #[test]
    fn test_search_query_default() {
        let query = SearchQuery::default();

        assert_eq!(query.query, "");
        assert_eq!(query.max_results, 100);
        assert!(!query.case_sensitive);
        assert!(!query.use_regex);
        assert!(query.include_context);
        assert_eq!(query.context_lines, 2);
        assert_eq!(query.content_types.len(), 3);
    }

    // A fully specified SearchQuery literal keeps every field as written.
    #[test]
    fn test_search_query_builder() {
        let query = SearchQuery {
            query: "test query".to_string(),
            content_types: vec![ContentType::Code {
                language: Language::Python,
            }],
            file_patterns: vec!["*.py".to_string()],
            exclude_patterns: vec!["test_*.py".to_string()],
            max_results: 25,
            case_sensitive: true,
            use_regex: true,
            include_context: false,
            context_lines: 5,
        };

        assert_eq!(query.query, "test query");
        assert_eq!(query.max_results, 25);
        assert!(query.case_sensitive);
        assert!(query.use_regex);
        assert!(!query.include_context);
        assert_eq!(query.context_lines, 5);
        assert_eq!(query.file_patterns, vec!["*.py"]);
        assert_eq!(query.exclude_patterns, vec!["test_*.py"]);
    }

    // ContentStats starts zeroed/empty and its counters are plain fields
    // that can be updated directly.
    #[test]
    fn test_content_stats_creation() {
        let mut stats = ContentStats::new();

        assert_eq!(stats.total_files, 0);
        assert_eq!(stats.total_chunks, 0);
        assert_eq!(stats.total_tokens, 0);
        assert!(stats.content_by_type.is_empty());
        assert!(stats.size_distribution.is_empty());

        stats.total_files = 10;
        stats.total_chunks = 50;
        stats.total_tokens = 1000;
        stats.content_by_type.insert("code:python".to_string(), 15);
        stats.size_distribution.insert("small".to_string(), 8);

        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.total_chunks, 50);
        assert_eq!(stats.total_tokens, 1000);
    }

    // SearchResult/SearchMatch are plain data carriers; check that a
    // constructed result retains its fields.
    #[test]
    fn test_search_result_structure() {
        let file_path = PathBuf::from("test.md");
        let chunk = ContentChunk::new(
            file_path,
            ContentType::Documentation {
                format: DocumentFormat::Markdown,
            },
            "Test content with query match".to_string(),
            Span::new(0, 29, 1, 1, 1, 30),
            0,
        );

        let search_match = SearchMatch {
            text: "query".to_string(),
            position: 18,
            line_number: 1,
            column_number: 19,
            context_before: Some("Test content with ".to_string()),
            context_after: Some(" match".to_string()),
        };

        let result = SearchResult {
            chunk: chunk.clone(),
            score: 0.85,
            matches: vec![search_match.clone()],
            related_nodes: vec![],
        };

        assert_eq!(result.score, 0.85);
        assert_eq!(result.matches.len(), 1);
        assert_eq!(result.matches[0].text, "query");
        assert_eq!(result.matches[0].position, 18);
        assert_eq!(result.chunk.content, chunk.content);
    }

    // Every CommentContext variant must survive serialization when nested
    // inside ContentType::Comment.
    #[test]
    fn test_comment_context_variants() {
        let contexts = vec![
            CommentContext::Function {
                function_name: "test_func".to_string(),
            },
            CommentContext::Class {
                class_name: "TestClass".to_string(),
            },
            CommentContext::Module,
            CommentContext::Inline,
            CommentContext::Block,
            CommentContext::Documentation,
        ];

        for context in contexts {
            let content_type = ContentType::Comment {
                language: Language::Python,
                context: context.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Comment {
                context: deserialized_context,
                ..
            } = deserialized
            {
                assert_eq!(
                    std::mem::discriminant(&context),
                    std::mem::discriminant(&deserialized_context),
                    "Context variant should match after serialization"
                );
            } else {
                panic!("Expected Comment content type");
            }
        }
    }

    // Every DocumentFormat variant must round-trip exactly (full equality,
    // not just discriminant) inside ContentType::Documentation.
    #[test]
    fn test_document_format_variants() {
        let formats = vec![
            DocumentFormat::Markdown,
            DocumentFormat::RestructuredText,
            DocumentFormat::AsciiDoc,
            DocumentFormat::PlainText,
            DocumentFormat::Html,
        ];

        for format in formats {
            let content_type = ContentType::Documentation {
                format: format.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Documentation {
                format: deserialized_format,
            } = deserialized
            {
                assert_eq!(
                    format, deserialized_format,
                    "Format should match after serialization"
                );
            } else {
                panic!("Expected Documentation content type");
            }
        }
    }

    // Every ConfigFormat variant must round-trip exactly inside
    // ContentType::Configuration.
    #[test]
    fn test_config_format_variants() {
        let formats = vec![
            ConfigFormat::Json,
            ConfigFormat::Yaml,
            ConfigFormat::Toml,
            ConfigFormat::Ini,
            ConfigFormat::Properties,
            ConfigFormat::Env,
            ConfigFormat::Xml,
        ];

        for format in formats {
            let content_type = ContentType::Configuration {
                format: format.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Configuration {
                format: deserialized_format,
            } = deserialized
            {
                assert_eq!(
                    format, deserialized_format,
                    "Format should match after serialization"
                );
            } else {
                panic!("Expected Configuration content type");
            }
        }
    }

    // Constructs one update per ContentUpdateKind variant and checks that
    // the payload of Renamed carries the old path through.
    #[test]
    fn test_content_update_kinds() {
        let file_path = PathBuf::from("test.md");
        let old_path = PathBuf::from("old_test.md");

        let updates = vec![
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Created,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Modified,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Deleted,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Renamed {
                    old_path: old_path.clone(),
                },
                timestamp: SystemTime::now(),
            },
        ];

        for update in updates {
            assert_eq!(update.file_path, file_path);
            assert!(update.timestamp <= SystemTime::now());

            // Payload-free variants need no further checks.
            match &update.update_kind {
                ContentUpdateKind::Created => { }
                ContentUpdateKind::Modified => { }
                ContentUpdateKind::Deleted => { }
                ContentUpdateKind::Renamed {
                    old_path: renamed_old_path,
                } => {
                    assert_eq!(renamed_old_path, &old_path);
                }
            }
        }
    }
}