1use crate::ast::{Language, NodeId, Span};
8use blake3::Hasher;
9use regex::Regex;
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12use std::path::{Path, PathBuf};
13use std::time::SystemTime;
14
15pub mod extractors;
16pub mod index;
17pub mod parsers;
18pub mod search;
19
/// Stable 128-bit identifier for a [`ContentChunk`], derived from the file
/// path, the chunk's index within the file, and a hash of its content
/// (see [`ChunkId::new`]). Equal inputs always produce the same id.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ChunkId([u8; 16]);
23
24impl ChunkId {
25 pub fn new(file_path: &Path, chunk_index: usize, content_hash: &[u8; 32]) -> Self {
27 let mut hasher = Hasher::new();
28 hasher.update(file_path.to_string_lossy().as_bytes());
29 hasher.update(&chunk_index.to_le_bytes());
30 hasher.update(content_hash);
31
32 let hash = hasher.finalize();
33 let mut id = [0u8; 16];
34 id.copy_from_slice(&hash.as_bytes()[..16]);
35 Self(id)
36 }
37
38 pub fn to_hex(&self) -> String {
40 hex::encode(self.0)
41 }
42}
43
/// Classification of an indexed piece of content.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ContentType {
    /// Source code in a given language.
    Code { language: Language },
    /// Documentation text (Markdown, reST, etc.).
    Documentation { format: DocumentFormat },
    /// Configuration data (JSON, YAML, etc.).
    Configuration { format: ConfigFormat },
    /// A comment found in source code, with the syntactic context it
    /// was attached to.
    Comment {
        language: Language,
        context: CommentContext,
    },
    /// Text that fits none of the other categories.
    PlainText,
}
62
/// Markup format of a documentation file.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum DocumentFormat {
    Markdown,
    RestructuredText,
    AsciiDoc,
    PlainText,
    Html,
}
73
/// Serialization format of a configuration file.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum ConfigFormat {
    Json,
    Yaml,
    Toml,
    Ini,
    Properties,
    Env,
    Xml,
}
86
/// Where in the surrounding source a comment was found.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum CommentContext {
    /// Attached to a named function.
    Function { function_name: String },
    /// Attached to a named class.
    Class { class_name: String },
    /// Module/file-level comment.
    Module,
    /// Trailing comment on a code line.
    Inline,
    /// Free-standing block comment.
    Block,
    /// Doc comment (e.g. docstring / `///`).
    Documentation,
}
104
/// A contiguous piece of indexed content from one file, together with its
/// location, its search tokens, and any AST nodes it relates to.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentChunk {
    // Deterministic id derived from path + index + content hash.
    pub id: ChunkId,
    pub content_type: ContentType,
    // Raw text of the chunk.
    pub content: String,
    // Byte/line location of the chunk within the file.
    pub span: Span,
    pub file_path: PathBuf,
    // Lowercased word tokens extracted from `content` for search.
    pub tokens: Vec<String>,
    // AST nodes associated with this chunk (deduplicated on insert).
    pub related_nodes: Vec<NodeId>,
    pub last_modified: SystemTime,
    // Free-form extra data; `Null` when unset.
    pub metadata: serde_json::Value,
}
127
128impl ContentChunk {
129 pub fn new(
131 file_path: PathBuf,
132 content_type: ContentType,
133 content: String,
134 span: Span,
135 chunk_index: usize,
136 ) -> Self {
137 let content_bytes = blake3::hash(content.as_bytes());
138 let id = ChunkId::new(&file_path, chunk_index, content_bytes.as_bytes());
139
140 Self {
141 id,
142 content_type,
143 content: content.clone(),
144 span,
145 file_path,
146 tokens: Self::tokenize_content(&content),
147 related_nodes: Vec::new(),
148 last_modified: SystemTime::now(),
149 metadata: serde_json::Value::Null,
150 }
151 }
152
153 fn tokenize_content(content: &str) -> Vec<String> {
155 let re = Regex::new(r"[^\w]+").unwrap();
157 re.split(content)
158 .filter(|s| !s.is_empty() && s.len() > 1) .map(|s| s.to_lowercase())
160 .collect()
161 }
162
163 pub fn add_related_node(&mut self, node_id: NodeId) {
165 if !self.related_nodes.contains(&node_id) {
166 self.related_nodes.push(node_id);
167 }
168 }
169
170 pub fn with_metadata(mut self, metadata: serde_json::Value) -> Self {
172 self.metadata = metadata;
173 self
174 }
175}
176
/// Per-file index entry: the file's chunks, associated AST nodes, and
/// bookkeeping about when it was indexed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentNode {
    pub file_path: PathBuf,
    pub content_type: ContentType,
    // Chunks extracted from this file, in insertion order.
    pub chunks: Vec<ContentChunk>,
    // AST nodes for this file (deduplicated on insert).
    pub ast_nodes: Vec<NodeId>,
    pub last_indexed: SystemTime,
    // NOTE(review): initialized to 0 by `new` and never updated in this
    // file — presumably set by the indexer; confirm against callers.
    pub file_size: usize,
    // Whether the file is watched for changes; `new` defaults this to true.
    pub is_monitored: bool,
}
195
196impl ContentNode {
197 pub fn new(file_path: PathBuf, content_type: ContentType) -> Self {
199 Self {
200 file_path,
201 content_type,
202 chunks: Vec::new(),
203 ast_nodes: Vec::new(),
204 last_indexed: SystemTime::now(),
205 file_size: 0,
206 is_monitored: true,
207 }
208 }
209
210 pub fn add_chunk(&mut self, chunk: ContentChunk) {
212 self.chunks.push(chunk);
213 }
214
215 pub fn add_ast_node(&mut self, node_id: NodeId) {
217 if !self.ast_nodes.contains(&node_id) {
218 self.ast_nodes.push(node_id);
219 }
220 }
221
222 pub fn get_all_tokens(&self) -> Vec<String> {
224 let mut all_tokens = Vec::new();
225 for chunk in &self.chunks {
226 all_tokens.extend(chunk.tokens.clone());
227 }
228 all_tokens.sort();
229 all_tokens.dedup();
230 all_tokens
231 }
232
233 pub fn search(&self, query: &str, case_sensitive: bool) -> Vec<&ContentChunk> {
235 let search_query = if case_sensitive {
236 query.to_string()
237 } else {
238 query.to_lowercase()
239 };
240
241 self.chunks
242 .iter()
243 .filter(|chunk| {
244 let content = if case_sensitive {
245 &chunk.content
246 } else {
247 &chunk.content.to_lowercase()
248 };
249 content.contains(&search_query)
250 })
251 .collect()
252 }
253}
254
/// Aggregate statistics over the content index, snapshotted at `computed_at`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ContentStats {
    pub total_files: usize,
    pub total_chunks: usize,
    pub total_tokens: usize,
    // Chunk counts keyed by a content-type label (e.g. "code:python"
    // in the tests below).
    pub content_by_type: HashMap<String, usize>,
    // File counts keyed by a size-bucket label (e.g. "small").
    pub size_distribution: HashMap<String, usize>,
    pub computed_at: SystemTime,
}
271
272impl ContentStats {
273 pub fn new() -> Self {
275 Self {
276 total_files: 0,
277 total_chunks: 0,
278 total_tokens: 0,
279 content_by_type: HashMap::new(),
280 size_distribution: HashMap::new(),
281 computed_at: SystemTime::now(),
282 }
283 }
284}
285
286impl Default for ContentStats {
287 fn default() -> Self {
288 Self::new()
289 }
290}
291
/// Parameters for a content search; see [`Default`] for the baseline values.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchQuery {
    // Text (or regex, when `use_regex`) to search for.
    pub query: String,
    // Restrict matches to these content types.
    pub content_types: Vec<ContentType>,
    // Glob-style include patterns (e.g. "*.py" in the tests below).
    pub file_patterns: Vec<String>,
    // Glob-style exclude patterns.
    pub exclude_patterns: Vec<String>,
    pub max_results: usize,
    pub case_sensitive: bool,
    pub use_regex: bool,
    // Whether to attach surrounding lines to each match.
    pub include_context: bool,
    // Number of context lines on each side when `include_context`.
    pub context_lines: usize,
}
314
315impl Default for SearchQuery {
316 fn default() -> Self {
317 Self {
318 query: String::new(),
319 content_types: vec![
320 ContentType::Code {
321 language: Language::Unknown,
322 },
323 ContentType::Documentation {
324 format: DocumentFormat::Markdown,
325 },
326 ContentType::Comment {
327 language: Language::Unknown,
328 context: CommentContext::Block,
329 },
330 ],
331 file_patterns: Vec::new(),
332 exclude_patterns: Vec::new(),
333 max_results: 100,
334 case_sensitive: false,
335 use_regex: false,
336 include_context: true,
337 context_lines: 2,
338 }
339 }
340}
341
/// One chunk that matched a search, with its score and match positions.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    pub chunk: ContentChunk,
    // Relevance score for ranking results.
    pub score: f32,
    // Individual match locations inside `chunk`.
    pub matches: Vec<SearchMatch>,
    pub related_nodes: Vec<NodeId>,
}
354
/// A single match location within a chunk's content.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchMatch {
    // The matched text itself.
    pub text: String,
    // Offset of the match within the chunk content.
    pub position: usize,
    pub line_number: usize,
    pub column_number: usize,
    // Text immediately before/after the match, when context was requested.
    pub context_before: Option<String>,
    pub context_after: Option<String>,
}
371
/// A filesystem change event affecting indexed content.
#[derive(Debug, Clone)]
pub struct ContentUpdate {
    pub file_path: PathBuf,
    pub update_kind: ContentUpdateKind,
    pub timestamp: SystemTime,
}
382
/// Kind of change described by a [`ContentUpdate`].
#[derive(Debug, Clone)]
pub enum ContentUpdateKind {
    Created,
    Modified,
    Deleted,
    /// File moved; `old_path` is its previous location.
    Renamed { old_path: PathBuf },
}
395
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ast::NodeKind;

    #[test]
    fn test_chunk_id_generation() {
        let file_path = PathBuf::from("test.md");
        let content_hash = [0u8; 32];

        let id1 = ChunkId::new(&file_path, 0, &content_hash);
        let id2 = ChunkId::new(&file_path, 0, &content_hash);
        let id3 = ChunkId::new(&file_path, 1, &content_hash);

        assert_eq!(id1, id2, "Same inputs should generate same ID");
        assert_ne!(
            id1, id3,
            "Different chunk index should generate different ID"
        );

        let hex = id1.to_hex();
        assert_eq!(hex.len(), 32, "Hex string should be 32 characters");
        assert!(
            hex.chars().all(|c| c.is_ascii_hexdigit()),
            "Should be valid hex"
        );
    }

    #[test]
    fn test_content_types_serialization() {
        let test_cases = vec![
            ContentType::Code {
                language: Language::Python,
            },
            ContentType::Documentation {
                format: DocumentFormat::Markdown,
            },
            ContentType::Configuration {
                format: ConfigFormat::Json,
            },
            ContentType::Comment {
                language: Language::JavaScript,
                context: CommentContext::Function {
                    function_name: "test".to_string(),
                },
            },
            ContentType::PlainText,
        ];

        for content_type in test_cases {
            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();
            assert_eq!(
                std::mem::discriminant(&content_type),
                std::mem::discriminant(&deserialized),
                "Serialization roundtrip failed for: {:?}",
                content_type
            );
        }
    }

    #[test]
    fn test_content_chunk_creation() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let content = "# Test Header\nSome content here.".to_string();
        let span = Span::new(0, content.len(), 1, 2, 1, 19);

        let chunk = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            content.clone(),
            span,
            0,
        );

        assert_eq!(chunk.file_path, file_path);
        assert_eq!(chunk.content, content);
        assert!(
            !chunk.tokens.is_empty(),
            "Should extract tokens from content"
        );
        assert!(
            chunk.tokens.contains(&"test".to_string()),
            "Should extract 'test' token"
        );
        assert!(
            chunk.tokens.contains(&"header".to_string()),
            "Should extract 'header' token"
        );
        assert!(
            chunk.tokens.contains(&"content".to_string()),
            "Should extract 'content' token"
        );
    }

    #[test]
    fn test_content_chunk_tokenization() {
        let file_path = PathBuf::from("test.py");
        let content_type = ContentType::Code {
            language: Language::Python,
        };
        let content = "def hello_world():\n    print('Hello, World!')".to_string();
        let span = Span::new(0, content.len(), 1, 2, 1, 26);

        let chunk = ContentChunk::new(file_path, content_type, content, span, 0);

        assert!(chunk.tokens.contains(&"def".to_string()));
        assert!(chunk.tokens.contains(&"hello".to_string()));
        assert!(chunk.tokens.contains(&"world".to_string()));
        assert!(chunk.tokens.contains(&"print".to_string()));
        assert!(
            !chunk.tokens.contains(&"(".to_string()),
            "Should filter out single chars"
        );
    }

    #[test]
    fn test_content_node_operations() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let mut node = ContentNode::new(file_path.clone(), content_type.clone());

        assert_eq!(node.file_path, file_path);
        assert_eq!(node.chunks.len(), 0);
        assert_eq!(node.ast_nodes.len(), 0);

        let chunk = ContentChunk::new(
            file_path.clone(),
            content_type,
            "Test content".to_string(),
            Span::new(0, 12, 1, 1, 1, 13),
            0,
        );
        node.add_chunk(chunk);

        assert_eq!(node.chunks.len(), 1);

        let node_id = NodeId::new(
            "test",
            &file_path,
            &Span::new(0, 5, 1, 1, 1, 6),
            &NodeKind::Function,
        );
        node.add_ast_node(node_id);

        assert_eq!(node.ast_nodes.len(), 1);
        assert_eq!(node.ast_nodes[0], node_id);

        let tokens = node.get_all_tokens();
        assert!(tokens.contains(&"test".to_string()));
        assert!(tokens.contains(&"content".to_string()));
    }

    #[test]
    fn test_content_node_search() {
        let file_path = PathBuf::from("test.md");
        let content_type = ContentType::Documentation {
            format: DocumentFormat::Markdown,
        };
        let mut node = ContentNode::new(file_path.clone(), content_type.clone());

        let chunk1 = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            "First test content".to_string(),
            Span::new(0, 18, 1, 1, 1, 19),
            0,
        );
        let chunk2 = ContentChunk::new(
            file_path.clone(),
            content_type.clone(),
            "Second example content".to_string(),
            Span::new(19, 41, 2, 2, 1, 23),
            1,
        );
        node.add_chunk(chunk1);
        node.add_chunk(chunk2);

        let results = node.search("TEST", false);
        assert_eq!(results.len(), 1, "Should find 'test' case-insensitively");

        let results = node.search("content", false);
        assert_eq!(results.len(), 2, "Should find 'content' in both chunks");

        let results = node.search("TEST", true);
        assert_eq!(results.len(), 0, "Should not find 'TEST' case-sensitively");

        let results = node.search("First", true);
        assert_eq!(results.len(), 1, "Should find exact case match");
    }

    #[test]
    fn test_search_query_default() {
        let query = SearchQuery::default();

        assert_eq!(query.query, "");
        assert_eq!(query.max_results, 100);
        assert!(!query.case_sensitive);
        assert!(!query.use_regex);
        assert!(query.include_context);
        assert_eq!(query.context_lines, 2);
        assert_eq!(query.content_types.len(), 3);
    }

    #[test]
    fn test_search_query_builder() {
        let query = SearchQuery {
            query: "test query".to_string(),
            content_types: vec![ContentType::Code {
                language: Language::Python,
            }],
            file_patterns: vec!["*.py".to_string()],
            exclude_patterns: vec!["test_*.py".to_string()],
            max_results: 25,
            case_sensitive: true,
            use_regex: true,
            include_context: false,
            context_lines: 5,
        };

        assert_eq!(query.query, "test query");
        assert_eq!(query.max_results, 25);
        assert!(query.case_sensitive);
        assert!(query.use_regex);
        assert!(!query.include_context);
        assert_eq!(query.context_lines, 5);
        assert_eq!(query.file_patterns, vec!["*.py"]);
        assert_eq!(query.exclude_patterns, vec!["test_*.py"]);
    }

    #[test]
    fn test_content_stats_creation() {
        let mut stats = ContentStats::new();

        assert_eq!(stats.total_files, 0);
        assert_eq!(stats.total_chunks, 0);
        assert_eq!(stats.total_tokens, 0);
        assert!(stats.content_by_type.is_empty());
        assert!(stats.size_distribution.is_empty());

        stats.total_files = 10;
        stats.total_chunks = 50;
        stats.total_tokens = 1000;
        stats.content_by_type.insert("code:python".to_string(), 15);
        stats.size_distribution.insert("small".to_string(), 8);

        assert_eq!(stats.total_files, 10);
        assert_eq!(stats.total_chunks, 50);
        assert_eq!(stats.total_tokens, 1000);
    }

    #[test]
    fn test_search_result_structure() {
        let file_path = PathBuf::from("test.md");
        let chunk = ContentChunk::new(
            file_path,
            ContentType::Documentation {
                format: DocumentFormat::Markdown,
            },
            "Test content with query match".to_string(),
            Span::new(0, 29, 1, 1, 1, 30),
            0,
        );

        let search_match = SearchMatch {
            text: "query".to_string(),
            position: 18,
            line_number: 1,
            column_number: 19,
            context_before: Some("Test content with ".to_string()),
            context_after: Some(" match".to_string()),
        };

        let result = SearchResult {
            chunk: chunk.clone(),
            score: 0.85,
            matches: vec![search_match.clone()],
            related_nodes: vec![],
        };

        // Exact f32 comparison is safe here: both sides come from the
        // same 0.85 literal, so the bit patterns are identical.
        assert_eq!(result.score, 0.85);
        assert_eq!(result.matches.len(), 1);
        assert_eq!(result.matches[0].text, "query");
        assert_eq!(result.matches[0].position, 18);
        assert_eq!(result.chunk.content, chunk.content);
    }

    #[test]
    fn test_comment_context_variants() {
        let contexts = vec![
            CommentContext::Function {
                function_name: "test_func".to_string(),
            },
            CommentContext::Class {
                class_name: "TestClass".to_string(),
            },
            CommentContext::Module,
            CommentContext::Inline,
            CommentContext::Block,
            CommentContext::Documentation,
        ];

        for context in contexts {
            let content_type = ContentType::Comment {
                language: Language::Python,
                context: context.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Comment {
                context: deserialized_context,
                ..
            } = deserialized
            {
                assert_eq!(
                    std::mem::discriminant(&context),
                    std::mem::discriminant(&deserialized_context),
                    "Context variant should match after serialization"
                );
            } else {
                panic!("Expected Comment content type");
            }
        }
    }

    #[test]
    fn test_document_format_variants() {
        let formats = vec![
            DocumentFormat::Markdown,
            DocumentFormat::RestructuredText,
            DocumentFormat::AsciiDoc,
            DocumentFormat::PlainText,
            DocumentFormat::Html,
        ];

        for format in formats {
            let content_type = ContentType::Documentation {
                format: format.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Documentation {
                format: deserialized_format,
            } = deserialized
            {
                assert_eq!(
                    format, deserialized_format,
                    "Format should match after serialization"
                );
            } else {
                panic!("Expected Documentation content type");
            }
        }
    }

    #[test]
    fn test_config_format_variants() {
        let formats = vec![
            ConfigFormat::Json,
            ConfigFormat::Yaml,
            ConfigFormat::Toml,
            ConfigFormat::Ini,
            ConfigFormat::Properties,
            ConfigFormat::Env,
            ConfigFormat::Xml,
        ];

        for format in formats {
            let content_type = ContentType::Configuration {
                format: format.clone(),
            };

            let json = serde_json::to_string(&content_type).unwrap();
            let deserialized: ContentType = serde_json::from_str(&json).unwrap();

            if let ContentType::Configuration {
                format: deserialized_format,
            } = deserialized
            {
                assert_eq!(
                    format, deserialized_format,
                    "Format should match after serialization"
                );
            } else {
                panic!("Expected Configuration content type");
            }
        }
    }

    #[test]
    fn test_content_update_kinds() {
        let file_path = PathBuf::from("test.md");
        let old_path = PathBuf::from("old_test.md");

        let updates = vec![
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Created,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Modified,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Deleted,
                timestamp: SystemTime::now(),
            },
            ContentUpdate {
                file_path: file_path.clone(),
                update_kind: ContentUpdateKind::Renamed {
                    old_path: old_path.clone(),
                },
                timestamp: SystemTime::now(),
            },
        ];

        for update in updates {
            assert_eq!(update.file_path, file_path);
            assert!(update.timestamp <= SystemTime::now());

            // Only `Renamed` carries a payload worth checking; the previous
            // `assert!(true)` arms on the unit variants were no-ops, so they
            // are grouped into a single empty arm (still exhaustive).
            match &update.update_kind {
                ContentUpdateKind::Renamed {
                    old_path: renamed_old_path,
                } => {
                    assert_eq!(renamed_old_path, &old_path);
                }
                ContentUpdateKind::Created
                | ContentUpdateKind::Modified
                | ContentUpdateKind::Deleted => {}
            }
        }
    }
}