1use super::{ConfigFormat, ContentChunk, ContentNode, ContentType, DocumentFormat};
7use crate::ast::Span;
8use anyhow::{anyhow, Result};
9use regex::Regex;
10use serde_json::Value;
11use std::path::Path;
12
/// Dispatches parsing of documentation, configuration, and plain-text files
/// to the matching format-specific sub-parser.
pub struct DocumentParser {
    /// Handles Markdown documents (headers, fenced code blocks, paragraphs).
    markdown_parser: MarkdownParser,
    /// Handles configuration formats (JSON, YAML, TOML, INI, properties, env, XML).
    config_parser: ConfigParser,
    /// Handles plain text and other line-oriented document formats.
    text_parser: TextParser,
}
22
23impl DocumentParser {
24 pub fn new() -> Self {
26 Self {
27 markdown_parser: MarkdownParser::new(),
28 config_parser: ConfigParser::new(),
29 text_parser: TextParser::new(),
30 }
31 }
32
33 pub fn parse_file(&self, file_path: &Path, content: &str) -> Result<ContentNode> {
35 let content_type = self.detect_content_type(file_path)?;
36 let mut node = ContentNode::new(file_path.to_path_buf(), content_type.clone());
37
38 let chunks = match content_type {
39 ContentType::Documentation { format } => match format {
40 DocumentFormat::Markdown => self.markdown_parser.parse(file_path, content)?,
41 DocumentFormat::PlainText
42 | DocumentFormat::RestructuredText
43 | DocumentFormat::AsciiDoc
44 | DocumentFormat::Html => self.text_parser.parse(file_path, content, format)?,
45 },
46 ContentType::Configuration { format } => {
47 self.config_parser.parse(file_path, content, format)?
48 }
49 ContentType::PlainText => {
50 self.text_parser
51 .parse(file_path, content, DocumentFormat::PlainText)?
52 }
53 _ => return Err(anyhow!("Unsupported content type for document parser")),
54 };
55
56 for chunk in chunks {
57 node.add_chunk(chunk);
58 }
59 node.file_size = content.len();
60
61 Ok(node)
62 }
63
64 fn detect_content_type(&self, file_path: &Path) -> Result<ContentType> {
66 if let Some(file_name) = file_path.file_name().and_then(|n| n.to_str()) {
68 if file_name == ".env" {
69 return Ok(ContentType::Configuration {
70 format: ConfigFormat::Env,
71 });
72 }
73 }
74
75 let extension = file_path
76 .extension()
77 .and_then(|ext| ext.to_str())
78 .unwrap_or("")
79 .to_lowercase();
80
81 match extension.as_str() {
82 "md" | "markdown" => Ok(ContentType::Documentation {
83 format: DocumentFormat::Markdown,
84 }),
85 "rst" => Ok(ContentType::Documentation {
86 format: DocumentFormat::RestructuredText,
87 }),
88 "adoc" | "asciidoc" => Ok(ContentType::Documentation {
89 format: DocumentFormat::AsciiDoc,
90 }),
91 "html" | "htm" => Ok(ContentType::Documentation {
92 format: DocumentFormat::Html,
93 }),
94 "txt" | "text" => Ok(ContentType::Documentation {
95 format: DocumentFormat::PlainText,
96 }),
97 "json" => Ok(ContentType::Configuration {
98 format: ConfigFormat::Json,
99 }),
100 "yaml" | "yml" => Ok(ContentType::Configuration {
101 format: ConfigFormat::Yaml,
102 }),
103 "toml" => Ok(ContentType::Configuration {
104 format: ConfigFormat::Toml,
105 }),
106 "ini" => Ok(ContentType::Configuration {
107 format: ConfigFormat::Ini,
108 }),
109 "properties" => Ok(ContentType::Configuration {
110 format: ConfigFormat::Properties,
111 }),
112 "env" => Ok(ContentType::Configuration {
113 format: ConfigFormat::Env,
114 }),
115 "xml" => Ok(ContentType::Configuration {
116 format: ConfigFormat::Xml,
117 }),
118 _ => Ok(ContentType::PlainText),
119 }
120 }
121}
122
123impl Default for DocumentParser {
124 fn default() -> Self {
125 Self::new()
126 }
127}
128
/// Extracts headers, fenced code blocks, and paragraphs from Markdown text
/// using pre-compiled regular expressions.
pub struct MarkdownParser {
    /// Matches ATX headers (`#` through `######`) at the start of a line.
    header_regex: Regex,
    /// Matches ``` fenced code blocks with an optional language tag.
    code_block_regex: Regex,
    /// Matches inline code spans; compiled but not used by `parse` yet.
    #[allow(dead_code)]
    inline_code_regex: Regex,
    /// Matches `[text](url)` links; compiled but not used by `parse` yet.
    #[allow(dead_code)]
    link_regex: Regex,
    /// Matches `-`/`*`/`+` list items; compiled but not used by `parse` yet.
    #[allow(dead_code)]
    list_regex: Regex,
}
145
146impl MarkdownParser {
147 pub fn new() -> Self {
149 Self {
150 header_regex: Regex::new(r"(?m)^(#{1,6})\s+(.+)$").unwrap(),
151 code_block_regex: Regex::new(r"```(\w+)?\n([\s\S]*?)\n```").unwrap(),
152 inline_code_regex: Regex::new(r"`([^`]+)`").unwrap(),
153 link_regex: Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap(),
154 list_regex: Regex::new(r"(?m)^[\s]*[-*+]\s+(.+)$").unwrap(),
155 }
156 }
157
158 pub fn parse(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
160 let mut chunks = Vec::new();
161 let lines: Vec<&str> = content.lines().collect();
162 let mut _current_line = 0;
163 let mut chunk_index = 0;
164
165 for (line_idx, line) in lines.iter().enumerate() {
167 if let Some(captures) = self.header_regex.captures(line) {
168 let level = captures.get(1).unwrap().as_str().len();
169 let header_text = captures.get(2).unwrap().as_str();
170
171 let span = self.calculate_line_span(line_idx, line, content);
172 let chunk = ContentChunk::new(
173 file_path.to_path_buf(),
174 ContentType::Documentation {
175 format: DocumentFormat::Markdown,
176 },
177 header_text.to_string(),
178 span,
179 chunk_index,
180 )
181 .with_metadata(serde_json::json!({
182 "header_level": level,
183 "element_type": "header"
184 }));
185
186 chunks.push(chunk);
187 chunk_index += 1;
188 }
189 }
190
191 for captures in self.code_block_regex.captures_iter(content) {
193 let language = captures.get(1).map(|m| m.as_str()).unwrap_or("text");
194 let code_content = captures.get(2).unwrap().as_str();
195 let full_match = captures.get(0).unwrap();
196
197 let span = self.calculate_match_span(&full_match, content);
198 let chunk = ContentChunk::new(
199 file_path.to_path_buf(),
200 ContentType::Documentation {
201 format: DocumentFormat::Markdown,
202 },
203 code_content.to_string(),
204 span,
205 chunk_index,
206 )
207 .with_metadata(serde_json::json!({
208 "language": language,
209 "element_type": "code_block"
210 }));
211
212 chunks.push(chunk);
213 chunk_index += 1;
214 }
215
216 let mut paragraph_start = 0;
218 let mut in_paragraph = false;
219 let mut paragraph_lines = Vec::new();
220
221 for (line_idx, line) in lines.iter().enumerate() {
222 let line_trimmed = line.trim();
223
224 if self.header_regex.is_match(line)
226 || line_trimmed.starts_with("```")
227 || line_trimmed.is_empty()
228 {
229 if in_paragraph && !paragraph_lines.is_empty() {
231 let paragraph_text = paragraph_lines.join("\n");
232 let span =
233 self.calculate_paragraph_span(paragraph_start, line_idx - 1, content);
234
235 let chunk = ContentChunk::new(
236 file_path.to_path_buf(),
237 ContentType::Documentation {
238 format: DocumentFormat::Markdown,
239 },
240 paragraph_text,
241 span,
242 chunk_index,
243 )
244 .with_metadata(serde_json::json!({
245 "element_type": "paragraph"
246 }));
247
248 chunks.push(chunk);
249 chunk_index += 1;
250 }
251
252 in_paragraph = false;
253 paragraph_lines.clear();
254 continue;
255 }
256
257 if !in_paragraph {
259 in_paragraph = true;
260 paragraph_start = line_idx;
261 }
262 paragraph_lines.push(line_trimmed);
263 }
264
265 if in_paragraph && !paragraph_lines.is_empty() {
267 let paragraph_text = paragraph_lines.join("\n");
268 let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, content);
269
270 let chunk = ContentChunk::new(
271 file_path.to_path_buf(),
272 ContentType::Documentation {
273 format: DocumentFormat::Markdown,
274 },
275 paragraph_text,
276 span,
277 chunk_index,
278 )
279 .with_metadata(serde_json::json!({
280 "element_type": "paragraph"
281 }));
282
283 chunks.push(chunk);
284 }
285
286 Ok(chunks)
287 }
288
289 fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
291 let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
292 let start_byte = lines_before;
293 let end_byte = start_byte + line.len();
294
295 Span::new(
296 start_byte,
297 end_byte,
298 line_idx + 1,
299 line_idx + 1,
300 1,
301 line.len() + 1,
302 )
303 }
304
305 fn calculate_match_span(&self, match_obj: ®ex::Match, content: &str) -> Span {
307 let start_byte = match_obj.start();
308 let end_byte = match_obj.end();
309
310 let content_before = &content[..start_byte];
312 let start_line = content_before.lines().count();
313 let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
314
315 let match_content = match_obj.as_str();
317 let lines_in_match = match_content.lines().count();
318 let end_line = start_line + lines_in_match.saturating_sub(1);
319 let end_column = if lines_in_match > 1 {
320 match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
321 } else {
322 start_column + match_content.len()
323 };
324
325 Span::new(
326 start_byte,
327 end_byte,
328 start_line.max(1),
329 end_line.max(1),
330 start_column,
331 end_column,
332 )
333 }
334
335 fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, content: &str) -> Span {
337 let lines: Vec<&str> = content.lines().collect();
338 let start_byte: usize = lines
339 .iter()
340 .take(start_line)
341 .map(|l| l.len() + 1)
342 .sum::<usize>();
343 let end_byte: usize = lines
344 .iter()
345 .take(end_line + 1)
346 .map(|l| l.len() + 1)
347 .sum::<usize>()
348 - 1;
349
350 Span::new(
351 start_byte,
352 end_byte,
353 start_line + 1,
354 end_line + 1,
355 1,
356 lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
357 )
358 }
359}
360
361impl Default for MarkdownParser {
362 fn default() -> Self {
363 Self::new()
364 }
365}
366
/// Parses configuration files (JSON, YAML, TOML, INI, properties, env, XML)
/// into searchable key/value chunks. Stateless.
pub struct ConfigParser;
369
370impl ConfigParser {
371 pub fn new() -> Self {
373 Self
374 }
375
376 pub fn parse(
378 &self,
379 file_path: &Path,
380 content: &str,
381 format: ConfigFormat,
382 ) -> Result<Vec<ContentChunk>> {
383 match format {
384 ConfigFormat::Json => self.parse_json(file_path, content),
385 ConfigFormat::Yaml => self.parse_yaml(file_path, content),
386 ConfigFormat::Toml => self.parse_toml(file_path, content),
387 ConfigFormat::Ini => self.parse_ini(file_path, content),
388 ConfigFormat::Properties => self.parse_properties(file_path, content),
389 ConfigFormat::Env => self.parse_env(file_path, content),
390 ConfigFormat::Xml => self.parse_xml(file_path, content),
391 }
392 }
393
394 fn parse_json(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
396 let mut chunks = Vec::new();
397
398 match serde_json::from_str::<Value>(content) {
400 Ok(value) => {
401 self.extract_json_values(&value, file_path, content, &mut chunks, 0, "");
403 }
404 Err(_) => {
405 chunks.push(
407 ContentChunk::new(
408 file_path.to_path_buf(),
409 ContentType::Configuration {
410 format: ConfigFormat::Json,
411 },
412 content.to_string(),
413 Span::new(
414 0,
415 content.len(),
416 1,
417 content.lines().count(),
418 1,
419 content.lines().last().map(|l| l.len()).unwrap_or(0),
420 ),
421 0,
422 )
423 .with_metadata(serde_json::json!({
424 "parse_error": true,
425 "config_type": "json"
426 })),
427 );
428 }
429 }
430
431 Ok(chunks)
432 }
433
434 #[allow(clippy::only_used_in_recursion)]
436 fn extract_json_values(
437 &self,
438 value: &Value,
439 file_path: &Path,
440 content: &str,
441 chunks: &mut Vec<ContentChunk>,
442 chunk_index: usize,
443 key_path: &str,
444 ) {
445 match value {
446 Value::Object(map) => {
447 for (key, val) in map {
448 let new_path = if key_path.is_empty() {
449 key.clone()
450 } else {
451 format!("{}.{}", key_path, key)
452 };
453 self.extract_json_values(
454 val,
455 file_path,
456 content,
457 chunks,
458 chunks.len(),
459 &new_path,
460 );
461 }
462 }
463 Value::Array(arr) => {
464 for (index, val) in arr.iter().enumerate() {
465 let new_path = format!("{}[{}]", key_path, index);
466 self.extract_json_values(
467 val,
468 file_path,
469 content,
470 chunks,
471 chunks.len(),
472 &new_path,
473 );
474 }
475 }
476 Value::String(_) | Value::Number(_) | Value::Bool(_) => {
477 let value_str = match value {
479 Value::String(s) => s.clone(),
480 _ => value.to_string(),
481 };
482
483 let searchable_content = if key_path.is_empty() {
485 value_str.clone()
486 } else {
487 format!("{}: {}", key_path, value_str)
488 };
489
490 if let Some(position) = content.find(&value_str) {
492 let lines_before = content[..position].lines().count();
493 let line_start = content[..position].rfind('\n').map(|i| i + 1).unwrap_or(0);
494 let column = position - line_start + 1;
495
496 let span = Span::new(
497 position,
498 position + value_str.len(),
499 lines_before.max(1),
500 lines_before.max(1),
501 column,
502 column + value_str.len(),
503 );
504
505 let chunk = ContentChunk::new(
506 file_path.to_path_buf(),
507 ContentType::Configuration {
508 format: ConfigFormat::Json,
509 },
510 searchable_content,
511 span,
512 chunk_index,
513 )
514 .with_metadata(serde_json::json!({
515 "key_path": key_path,
516 "value": value_str,
517 "value_type": match value {
518 Value::String(_) => "string",
519 Value::Number(_) => "number",
520 Value::Bool(_) => "boolean",
521 _ => "unknown"
522 },
523 "config_type": "json"
524 }));
525
526 chunks.push(chunk);
527 }
528 }
529 Value::Null => {} }
531 }
532
533 fn parse_yaml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
535 let mut chunks = Vec::new();
537 let lines: Vec<&str> = content.lines().collect();
538
539 for (line_idx, line) in lines.iter().enumerate() {
540 let trimmed = line.trim();
541 if trimmed.is_empty() || trimmed.starts_with('#') {
542 continue;
543 }
544
545 if let Some(colon_pos) = trimmed.find(':') {
547 let key = trimmed[..colon_pos].trim();
548 let value = trimmed[colon_pos + 1..].trim();
549
550 if !value.is_empty() {
551 let span = self.calculate_line_span(line_idx, line, content);
552 let chunk = ContentChunk::new(
553 file_path.to_path_buf(),
554 ContentType::Configuration {
555 format: ConfigFormat::Yaml,
556 },
557 format!("{}: {}", key, value),
558 span,
559 chunks.len(),
560 )
561 .with_metadata(serde_json::json!({
562 "key": key,
563 "value": value,
564 "config_type": "yaml"
565 }));
566
567 chunks.push(chunk);
568 }
569 }
570 }
571
572 Ok(chunks)
573 }
574
575 fn parse_toml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
577 let mut chunks = Vec::new();
579 let lines: Vec<&str> = content.lines().collect();
580
581 for (line_idx, line) in lines.iter().enumerate() {
582 let trimmed = line.trim();
583 if trimmed.is_empty() || trimmed.starts_with('#') {
584 continue;
585 }
586
587 if trimmed.starts_with('[') && trimmed.ends_with(']') {
589 let section = &trimmed[1..trimmed.len() - 1];
590 let span = self.calculate_line_span(line_idx, line, content);
591 let chunk = ContentChunk::new(
592 file_path.to_path_buf(),
593 ContentType::Configuration {
594 format: ConfigFormat::Toml,
595 },
596 section.to_string(),
597 span,
598 chunks.len(),
599 )
600 .with_metadata(serde_json::json!({
601 "element_type": "section",
602 "section_name": section,
603 "config_type": "toml"
604 }));
605
606 chunks.push(chunk);
607 continue;
608 }
609
610 if let Some(eq_pos) = trimmed.find('=') {
612 let key = trimmed[..eq_pos].trim();
613 let value = trimmed[eq_pos + 1..].trim();
614
615 let span = self.calculate_line_span(line_idx, line, content);
616 let chunk = ContentChunk::new(
617 file_path.to_path_buf(),
618 ContentType::Configuration {
619 format: ConfigFormat::Toml,
620 },
621 format!("{} = {}", key, value),
622 span,
623 chunks.len(),
624 )
625 .with_metadata(serde_json::json!({
626 "key": key,
627 "value": value,
628 "config_type": "toml"
629 }));
630
631 chunks.push(chunk);
632 }
633 }
634
635 Ok(chunks)
636 }
637
638 fn parse_ini(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
640 self.parse_key_value_format(file_path, content, ConfigFormat::Ini, "ini")
642 }
643
644 fn parse_properties(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
646 self.parse_key_value_format(file_path, content, ConfigFormat::Properties, "properties")
647 }
648
649 fn parse_env(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
651 self.parse_key_value_format(file_path, content, ConfigFormat::Env, "env")
652 }
653
654 fn parse_xml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
656 let tag_regex = Regex::new(r"<([^/>]+)>([^<]+)</[^>]+>").unwrap();
658 let mut chunks = Vec::new();
659
660 for (idx, captures) in tag_regex.captures_iter(content).enumerate() {
661 let tag_name = captures.get(1).unwrap().as_str();
662 let tag_content = captures.get(2).unwrap().as_str().trim();
663
664 if !tag_content.is_empty() {
665 let full_match = captures.get(0).unwrap();
666 let span = self.calculate_match_span(&full_match, content);
667
668 let chunk = ContentChunk::new(
669 file_path.to_path_buf(),
670 ContentType::Configuration {
671 format: ConfigFormat::Xml,
672 },
673 tag_content.to_string(),
674 span,
675 idx,
676 )
677 .with_metadata(serde_json::json!({
678 "tag_name": tag_name,
679 "config_type": "xml"
680 }));
681
682 chunks.push(chunk);
683 }
684 }
685
686 Ok(chunks)
687 }
688
689 fn parse_key_value_format(
691 &self,
692 file_path: &Path,
693 content: &str,
694 format: ConfigFormat,
695 format_name: &str,
696 ) -> Result<Vec<ContentChunk>> {
697 let mut chunks = Vec::new();
698 let lines: Vec<&str> = content.lines().collect();
699
700 for (line_idx, line) in lines.iter().enumerate() {
701 let trimmed = line.trim();
702 if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with(';') {
703 continue;
704 }
705
706 if let Some(eq_pos) = trimmed.find('=') {
708 let key = trimmed[..eq_pos].trim();
709 let value = trimmed[eq_pos + 1..].trim();
710
711 let span = self.calculate_line_span(line_idx, line, content);
712 let chunk = ContentChunk::new(
713 file_path.to_path_buf(),
714 ContentType::Configuration {
715 format: format.clone(),
716 },
717 format!("{}={}", key, value),
718 span,
719 chunks.len(),
720 )
721 .with_metadata(serde_json::json!({
722 "key": key,
723 "value": value,
724 "config_type": format_name
725 }));
726
727 chunks.push(chunk);
728 }
729 }
730
731 Ok(chunks)
732 }
733
734 fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
736 let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
737 let start_byte = lines_before;
738 let end_byte = start_byte + line.len();
739
740 Span::new(
741 start_byte,
742 end_byte,
743 line_idx + 1,
744 line_idx + 1,
745 1,
746 line.len() + 1,
747 )
748 }
749
750 fn calculate_match_span(&self, match_obj: ®ex::Match, content: &str) -> Span {
752 let start_byte = match_obj.start();
753 let end_byte = match_obj.end();
754
755 let content_before = &content[..start_byte];
756 let start_line = content_before.lines().count();
757 let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
758
759 let match_content = match_obj.as_str();
760 let lines_in_match = match_content.lines().count();
761 let end_line = start_line + lines_in_match.saturating_sub(1);
762 let end_column = if lines_in_match > 1 {
763 match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
764 } else {
765 start_column + match_content.len()
766 };
767
768 Span::new(
769 start_byte,
770 end_byte,
771 start_line.max(1),
772 end_line.max(1),
773 start_column,
774 end_column,
775 )
776 }
777}
778
779impl Default for ConfigParser {
780 fn default() -> Self {
781 Self::new()
782 }
783}
784
/// Splits plain-text documents into paragraph chunks separated by blank
/// lines. Stateless.
pub struct TextParser;
787
788impl TextParser {
789 pub fn new() -> Self {
791 Self
792 }
793
794 pub fn parse(
796 &self,
797 file_path: &Path,
798 content: &str,
799 format: DocumentFormat,
800 ) -> Result<Vec<ContentChunk>> {
801 let mut chunks = Vec::new();
802 let lines: Vec<&str> = content.lines().collect();
803
804 let mut paragraph_start = 0;
805 let mut paragraph_lines = Vec::new();
806 let mut chunk_index = 0;
807
808 for (line_idx, line) in lines.iter().enumerate() {
809 let trimmed = line.trim();
810
811 if trimmed.is_empty() {
812 if !paragraph_lines.is_empty() {
814 let paragraph_text = paragraph_lines.join("\n");
815 let span = self.calculate_paragraph_span(paragraph_start, line_idx - 1, &lines);
816
817 let chunk = ContentChunk::new(
818 file_path.to_path_buf(),
819 ContentType::Documentation {
820 format: format.clone(),
821 },
822 paragraph_text,
823 span,
824 chunk_index,
825 )
826 .with_metadata(serde_json::json!({
827 "element_type": "paragraph",
828 "line_count": paragraph_lines.len()
829 }));
830
831 chunks.push(chunk);
832 chunk_index += 1;
833 paragraph_lines.clear();
834 }
835 continue;
836 }
837
838 if paragraph_lines.is_empty() {
840 paragraph_start = line_idx;
841 }
842 paragraph_lines.push(trimmed);
843 }
844
845 if !paragraph_lines.is_empty() {
847 let paragraph_text = paragraph_lines.join("\n");
848 let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, &lines);
849
850 let chunk = ContentChunk::new(
851 file_path.to_path_buf(),
852 ContentType::Documentation { format },
853 paragraph_text,
854 span,
855 chunk_index,
856 )
857 .with_metadata(serde_json::json!({
858 "element_type": "paragraph",
859 "line_count": paragraph_lines.len()
860 }));
861
862 chunks.push(chunk);
863 }
864
865 Ok(chunks)
866 }
867
868 fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, lines: &[&str]) -> Span {
870 let start_byte: usize = lines
871 .iter()
872 .take(start_line)
873 .map(|l| l.len() + 1)
874 .sum::<usize>();
875 let end_byte: usize = lines
876 .iter()
877 .take(end_line + 1)
878 .map(|l| l.len() + 1)
879 .sum::<usize>()
880 - 1;
881
882 Span::new(
883 start_byte,
884 end_byte,
885 start_line + 1,
886 end_line + 1,
887 1,
888 lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
889 )
890 }
891}
892
893impl Default for TextParser {
894 fn default() -> Self {
895 Self::new()
896 }
897}
898
899#[cfg(test)]
900mod tests {
901 use super::*;
902
903 #[test]
904 fn test_document_parser_creation() {
905 let parser = DocumentParser::new();
906 assert!(true);
908 }
909
910 #[test]
911 fn test_content_type_detection() {
912 let parser = DocumentParser::new();
913
914 let test_cases = vec![
915 (
916 "test.md",
917 ContentType::Documentation {
918 format: DocumentFormat::Markdown,
919 },
920 ),
921 (
922 "README.markdown",
923 ContentType::Documentation {
924 format: DocumentFormat::Markdown,
925 },
926 ),
927 (
928 "doc.rst",
929 ContentType::Documentation {
930 format: DocumentFormat::RestructuredText,
931 },
932 ),
933 (
934 "manual.adoc",
935 ContentType::Documentation {
936 format: DocumentFormat::AsciiDoc,
937 },
938 ),
939 (
940 "page.html",
941 ContentType::Documentation {
942 format: DocumentFormat::Html,
943 },
944 ),
945 (
946 "notes.txt",
947 ContentType::Documentation {
948 format: DocumentFormat::PlainText,
949 },
950 ),
951 (
952 "config.json",
953 ContentType::Configuration {
954 format: ConfigFormat::Json,
955 },
956 ),
957 (
958 "config.yaml",
959 ContentType::Configuration {
960 format: ConfigFormat::Yaml,
961 },
962 ),
963 (
964 "config.yml",
965 ContentType::Configuration {
966 format: ConfigFormat::Yaml,
967 },
968 ),
969 (
970 "Cargo.toml",
971 ContentType::Configuration {
972 format: ConfigFormat::Toml,
973 },
974 ),
975 (
976 "settings.ini",
977 ContentType::Configuration {
978 format: ConfigFormat::Ini,
979 },
980 ),
981 (
982 "app.properties",
983 ContentType::Configuration {
984 format: ConfigFormat::Properties,
985 },
986 ),
987 (
988 ".env",
989 ContentType::Configuration {
990 format: ConfigFormat::Env,
991 },
992 ),
993 (
994 "config.xml",
995 ContentType::Configuration {
996 format: ConfigFormat::Xml,
997 },
998 ),
999 ("unknown.xyz", ContentType::PlainText),
1000 ];
1001
1002 for (filename, expected_type) in test_cases {
1003 let path = Path::new(filename);
1004 let detected_type = parser.detect_content_type(path).unwrap();
1005 assert_eq!(
1006 std::mem::discriminant(&detected_type),
1007 std::mem::discriminant(&expected_type),
1008 "Failed for file: {}",
1009 filename
1010 );
1011 }
1012 }
1013
1014 #[test]
1015 fn test_markdown_parser_headers() {
1016 let parser = MarkdownParser::new();
1017 let content = r#"# Main Title
1018Some content here.
1019
1020## Secondary Title
1021More content.
1022
1023### Subsection
1024Even more content.
1025
1026#### Level 4
1027Content at level 4.
1028
1029##### Level 5
1030Content at level 5.
1031
1032###### Level 6
1033Content at level 6."#;
1034
1035 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1036
1037 let headers: Vec<_> = chunks
1039 .iter()
1040 .filter(|chunk| {
1041 if let Some(metadata) = chunk.metadata.as_object() {
1042 metadata.get("element_type").and_then(|v| v.as_str()) == Some("header")
1043 } else {
1044 false
1045 }
1046 })
1047 .collect();
1048
1049 assert_eq!(headers.len(), 6, "Should find 6 headers");
1050
1051 let header_levels: Vec<_> = headers
1053 .iter()
1054 .filter_map(|chunk| {
1055 chunk
1056 .metadata
1057 .as_object()
1058 .and_then(|m| m.get("header_level"))
1059 .and_then(|v| v.as_u64())
1060 })
1061 .collect();
1062
1063 assert_eq!(header_levels, vec![1, 2, 3, 4, 5, 6]);
1064 assert_eq!(headers[0].content, "Main Title");
1065 assert_eq!(headers[1].content, "Secondary Title");
1066 assert_eq!(headers[2].content, "Subsection");
1067 }
1068
1069 #[test]
1070 fn test_markdown_parser_code_blocks() {
1071 let parser = MarkdownParser::new();
1072 let content = r#"Here is some Python code:
1073
1074```python
1075def hello_world():
1076 print("Hello, World!")
1077 return "success"
1078```
1079
1080And here is some JavaScript:
1081
1082```javascript
1083function greet(name) {
1084 console.log(`Hello, ${name}!`);
1085}
1086```
1087
1088And a generic code block:
1089
1090```
1091generic code here
1092no language specified
1093```"#;
1094
1095 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1096
1097 let code_blocks: Vec<_> = chunks
1098 .iter()
1099 .filter(|chunk| {
1100 if let Some(metadata) = chunk.metadata.as_object() {
1101 metadata.get("element_type").and_then(|v| v.as_str()) == Some("code_block")
1102 } else {
1103 false
1104 }
1105 })
1106 .collect();
1107
1108 assert_eq!(code_blocks.len(), 3, "Should find 3 code blocks");
1109
1110 assert!(code_blocks[0].content.contains("def hello_world"));
1112 assert!(code_blocks[0].content.contains("print(\"Hello, World!\")"));
1113 let python_lang = code_blocks[0]
1114 .metadata
1115 .as_object()
1116 .unwrap()
1117 .get("language")
1118 .unwrap()
1119 .as_str()
1120 .unwrap();
1121 assert_eq!(python_lang, "python");
1122
1123 assert!(code_blocks[1].content.contains("function greet"));
1125 let js_lang = code_blocks[1]
1126 .metadata
1127 .as_object()
1128 .unwrap()
1129 .get("language")
1130 .unwrap()
1131 .as_str()
1132 .unwrap();
1133 assert_eq!(js_lang, "javascript");
1134
1135 assert!(code_blocks[2].content.contains("generic code here"));
1137 let generic_lang = code_blocks[2]
1138 .metadata
1139 .as_object()
1140 .unwrap()
1141 .get("language")
1142 .unwrap()
1143 .as_str()
1144 .unwrap();
1145 assert_eq!(generic_lang, "text");
1146 }
1147
1148 #[test]
1149 fn test_markdown_parser_paragraphs() {
1150 let parser = MarkdownParser::new();
1151 let content = r#"This is the first paragraph with some content.
1152It spans multiple lines.
1153
1154This is the second paragraph.
1155
1156# A Header
1157
1158This is a paragraph after a header.
1159
1160Another paragraph here."#;
1161
1162 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1163
1164 let paragraphs: Vec<_> = chunks
1165 .iter()
1166 .filter(|chunk| {
1167 if let Some(metadata) = chunk.metadata.as_object() {
1168 metadata.get("element_type").and_then(|v| v.as_str()) == Some("paragraph")
1169 } else {
1170 false
1171 }
1172 })
1173 .collect();
1174
1175 assert!(paragraphs.len() >= 3, "Should find at least 3 paragraphs");
1176 assert!(paragraphs[0].content.contains("first paragraph"));
1177 assert!(paragraphs[1].content.contains("second paragraph"));
1178 }
1179
1180 #[test]
1181 fn test_json_config_parser() {
1182 let parser = ConfigParser::new();
1183 let content = r#"{
1184 "database": {
1185 "host": "localhost",
1186 "port": 5432,
1187 "name": "myapp"
1188 },
1189 "features": ["auth", "logging", "metrics"],
1190 "debug": true,
1191 "version": "1.0.0"
1192}"#;
1193
1194 let chunks = parser
1195 .parse(Path::new("config.json"), content, ConfigFormat::Json)
1196 .unwrap();
1197
1198 assert!(!chunks.is_empty(), "Should extract chunks from JSON");
1199
1200 let string_chunks: Vec<_> = chunks
1202 .iter()
1203 .filter(|chunk| {
1204 if let Some(metadata) = chunk.metadata.as_object() {
1205 metadata.get("value_type").and_then(|v| v.as_str()) == Some("string")
1206 } else {
1207 false
1208 }
1209 })
1210 .collect();
1211
1212 let boolean_chunks: Vec<_> = chunks
1213 .iter()
1214 .filter(|chunk| {
1215 if let Some(metadata) = chunk.metadata.as_object() {
1216 metadata.get("value_type").and_then(|v| v.as_str()) == Some("boolean")
1217 } else {
1218 false
1219 }
1220 })
1221 .collect();
1222
1223 assert!(!string_chunks.is_empty(), "Should find string values");
1224 assert!(!boolean_chunks.is_empty(), "Should find boolean values");
1225 }
1226
1227 #[test]
1228 fn test_yaml_config_parser() {
1229 let parser = ConfigParser::new();
1230 let content = r#"database:
1231 host: localhost
1232 port: 5432
1233 name: myapp
1234
1235features:
1236 - auth
1237 - logging
1238 - metrics
1239
1240debug: true
1241version: "1.0.0"
1242"#;
1243
1244 let chunks = parser
1245 .parse(Path::new("config.yaml"), content, ConfigFormat::Yaml)
1246 .unwrap();
1247
1248 assert!(!chunks.is_empty(), "Should extract chunks from YAML");
1249
1250 let has_database = chunks
1252 .iter()
1253 .any(|chunk| chunk.content.contains("host: localhost"));
1254 let has_debug = chunks
1255 .iter()
1256 .any(|chunk| chunk.content.contains("debug: true"));
1257
1258 assert!(has_database, "Should find database configuration");
1259 assert!(has_debug, "Should find debug setting");
1260 }
1261
1262 #[test]
1263 fn test_toml_config_parser() {
1264 let parser = ConfigParser::new();
1265 let content = r#"[database]
1266host = "localhost"
1267port = 5432
1268name = "myapp"
1269
1270[features]
1271auth = true
1272logging = true
1273metrics = false
1274
1275debug = true
1276version = "1.0.0"
1277"#;
1278
1279 let chunks = parser
1280 .parse(Path::new("Cargo.toml"), content, ConfigFormat::Toml)
1281 .unwrap();
1282
1283 assert!(!chunks.is_empty(), "Should extract chunks from TOML");
1284
1285 let sections: Vec<_> = chunks
1287 .iter()
1288 .filter(|chunk| {
1289 if let Some(metadata) = chunk.metadata.as_object() {
1290 metadata.get("element_type").and_then(|v| v.as_str()) == Some("section")
1291 } else {
1292 false
1293 }
1294 })
1295 .collect();
1296
1297 assert!(sections.len() >= 2, "Should find at least 2 sections");
1298 assert!(sections.iter().any(|s| s.content == "database"));
1299 assert!(sections.iter().any(|s| s.content == "features"));
1300
1301 let key_values: Vec<_> = chunks
1302 .iter()
1303 .filter(|chunk| chunk.content.contains(" = "))
1304 .collect();
1305
1306 assert!(!key_values.is_empty(), "Should find key-value pairs");
1307 }
1308
1309 #[test]
1310 fn test_ini_config_parser() {
1311 let parser = ConfigParser::new();
1312 let content = r#"[database]
1313host=localhost
1314port=5432
1315name=myapp
1316
1317[logging]
1318level=info
1319file=/var/log/app.log
1320
1321debug=true
1322"#;
1323
1324 let chunks = parser
1325 .parse(Path::new("config.ini"), content, ConfigFormat::Ini)
1326 .unwrap();
1327
1328 assert!(!chunks.is_empty(), "Should extract chunks from INI");
1329
1330 let key_values: Vec<_> = chunks
1331 .iter()
1332 .filter(|chunk| chunk.content.contains("="))
1333 .collect();
1334
1335 assert!(
1336 key_values.len() >= 5,
1337 "Should find multiple key-value pairs"
1338 );
1339 assert!(key_values
1340 .iter()
1341 .any(|kv| kv.content.contains("host=localhost")));
1342 assert!(key_values
1343 .iter()
1344 .any(|kv| kv.content.contains("level=info")));
1345 }
1346
1347 #[test]
1348 fn test_properties_config_parser() {
1349 let parser = ConfigParser::new();
1350 let content = r#"# Application configuration
1351database.host=localhost
1352database.port=5432
1353database.name=myapp
1354
1355# Logging configuration
1356logging.level=info
1357logging.file=/var/log/app.log
1358
1359debug=true
1360"#;
1361
1362 let chunks = parser
1363 .parse(
1364 Path::new("app.properties"),
1365 content,
1366 ConfigFormat::Properties,
1367 )
1368 .unwrap();
1369
1370 assert!(!chunks.is_empty(), "Should extract chunks from properties");
1371
1372 let properties: Vec<_> = chunks
1373 .iter()
1374 .filter(|chunk| chunk.content.contains("="))
1375 .collect();
1376
1377 assert!(properties.len() >= 5, "Should find multiple properties");
1378 assert!(properties
1379 .iter()
1380 .any(|p| p.content.contains("database.host=localhost")));
1381 assert!(properties
1382 .iter()
1383 .any(|p| p.content.contains("logging.level=info")));
1384 }
1385
1386 #[test]
1387 fn test_env_config_parser() {
1388 let parser = ConfigParser::new();
1389 let content = r#"DATABASE_HOST=localhost
1390DATABASE_PORT=5432
1391DATABASE_NAME=myapp
1392DEBUG=true
1393SECRET_KEY=abc123xyz
1394"#;
1395
1396 let chunks = parser
1397 .parse(Path::new(".env"), content, ConfigFormat::Env)
1398 .unwrap();
1399
1400 assert!(!chunks.is_empty(), "Should extract chunks from env file");
1401
1402 let env_vars: Vec<_> = chunks
1403 .iter()
1404 .filter(|chunk| chunk.content.contains("="))
1405 .collect();
1406
1407 assert_eq!(env_vars.len(), 5, "Should find 5 environment variables");
1408 assert!(env_vars
1409 .iter()
1410 .any(|var| var.content.contains("DATABASE_HOST=localhost")));
1411 assert!(env_vars
1412 .iter()
1413 .any(|var| var.content.contains("DEBUG=true")));
1414 }
1415
1416 #[test]
1417 fn test_xml_config_parser() {
1418 let parser = ConfigParser::new();
1419 let content = r#"<configuration>
1420 <database>
1421 <host>localhost</host>
1422 <port>5432</port>
1423 <name>myapp</name>
1424 </database>
1425 <features>
1426 <auth>true</auth>
1427 <logging>true</logging>
1428 </features>
1429 <debug>true</debug>
1430</configuration>"#;
1431
1432 let chunks = parser
1433 .parse(Path::new("config.xml"), content, ConfigFormat::Xml)
1434 .unwrap();
1435
1436 assert!(!chunks.is_empty(), "Should extract chunks from XML");
1437
1438 let tag_contents: Vec<_> = chunks
1440 .iter()
1441 .filter(|chunk| !chunk.content.trim().is_empty())
1442 .collect();
1443
1444 assert!(!tag_contents.is_empty(), "Should find tag contents");
1445 assert!(tag_contents.iter().any(|tag| tag.content == "localhost"));
1446 assert!(tag_contents.iter().any(|tag| tag.content == "5432"));
1447 assert!(tag_contents.iter().any(|tag| tag.content == "true"));
1448 }
1449
1450 #[test]
1451 fn test_text_parser_paragraphs() {
1452 let parser = TextParser::new();
1453 let content = r#"This is the first paragraph.
1454It has multiple lines.
1455
1456This is the second paragraph.
1457
1458This is the third paragraph.
1459It also has multiple lines.
1460And even more lines."#;
1461
1462 let chunks = parser
1463 .parse(
1464 Path::new("document.txt"),
1465 content,
1466 DocumentFormat::PlainText,
1467 )
1468 .unwrap();
1469
1470 assert_eq!(chunks.len(), 3, "Should find 3 paragraphs");
1471
1472 assert!(chunks[0].content.contains("first paragraph"));
1473 assert!(chunks[1].content.contains("second paragraph"));
1474 assert!(chunks[2].content.contains("third paragraph"));
1475
1476 for chunk in &chunks {
1478 let metadata = chunk.metadata.as_object().unwrap();
1479 assert_eq!(
1480 metadata.get("element_type").unwrap().as_str().unwrap(),
1481 "paragraph"
1482 );
1483 assert!(metadata.get("line_count").unwrap().as_u64().unwrap() >= 1);
1484 }
1485 }
1486
1487 #[test]
1488 fn test_invalid_json_handling() {
1489 let parser = ConfigParser::new();
1490 let invalid_json = r#"{ invalid json content here"#;
1491
1492 let chunks = parser
1493 .parse(Path::new("bad.json"), invalid_json, ConfigFormat::Json)
1494 .unwrap();
1495
1496 assert_eq!(
1497 chunks.len(),
1498 1,
1499 "Should create a single chunk for invalid JSON"
1500 );
1501 assert_eq!(chunks[0].content, invalid_json);
1502
1503 let metadata = chunks[0].metadata.as_object().unwrap();
1504 assert_eq!(
1505 metadata.get("parse_error").unwrap().as_bool().unwrap(),
1506 true
1507 );
1508 assert_eq!(
1509 metadata.get("config_type").unwrap().as_str().unwrap(),
1510 "json"
1511 );
1512 }
1513
1514 #[test]
1515 fn test_empty_content_handling() {
1516 let parser = DocumentParser::new();
1517
1518 let empty_md = "";
1519 let node = parser.parse_file(Path::new("empty.md"), empty_md).unwrap();
1520
1521 assert_eq!(
1522 node.chunks.len(),
1523 0,
1524 "Empty content should produce no chunks"
1525 );
1526 assert_eq!(node.file_size, 0);
1527 }
1528
1529 #[test]
1530 fn test_large_content_handling() {
1531 let parser = DocumentParser::new();
1532
1533 let mut content = String::new();
1535 for i in 0..100 {
1536 content.push_str(&format!(
1537 "# Header {}\n\nThis is paragraph {} with some content.\n\n",
1538 i, i
1539 ));
1540 }
1541
1542 let node = parser.parse_file(Path::new("large.md"), &content).unwrap();
1543
1544 assert!(node.chunks.len() >= 100, "Should handle large content");
1545 assert_eq!(node.file_size, content.len());
1546
1547 let headers = node
1549 .chunks
1550 .iter()
1551 .filter(|chunk| {
1552 if let Some(metadata) = chunk.metadata.as_object() {
1553 metadata.get("element_type").and_then(|v| v.as_str()) == Some("header")
1554 } else {
1555 false
1556 }
1557 })
1558 .count();
1559
1560 assert!(headers >= 100, "Should find many headers");
1561 }
1562
1563 #[test]
1564 fn test_content_span_calculation() {
1565 let parser = MarkdownParser::new();
1566 let content = "# Title\nSome content.";
1567
1568 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1569
1570 for chunk in chunks {
1571 assert!(
1572 chunk.span.start_byte < chunk.span.end_byte,
1573 "Start should be before end"
1574 );
1575 assert!(
1576 chunk.span.start_line <= chunk.span.end_line,
1577 "Start line should be <= end line"
1578 );
1579 assert!(chunk.span.start_column >= 1, "Column should be 1-indexed");
1580 assert!(
1581 chunk.span.end_byte <= content.len(),
1582 "End should not exceed content length"
1583 );
1584 }
1585 }
1586}