1use super::{ConfigFormat, ContentChunk, ContentNode, ContentType, DocumentFormat};
7use crate::ast::Span;
8use anyhow::{anyhow, Result};
9use regex::Regex;
10use serde_json::Value;
11use std::path::Path;
12
13pub struct DocumentParser {
15 markdown_parser: MarkdownParser,
17 config_parser: ConfigParser,
19 text_parser: TextParser,
21}
22
23impl DocumentParser {
24 pub fn new() -> Self {
26 Self {
27 markdown_parser: MarkdownParser::new(),
28 config_parser: ConfigParser::new(),
29 text_parser: TextParser::new(),
30 }
31 }
32
33 pub fn parse_file(&self, file_path: &Path, content: &str) -> Result<ContentNode> {
35 let content_type = self.detect_content_type(file_path)?;
36 let mut node = ContentNode::new(file_path.to_path_buf(), content_type.clone());
37
38 let chunks = match content_type {
39 ContentType::Documentation { format } => match format {
40 DocumentFormat::Markdown => self.markdown_parser.parse(file_path, content)?,
41 DocumentFormat::PlainText
42 | DocumentFormat::RestructuredText
43 | DocumentFormat::AsciiDoc
44 | DocumentFormat::Html => self.text_parser.parse(file_path, content, format)?,
45 },
46 ContentType::Configuration { format } => {
47 self.config_parser.parse(file_path, content, format)?
48 }
49 ContentType::PlainText => {
50 self.text_parser
51 .parse(file_path, content, DocumentFormat::PlainText)?
52 }
53 _ => return Err(anyhow!("Unsupported content type for document parser")),
54 };
55
56 for chunk in chunks {
57 node.add_chunk(chunk);
58 }
59 node.file_size = content.len();
60
61 Ok(node)
62 }
63
64 fn detect_content_type(&self, file_path: &Path) -> Result<ContentType> {
66 if let Some(file_name) = file_path.file_name().and_then(|n| n.to_str()) {
68 if file_name == ".env" {
69 return Ok(ContentType::Configuration {
70 format: ConfigFormat::Env,
71 });
72 }
73 }
74
75 let extension = file_path
76 .extension()
77 .and_then(|ext| ext.to_str())
78 .unwrap_or("")
79 .to_lowercase();
80
81 match extension.as_str() {
82 "md" | "markdown" => Ok(ContentType::Documentation {
83 format: DocumentFormat::Markdown,
84 }),
85 "rst" => Ok(ContentType::Documentation {
86 format: DocumentFormat::RestructuredText,
87 }),
88 "adoc" | "asciidoc" => Ok(ContentType::Documentation {
89 format: DocumentFormat::AsciiDoc,
90 }),
91 "html" | "htm" => Ok(ContentType::Documentation {
92 format: DocumentFormat::Html,
93 }),
94 "txt" | "text" => Ok(ContentType::Documentation {
95 format: DocumentFormat::PlainText,
96 }),
97 "json" => Ok(ContentType::Configuration {
98 format: ConfigFormat::Json,
99 }),
100 "yaml" | "yml" => Ok(ContentType::Configuration {
101 format: ConfigFormat::Yaml,
102 }),
103 "toml" => Ok(ContentType::Configuration {
104 format: ConfigFormat::Toml,
105 }),
106 "ini" => Ok(ContentType::Configuration {
107 format: ConfigFormat::Ini,
108 }),
109 "properties" => Ok(ContentType::Configuration {
110 format: ConfigFormat::Properties,
111 }),
112 "env" => Ok(ContentType::Configuration {
113 format: ConfigFormat::Env,
114 }),
115 "xml" => Ok(ContentType::Configuration {
116 format: ConfigFormat::Xml,
117 }),
118 _ => Ok(ContentType::PlainText),
119 }
120 }
121}
122
123impl Default for DocumentParser {
124 fn default() -> Self {
125 Self::new()
126 }
127}
128
129pub struct MarkdownParser {
131 header_regex: Regex,
133 code_block_regex: Regex,
135 #[allow(dead_code)] inline_code_regex: Regex,
138 #[allow(dead_code)] link_regex: Regex,
141 #[allow(dead_code)] list_regex: Regex,
144}
145
146impl MarkdownParser {
147 pub fn new() -> Self {
149 Self {
150 header_regex: Regex::new(r"(?m)^(#{1,6})\s+(.+)$").unwrap(),
151 code_block_regex: Regex::new(r"```(\w+)?\n([\s\S]*?)\n```").unwrap(),
152 inline_code_regex: Regex::new(r"`([^`]+)`").unwrap(),
153 link_regex: Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap(),
154 list_regex: Regex::new(r"(?m)^[\s]*[-*+]\s+(.+)$").unwrap(),
155 }
156 }
157
158 pub fn parse(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
160 let mut chunks = Vec::new();
161 let lines: Vec<&str> = content.lines().collect();
162 let mut _current_line = 0;
163 let mut chunk_index = 0;
164
165 for (line_idx, line) in lines.iter().enumerate() {
167 if let Some(captures) = self.header_regex.captures(line) {
168 let level = captures.get(1).unwrap().as_str().len();
169 let header_text = captures.get(2).unwrap().as_str();
170
171 let span = self.calculate_line_span(line_idx, line, content);
172 let chunk = ContentChunk::new(
173 file_path.to_path_buf(),
174 ContentType::Documentation {
175 format: DocumentFormat::Markdown,
176 },
177 header_text.to_string(),
178 span,
179 chunk_index,
180 )
181 .with_metadata(serde_json::json!({
182 "header_level": level,
183 "element_type": "header"
184 }));
185
186 chunks.push(chunk);
187 chunk_index += 1;
188 }
189 }
190
191 for captures in self.code_block_regex.captures_iter(content) {
193 let language = captures.get(1).map(|m| m.as_str()).unwrap_or("text");
194 let code_content = captures.get(2).unwrap().as_str();
195 let full_match = captures.get(0).unwrap();
196
197 let span = self.calculate_match_span(&full_match, content);
198 let chunk = ContentChunk::new(
199 file_path.to_path_buf(),
200 ContentType::Documentation {
201 format: DocumentFormat::Markdown,
202 },
203 code_content.to_string(),
204 span,
205 chunk_index,
206 )
207 .with_metadata(serde_json::json!({
208 "language": language,
209 "element_type": "code_block"
210 }));
211
212 chunks.push(chunk);
213 chunk_index += 1;
214 }
215
216 let mut paragraph_start = 0;
218 let mut in_paragraph = false;
219 let mut paragraph_lines = Vec::new();
220
221 for (line_idx, line) in lines.iter().enumerate() {
222 let line_trimmed = line.trim();
223
224 if self.header_regex.is_match(line)
226 || line_trimmed.starts_with("```")
227 || line_trimmed.is_empty()
228 {
229 if in_paragraph && !paragraph_lines.is_empty() {
231 let paragraph_text = paragraph_lines.join("\n");
232 let span =
233 self.calculate_paragraph_span(paragraph_start, line_idx - 1, content);
234
235 let chunk = ContentChunk::new(
236 file_path.to_path_buf(),
237 ContentType::Documentation {
238 format: DocumentFormat::Markdown,
239 },
240 paragraph_text,
241 span,
242 chunk_index,
243 )
244 .with_metadata(serde_json::json!({
245 "element_type": "paragraph"
246 }));
247
248 chunks.push(chunk);
249 chunk_index += 1;
250 }
251
252 in_paragraph = false;
253 paragraph_lines.clear();
254 continue;
255 }
256
257 if !in_paragraph {
259 in_paragraph = true;
260 paragraph_start = line_idx;
261 }
262 paragraph_lines.push(line_trimmed);
263 }
264
265 if in_paragraph && !paragraph_lines.is_empty() {
267 let paragraph_text = paragraph_lines.join("\n");
268 let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, content);
269
270 let chunk = ContentChunk::new(
271 file_path.to_path_buf(),
272 ContentType::Documentation {
273 format: DocumentFormat::Markdown,
274 },
275 paragraph_text,
276 span,
277 chunk_index,
278 )
279 .with_metadata(serde_json::json!({
280 "element_type": "paragraph"
281 }));
282
283 chunks.push(chunk);
284 }
285
286 Ok(chunks)
287 }
288
289 fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
291 let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
292 let start_byte = lines_before;
293 let end_byte = start_byte + line.len();
294
295 Span::new(
296 start_byte,
297 end_byte,
298 line_idx + 1,
299 line_idx + 1,
300 1,
301 line.len() + 1,
302 )
303 }
304
305 fn calculate_match_span(&self, match_obj: ®ex::Match, content: &str) -> Span {
307 let start_byte = match_obj.start();
308 let end_byte = match_obj.end();
309
310 let content_before = &content[..start_byte];
312 let start_line = content_before.lines().count();
313 let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
314
315 let match_content = match_obj.as_str();
317 let lines_in_match = match_content.lines().count();
318 let end_line = start_line + lines_in_match.saturating_sub(1);
319 let end_column = if lines_in_match > 1 {
320 match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
321 } else {
322 start_column + match_content.len()
323 };
324
325 Span::new(
326 start_byte,
327 end_byte,
328 start_line.max(1),
329 end_line.max(1),
330 start_column,
331 end_column,
332 )
333 }
334
335 fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, content: &str) -> Span {
337 let lines: Vec<&str> = content.lines().collect();
338 let start_byte: usize = lines
339 .iter()
340 .take(start_line)
341 .map(|l| l.len() + 1)
342 .sum::<usize>();
343 let end_byte: usize = lines
344 .iter()
345 .take(end_line + 1)
346 .map(|l| l.len() + 1)
347 .sum::<usize>()
348 - 1;
349
350 Span::new(
351 start_byte,
352 end_byte,
353 start_line + 1,
354 end_line + 1,
355 1,
356 lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
357 )
358 }
359}
360
361impl Default for MarkdownParser {
362 fn default() -> Self {
363 Self::new()
364 }
365}
366
/// Stateless extractor that turns configuration files into searchable
/// key/value chunks.
#[derive(Debug, Clone)]
pub struct ConfigParser;
369
370impl ConfigParser {
371 pub fn new() -> Self {
373 Self
374 }
375
376 pub fn parse(
378 &self,
379 file_path: &Path,
380 content: &str,
381 format: ConfigFormat,
382 ) -> Result<Vec<ContentChunk>> {
383 match format {
384 ConfigFormat::Json => self.parse_json(file_path, content),
385 ConfigFormat::Yaml => self.parse_yaml(file_path, content),
386 ConfigFormat::Toml => self.parse_toml(file_path, content),
387 ConfigFormat::Ini => self.parse_ini(file_path, content),
388 ConfigFormat::Properties => self.parse_properties(file_path, content),
389 ConfigFormat::Env => self.parse_env(file_path, content),
390 ConfigFormat::Xml => self.parse_xml(file_path, content),
391 }
392 }
393
394 fn parse_json(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
396 let mut chunks = Vec::new();
397
398 match serde_json::from_str::<Value>(content) {
400 Ok(value) => {
401 self.extract_json_values(&value, file_path, content, &mut chunks, 0, "");
403 }
404 Err(_) => {
405 chunks.push(
407 ContentChunk::new(
408 file_path.to_path_buf(),
409 ContentType::Configuration {
410 format: ConfigFormat::Json,
411 },
412 content.to_string(),
413 Span::new(
414 0,
415 content.len(),
416 1,
417 content.lines().count(),
418 1,
419 content.lines().last().map(|l| l.len()).unwrap_or(0),
420 ),
421 0,
422 )
423 .with_metadata(serde_json::json!({
424 "parse_error": true,
425 "config_type": "json"
426 })),
427 );
428 }
429 }
430
431 Ok(chunks)
432 }
433
434 #[allow(clippy::only_used_in_recursion)] fn extract_json_values(
437 &self,
438 value: &Value,
439 file_path: &Path,
440 content: &str,
441 chunks: &mut Vec<ContentChunk>,
442 chunk_index: usize,
443 key_path: &str,
444 ) {
445 match value {
446 Value::Object(map) => {
447 for (key, val) in map {
448 let new_path = if key_path.is_empty() {
449 key.clone()
450 } else {
451 format!("{key_path}.{key}")
452 };
453 self.extract_json_values(
454 val,
455 file_path,
456 content,
457 chunks,
458 chunks.len(),
459 &new_path,
460 );
461 }
462 }
463 Value::Array(arr) => {
464 for (index, val) in arr.iter().enumerate() {
465 let new_path = format!("{key_path}[{index}]");
466 self.extract_json_values(
467 val,
468 file_path,
469 content,
470 chunks,
471 chunks.len(),
472 &new_path,
473 );
474 }
475 }
476 Value::String(_) | Value::Number(_) | Value::Bool(_) => {
477 let value_str = match value {
479 Value::String(s) => s.clone(),
480 _ => value.to_string(),
481 };
482
483 let searchable_content = if key_path.is_empty() {
485 value_str.clone()
486 } else {
487 format!("{key_path}: {value_str}")
488 };
489
490 if let Some(position) = content.find(&value_str) {
492 let lines_before = content[..position].lines().count();
493 let line_start = content[..position].rfind('\n').map(|i| i + 1).unwrap_or(0);
494 let column = position - line_start + 1;
495
496 let span = Span::new(
497 position,
498 position + value_str.len(),
499 lines_before.max(1),
500 lines_before.max(1),
501 column,
502 column + value_str.len(),
503 );
504
505 let chunk = ContentChunk::new(
506 file_path.to_path_buf(),
507 ContentType::Configuration {
508 format: ConfigFormat::Json,
509 },
510 searchable_content,
511 span,
512 chunk_index,
513 )
514 .with_metadata(serde_json::json!({
515 "key_path": key_path,
516 "value": value_str,
517 "value_type": match value {
518 Value::String(_) => "string",
519 Value::Number(_) => "number",
520 Value::Bool(_) => "boolean",
521 _ => "unknown"
522 },
523 "config_type": "json"
524 }));
525
526 chunks.push(chunk);
527 }
528 }
529 Value::Null => {} }
531 }
532
533 fn parse_yaml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
535 let mut chunks = Vec::new();
537 let lines: Vec<&str> = content.lines().collect();
538
539 for (line_idx, line) in lines.iter().enumerate() {
540 let trimmed = line.trim();
541 if trimmed.is_empty() || trimmed.starts_with('#') {
542 continue;
543 }
544
545 if let Some(colon_pos) = trimmed.find(':') {
547 let key = trimmed[..colon_pos].trim();
548 let value = trimmed[colon_pos + 1..].trim();
549
550 if !value.is_empty() {
551 let span = self.calculate_line_span(line_idx, line, content);
552 let chunk = ContentChunk::new(
553 file_path.to_path_buf(),
554 ContentType::Configuration {
555 format: ConfigFormat::Yaml,
556 },
557 format!("{key}: {value}"),
558 span,
559 chunks.len(),
560 )
561 .with_metadata(serde_json::json!({
562 "key": key,
563 "value": value,
564 "config_type": "yaml"
565 }));
566
567 chunks.push(chunk);
568 }
569 }
570 }
571
572 Ok(chunks)
573 }
574
575 fn parse_toml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
577 let mut chunks = Vec::new();
579 let lines: Vec<&str> = content.lines().collect();
580
581 for (line_idx, line) in lines.iter().enumerate() {
582 let trimmed = line.trim();
583 if trimmed.is_empty() || trimmed.starts_with('#') {
584 continue;
585 }
586
587 if trimmed.starts_with('[') && trimmed.ends_with(']') {
589 let section = &trimmed[1..trimmed.len() - 1];
590 let span = self.calculate_line_span(line_idx, line, content);
591 let chunk = ContentChunk::new(
592 file_path.to_path_buf(),
593 ContentType::Configuration {
594 format: ConfigFormat::Toml,
595 },
596 section.to_string(),
597 span,
598 chunks.len(),
599 )
600 .with_metadata(serde_json::json!({
601 "element_type": "section",
602 "section_name": section,
603 "config_type": "toml"
604 }));
605
606 chunks.push(chunk);
607 continue;
608 }
609
610 if let Some(eq_pos) = trimmed.find('=') {
612 let key = trimmed[..eq_pos].trim();
613 let value = trimmed[eq_pos + 1..].trim();
614
615 let span = self.calculate_line_span(line_idx, line, content);
616 let chunk = ContentChunk::new(
617 file_path.to_path_buf(),
618 ContentType::Configuration {
619 format: ConfigFormat::Toml,
620 },
621 format!("{key} = {value}"),
622 span,
623 chunks.len(),
624 )
625 .with_metadata(serde_json::json!({
626 "key": key,
627 "value": value,
628 "config_type": "toml"
629 }));
630
631 chunks.push(chunk);
632 }
633 }
634
635 Ok(chunks)
636 }
637
638 fn parse_ini(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
640 self.parse_key_value_format(file_path, content, ConfigFormat::Ini, "ini")
642 }
643
644 fn parse_properties(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
646 self.parse_key_value_format(file_path, content, ConfigFormat::Properties, "properties")
647 }
648
649 fn parse_env(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
651 self.parse_key_value_format(file_path, content, ConfigFormat::Env, "env")
652 }
653
654 fn parse_xml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
656 let tag_regex = Regex::new(r"<([^/>]+)>([^<]+)</[^>]+>").unwrap();
658 let mut chunks = Vec::new();
659
660 for (idx, captures) in tag_regex.captures_iter(content).enumerate() {
661 let tag_name = captures.get(1).unwrap().as_str();
662 let tag_content = captures.get(2).unwrap().as_str().trim();
663
664 if !tag_content.is_empty() {
665 let full_match = captures.get(0).unwrap();
666 let span = self.calculate_match_span(&full_match, content);
667
668 let chunk = ContentChunk::new(
669 file_path.to_path_buf(),
670 ContentType::Configuration {
671 format: ConfigFormat::Xml,
672 },
673 tag_content.to_string(),
674 span,
675 idx,
676 )
677 .with_metadata(serde_json::json!({
678 "tag_name": tag_name,
679 "config_type": "xml"
680 }));
681
682 chunks.push(chunk);
683 }
684 }
685
686 Ok(chunks)
687 }
688
689 fn parse_key_value_format(
691 &self,
692 file_path: &Path,
693 content: &str,
694 format: ConfigFormat,
695 format_name: &str,
696 ) -> Result<Vec<ContentChunk>> {
697 let mut chunks = Vec::new();
698 let lines: Vec<&str> = content.lines().collect();
699
700 for (line_idx, line) in lines.iter().enumerate() {
701 let trimmed = line.trim();
702 if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with(';') {
703 continue;
704 }
705
706 if let Some(eq_pos) = trimmed.find('=') {
708 let key = trimmed[..eq_pos].trim();
709 let value = trimmed[eq_pos + 1..].trim();
710
711 let span = self.calculate_line_span(line_idx, line, content);
712 let chunk = ContentChunk::new(
713 file_path.to_path_buf(),
714 ContentType::Configuration {
715 format: format.clone(),
716 },
717 format!("{key}={value}"),
718 span,
719 chunks.len(),
720 )
721 .with_metadata(serde_json::json!({
722 "key": key,
723 "value": value,
724 "config_type": format_name
725 }));
726
727 chunks.push(chunk);
728 }
729 }
730
731 Ok(chunks)
732 }
733
734 fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
736 let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
737 let start_byte = lines_before;
738 let end_byte = start_byte + line.len();
739
740 Span::new(
741 start_byte,
742 end_byte,
743 line_idx + 1,
744 line_idx + 1,
745 1,
746 line.len() + 1,
747 )
748 }
749
750 fn calculate_match_span(&self, match_obj: ®ex::Match, content: &str) -> Span {
752 let start_byte = match_obj.start();
753 let end_byte = match_obj.end();
754
755 let content_before = &content[..start_byte];
756 let start_line = content_before.lines().count();
757 let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
758
759 let match_content = match_obj.as_str();
760 let lines_in_match = match_content.lines().count();
761 let end_line = start_line + lines_in_match.saturating_sub(1);
762 let end_column = if lines_in_match > 1 {
763 match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
764 } else {
765 start_column + match_content.len()
766 };
767
768 Span::new(
769 start_byte,
770 end_byte,
771 start_line.max(1),
772 end_line.max(1),
773 start_column,
774 end_column,
775 )
776 }
777}
778
779impl Default for ConfigParser {
780 fn default() -> Self {
781 Self::new()
782 }
783}
784
/// Stateless chunker that splits text into paragraphs separated by blank lines.
#[derive(Debug, Clone)]
pub struct TextParser;
787
788impl TextParser {
789 pub fn new() -> Self {
791 Self
792 }
793
794 pub fn parse(
796 &self,
797 file_path: &Path,
798 content: &str,
799 format: DocumentFormat,
800 ) -> Result<Vec<ContentChunk>> {
801 let mut chunks = Vec::new();
802 let lines: Vec<&str> = content.lines().collect();
803
804 let mut paragraph_start = 0;
805 let mut paragraph_lines = Vec::new();
806 let mut chunk_index = 0;
807
808 for (line_idx, line) in lines.iter().enumerate() {
809 let trimmed = line.trim();
810
811 if trimmed.is_empty() {
812 if !paragraph_lines.is_empty() {
814 let paragraph_text = paragraph_lines.join("\n");
815 let span = self.calculate_paragraph_span(paragraph_start, line_idx - 1, &lines);
816
817 let chunk = ContentChunk::new(
818 file_path.to_path_buf(),
819 ContentType::Documentation {
820 format: format.clone(),
821 },
822 paragraph_text,
823 span,
824 chunk_index,
825 )
826 .with_metadata(serde_json::json!({
827 "element_type": "paragraph",
828 "line_count": paragraph_lines.len()
829 }));
830
831 chunks.push(chunk);
832 chunk_index += 1;
833 paragraph_lines.clear();
834 }
835 continue;
836 }
837
838 if paragraph_lines.is_empty() {
840 paragraph_start = line_idx;
841 }
842 paragraph_lines.push(trimmed);
843 }
844
845 if !paragraph_lines.is_empty() {
847 let paragraph_text = paragraph_lines.join("\n");
848 let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, &lines);
849
850 let chunk = ContentChunk::new(
851 file_path.to_path_buf(),
852 ContentType::Documentation { format },
853 paragraph_text,
854 span,
855 chunk_index,
856 )
857 .with_metadata(serde_json::json!({
858 "element_type": "paragraph",
859 "line_count": paragraph_lines.len()
860 }));
861
862 chunks.push(chunk);
863 }
864
865 Ok(chunks)
866 }
867
868 fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, lines: &[&str]) -> Span {
870 let start_byte: usize = lines
871 .iter()
872 .take(start_line)
873 .map(|l| l.len() + 1)
874 .sum::<usize>();
875 let end_byte: usize = lines
876 .iter()
877 .take(end_line + 1)
878 .map(|l| l.len() + 1)
879 .sum::<usize>()
880 - 1;
881
882 Span::new(
883 start_byte,
884 end_byte,
885 start_line + 1,
886 end_line + 1,
887 1,
888 lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
889 )
890 }
891}
892
893impl Default for TextParser {
894 fn default() -> Self {
895 Self::new()
896 }
897}
898
899#[cfg(test)]
900mod tests {
901 use super::*;
902
903 #[test]
904 fn test_document_parser_creation() {
905 let _parser = DocumentParser::new();
906 }
909
910 #[test]
911 fn test_content_type_detection() {
912 let parser = DocumentParser::new();
913
914 let test_cases = vec![
915 (
916 "test.md",
917 ContentType::Documentation {
918 format: DocumentFormat::Markdown,
919 },
920 ),
921 (
922 "README.markdown",
923 ContentType::Documentation {
924 format: DocumentFormat::Markdown,
925 },
926 ),
927 (
928 "doc.rst",
929 ContentType::Documentation {
930 format: DocumentFormat::RestructuredText,
931 },
932 ),
933 (
934 "manual.adoc",
935 ContentType::Documentation {
936 format: DocumentFormat::AsciiDoc,
937 },
938 ),
939 (
940 "page.html",
941 ContentType::Documentation {
942 format: DocumentFormat::Html,
943 },
944 ),
945 (
946 "notes.txt",
947 ContentType::Documentation {
948 format: DocumentFormat::PlainText,
949 },
950 ),
951 (
952 "config.json",
953 ContentType::Configuration {
954 format: ConfigFormat::Json,
955 },
956 ),
957 (
958 "config.yaml",
959 ContentType::Configuration {
960 format: ConfigFormat::Yaml,
961 },
962 ),
963 (
964 "config.yml",
965 ContentType::Configuration {
966 format: ConfigFormat::Yaml,
967 },
968 ),
969 (
970 "Cargo.toml",
971 ContentType::Configuration {
972 format: ConfigFormat::Toml,
973 },
974 ),
975 (
976 "settings.ini",
977 ContentType::Configuration {
978 format: ConfigFormat::Ini,
979 },
980 ),
981 (
982 "app.properties",
983 ContentType::Configuration {
984 format: ConfigFormat::Properties,
985 },
986 ),
987 (
988 ".env",
989 ContentType::Configuration {
990 format: ConfigFormat::Env,
991 },
992 ),
993 (
994 "config.xml",
995 ContentType::Configuration {
996 format: ConfigFormat::Xml,
997 },
998 ),
999 ("unknown.xyz", ContentType::PlainText),
1000 ];
1001
1002 for (filename, expected_type) in test_cases {
1003 let path = Path::new(filename);
1004 let detected_type = parser.detect_content_type(path).unwrap();
1005 assert_eq!(
1006 std::mem::discriminant(&detected_type),
1007 std::mem::discriminant(&expected_type),
1008 "Failed for file: {filename}"
1009 );
1010 }
1011 }
1012
1013 #[test]
1014 fn test_markdown_parser_headers() {
1015 let parser = MarkdownParser::new();
1016 let content = r#"# Main Title
1017Some content here.
1018
1019## Secondary Title
1020More content.
1021
1022### Subsection
1023Even more content.
1024
1025#### Level 4
1026Content at level 4.
1027
1028##### Level 5
1029Content at level 5.
1030
1031###### Level 6
1032Content at level 6."#;
1033
1034 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1035
1036 let headers: Vec<_> = chunks
1038 .iter()
1039 .filter(|chunk| {
1040 if let Some(metadata) = chunk.metadata.as_object() {
1041 metadata.get("element_type").and_then(|v| v.as_str()) == Some("header")
1042 } else {
1043 false
1044 }
1045 })
1046 .collect();
1047
1048 assert_eq!(headers.len(), 6, "Should find 6 headers");
1049
1050 let header_levels: Vec<_> = headers
1052 .iter()
1053 .filter_map(|chunk| {
1054 chunk
1055 .metadata
1056 .as_object()
1057 .and_then(|m| m.get("header_level"))
1058 .and_then(|v| v.as_u64())
1059 })
1060 .collect();
1061
1062 assert_eq!(header_levels, vec![1, 2, 3, 4, 5, 6]);
1063 assert_eq!(headers[0].content, "Main Title");
1064 assert_eq!(headers[1].content, "Secondary Title");
1065 assert_eq!(headers[2].content, "Subsection");
1066 }
1067
1068 #[test]
1069 fn test_markdown_parser_code_blocks() {
1070 let parser = MarkdownParser::new();
1071 let content = r#"Here is some Python code:
1072
1073```python
1074def hello_world():
1075 print("Hello, World!")
1076 return "success"
1077```
1078
1079And here is some JavaScript:
1080
1081```javascript
1082function greet(name) {
1083 console.log(`Hello, ${name}!`);
1084}
1085```
1086
1087And a generic code block:
1088
1089```
1090generic code here
1091no language specified
1092```"#;
1093
1094 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1095
1096 let code_blocks: Vec<_> = chunks
1097 .iter()
1098 .filter(|chunk| {
1099 if let Some(metadata) = chunk.metadata.as_object() {
1100 metadata.get("element_type").and_then(|v| v.as_str()) == Some("code_block")
1101 } else {
1102 false
1103 }
1104 })
1105 .collect();
1106
1107 assert_eq!(code_blocks.len(), 3, "Should find 3 code blocks");
1108
1109 assert!(code_blocks[0].content.contains("def hello_world"));
1111 assert!(code_blocks[0].content.contains("print(\"Hello, World!\")"));
1112 let python_lang = code_blocks[0]
1113 .metadata
1114 .as_object()
1115 .unwrap()
1116 .get("language")
1117 .unwrap()
1118 .as_str()
1119 .unwrap();
1120 assert_eq!(python_lang, "python");
1121
1122 assert!(code_blocks[1].content.contains("function greet"));
1124 let js_lang = code_blocks[1]
1125 .metadata
1126 .as_object()
1127 .unwrap()
1128 .get("language")
1129 .unwrap()
1130 .as_str()
1131 .unwrap();
1132 assert_eq!(js_lang, "javascript");
1133
1134 assert!(code_blocks[2].content.contains("generic code here"));
1136 let generic_lang = code_blocks[2]
1137 .metadata
1138 .as_object()
1139 .unwrap()
1140 .get("language")
1141 .unwrap()
1142 .as_str()
1143 .unwrap();
1144 assert_eq!(generic_lang, "text");
1145 }
1146
1147 #[test]
1148 fn test_markdown_parser_paragraphs() {
1149 let parser = MarkdownParser::new();
1150 let content = r#"This is the first paragraph with some content.
1151It spans multiple lines.
1152
1153This is the second paragraph.
1154
1155# A Header
1156
1157This is a paragraph after a header.
1158
1159Another paragraph here."#;
1160
1161 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1162
1163 let paragraphs: Vec<_> = chunks
1164 .iter()
1165 .filter(|chunk| {
1166 if let Some(metadata) = chunk.metadata.as_object() {
1167 metadata.get("element_type").and_then(|v| v.as_str()) == Some("paragraph")
1168 } else {
1169 false
1170 }
1171 })
1172 .collect();
1173
1174 assert!(paragraphs.len() >= 3, "Should find at least 3 paragraphs");
1175 assert!(paragraphs[0].content.contains("first paragraph"));
1176 assert!(paragraphs[1].content.contains("second paragraph"));
1177 }
1178
1179 #[test]
1180 fn test_json_config_parser() {
1181 let parser = ConfigParser::new();
1182 let content = r#"{
1183 "database": {
1184 "host": "localhost",
1185 "port": 5432,
1186 "name": "myapp"
1187 },
1188 "features": ["auth", "logging", "metrics"],
1189 "debug": true,
1190 "version": "1.0.0"
1191}"#;
1192
1193 let chunks = parser
1194 .parse(Path::new("config.json"), content, ConfigFormat::Json)
1195 .unwrap();
1196
1197 assert!(!chunks.is_empty(), "Should extract chunks from JSON");
1198
1199 let string_chunks: Vec<_> = chunks
1201 .iter()
1202 .filter(|chunk| {
1203 if let Some(metadata) = chunk.metadata.as_object() {
1204 metadata.get("value_type").and_then(|v| v.as_str()) == Some("string")
1205 } else {
1206 false
1207 }
1208 })
1209 .collect();
1210
1211 let boolean_chunks: Vec<_> = chunks
1212 .iter()
1213 .filter(|chunk| {
1214 if let Some(metadata) = chunk.metadata.as_object() {
1215 metadata.get("value_type").and_then(|v| v.as_str()) == Some("boolean")
1216 } else {
1217 false
1218 }
1219 })
1220 .collect();
1221
1222 assert!(!string_chunks.is_empty(), "Should find string values");
1223 assert!(!boolean_chunks.is_empty(), "Should find boolean values");
1224 }
1225
1226 #[test]
1227 fn test_yaml_config_parser() {
1228 let parser = ConfigParser::new();
1229 let content = r#"database:
1230 host: localhost
1231 port: 5432
1232 name: myapp
1233
1234features:
1235 - auth
1236 - logging
1237 - metrics
1238
1239debug: true
1240version: "1.0.0"
1241"#;
1242
1243 let chunks = parser
1244 .parse(Path::new("config.yaml"), content, ConfigFormat::Yaml)
1245 .unwrap();
1246
1247 assert!(!chunks.is_empty(), "Should extract chunks from YAML");
1248
1249 let has_database = chunks
1251 .iter()
1252 .any(|chunk| chunk.content.contains("host: localhost"));
1253 let has_debug = chunks
1254 .iter()
1255 .any(|chunk| chunk.content.contains("debug: true"));
1256
1257 assert!(has_database, "Should find database configuration");
1258 assert!(has_debug, "Should find debug setting");
1259 }
1260
1261 #[test]
1262 fn test_toml_config_parser() {
1263 let parser = ConfigParser::new();
1264 let content = r#"[database]
1265host = "localhost"
1266port = 5432
1267name = "myapp"
1268
1269[features]
1270auth = true
1271logging = true
1272metrics = false
1273
1274debug = true
1275version = "1.0.0"
1276"#;
1277
1278 let chunks = parser
1279 .parse(Path::new("Cargo.toml"), content, ConfigFormat::Toml)
1280 .unwrap();
1281
1282 assert!(!chunks.is_empty(), "Should extract chunks from TOML");
1283
1284 let sections: Vec<_> = chunks
1286 .iter()
1287 .filter(|chunk| {
1288 if let Some(metadata) = chunk.metadata.as_object() {
1289 metadata.get("element_type").and_then(|v| v.as_str()) == Some("section")
1290 } else {
1291 false
1292 }
1293 })
1294 .collect();
1295
1296 assert!(sections.len() >= 2, "Should find at least 2 sections");
1297 assert!(sections.iter().any(|s| s.content == "database"));
1298 assert!(sections.iter().any(|s| s.content == "features"));
1299
1300 let key_values: Vec<_> = chunks
1301 .iter()
1302 .filter(|chunk| chunk.content.contains(" = "))
1303 .collect();
1304
1305 assert!(!key_values.is_empty(), "Should find key-value pairs");
1306 }
1307
1308 #[test]
1309 fn test_ini_config_parser() {
1310 let parser = ConfigParser::new();
1311 let content = r#"[database]
1312host=localhost
1313port=5432
1314name=myapp
1315
1316[logging]
1317level=info
1318file=/var/log/app.log
1319
1320debug=true
1321"#;
1322
1323 let chunks = parser
1324 .parse(Path::new("config.ini"), content, ConfigFormat::Ini)
1325 .unwrap();
1326
1327 assert!(!chunks.is_empty(), "Should extract chunks from INI");
1328
1329 let key_values: Vec<_> = chunks
1330 .iter()
1331 .filter(|chunk| chunk.content.contains("="))
1332 .collect();
1333
1334 assert!(
1335 key_values.len() >= 5,
1336 "Should find multiple key-value pairs"
1337 );
1338 assert!(key_values
1339 .iter()
1340 .any(|kv| kv.content.contains("host=localhost")));
1341 assert!(key_values
1342 .iter()
1343 .any(|kv| kv.content.contains("level=info")));
1344 }
1345
/// Checks that `ConfigParser` in properties mode yields chunks carrying the
/// dotted `key=value` entries of the input document.
#[test]
fn test_properties_config_parser() {
    // Six property lines interleaved with two `#` comment lines.
    let properties_source = r#"# Application configuration
database.host=localhost
database.port=5432
database.name=myapp

# Logging configuration
logging.level=info
logging.file=/var/log/app.log

debug=true
"#;

    let chunks = ConfigParser::new()
        .parse(
            Path::new("app.properties"),
            properties_source,
            ConfigFormat::Properties,
        )
        .unwrap();
    assert!(!chunks.is_empty(), "Should extract chunks from properties");

    // Keep only the chunks whose text looks like an assignment.
    let entries: Vec<_> = chunks
        .iter()
        .filter(|chunk| chunk.content.contains('='))
        .collect();
    assert!(entries.len() >= 5, "Should find multiple properties");

    // True when at least one assignment chunk contains the given text.
    let has_entry = |needle: &str| entries.iter().any(|entry| entry.content.contains(needle));
    assert!(has_entry("database.host=localhost"));
    assert!(has_entry("logging.level=info"));
}
1384
/// Checks that `ConfigParser` in env mode yields exactly one `=`-bearing chunk
/// per variable in the input.
#[test]
fn test_env_config_parser() {
    // Five variable definitions, one per line.
    let dotenv_source = r#"DATABASE_HOST=localhost
DATABASE_PORT=5432
DATABASE_NAME=myapp
DEBUG=true
SECRET_KEY=abc123xyz
"#;

    let chunks = ConfigParser::new()
        .parse(Path::new(".env"), dotenv_source, ConfigFormat::Env)
        .unwrap();
    assert!(!chunks.is_empty(), "Should extract chunks from env file");

    let assignments: Vec<_> = chunks
        .iter()
        .filter(|chunk| chunk.content.contains('='))
        .collect();
    assert_eq!(assignments.len(), 5, "Should find 5 environment variables");

    // True when at least one assignment chunk contains the given text.
    let defines = |needle: &str| {
        assignments
            .iter()
            .any(|assignment| assignment.content.contains(needle))
    };
    assert!(defines("DATABASE_HOST=localhost"));
    assert!(defines("DEBUG=true"));
}
1414
/// Checks that `ConfigParser` in XML mode yields chunks whose content equals the
/// text found inside leaf tags.
#[test]
fn test_xml_config_parser() {
    let xml_source = r#"<configuration>
 <database>
 <host>localhost</host>
 <port>5432</port>
 <name>myapp</name>
 </database>
 <features>
 <auth>true</auth>
 <logging>true</logging>
 </features>
 <debug>true</debug>
</configuration>"#;

    let chunks = ConfigParser::new()
        .parse(Path::new("config.xml"), xml_source, ConfigFormat::Xml)
        .unwrap();
    assert!(!chunks.is_empty(), "Should extract chunks from XML");

    // Ignore chunks that are empty or whitespace-only.
    let non_blank: Vec<_> = chunks
        .iter()
        .filter(|chunk| !chunk.content.trim().is_empty())
        .collect();
    assert!(!non_blank.is_empty(), "Should find tag contents");

    // Each of these leaf values must appear as the exact content of some chunk.
    for expected in ["localhost", "5432", "true"] {
        assert!(non_blank.iter().any(|tag| tag.content == expected));
    }
}
1448
/// Checks that `TextParser` splits plain text into one chunk per blank-line
/// separated paragraph and tags each chunk with paragraph metadata.
#[test]
fn test_text_parser_paragraphs() {
    // Three paragraphs of two, one and three lines, separated by blank lines.
    let text = r#"This is the first paragraph.
It has multiple lines.

This is the second paragraph.

This is the third paragraph.
It also has multiple lines.
And even more lines."#;

    let chunks = TextParser::new()
        .parse(Path::new("document.txt"), text, DocumentFormat::PlainText)
        .unwrap();
    assert_eq!(chunks.len(), 3, "Should find 3 paragraphs");

    // Chunks must come back in document order.
    let expected_snippets = ["first paragraph", "second paragraph", "third paragraph"];
    for (chunk, snippet) in chunks.iter().zip(expected_snippets) {
        assert!(chunk.content.contains(snippet));
    }

    for chunk in &chunks {
        let metadata = chunk.metadata.as_object().unwrap();

        let element_type = metadata.get("element_type").and_then(|v| v.as_str());
        assert_eq!(element_type.unwrap(), "paragraph");

        let line_count = metadata
            .get("line_count")
            .and_then(|v| v.as_u64())
            .unwrap();
        assert!(line_count >= 1);
    }
}
1485
/// Checks that malformed JSON does not make `ConfigParser::parse` fail: the whole
/// input comes back as a single chunk flagged with `parse_error`.
#[test]
fn test_invalid_json_handling() {
    let malformed = "{ invalid json content here";

    let chunks = ConfigParser::new()
        .parse(Path::new("bad.json"), malformed, ConfigFormat::Json)
        .unwrap();
    assert_eq!(
        chunks.len(),
        1,
        "Should create a single chunk for invalid JSON"
    );

    // The fallback chunk carries the raw input unchanged.
    let fallback = &chunks[0];
    assert_eq!(fallback.content, malformed);

    let metadata = fallback.metadata.as_object().unwrap();
    let parse_error = metadata.get("parse_error").and_then(|v| v.as_bool());
    assert!(parse_error.unwrap());
    let config_type = metadata.get("config_type").and_then(|v| v.as_str());
    assert_eq!(config_type.unwrap(), "json");
}
1509
/// Checks that parsing an empty Markdown file succeeds and yields a node with no
/// chunks and a zero file size.
#[test]
fn test_empty_content_handling() {
    let node = DocumentParser::new()
        .parse_file(Path::new("empty.md"), "")
        .unwrap();

    assert_eq!(
        node.chunks.len(),
        0,
        "Empty content should produce no chunks"
    );
    assert_eq!(node.file_size, 0);
}
1524
/// Checks that a Markdown document with 100 header/paragraph pairs is parsed in
/// full: at least 100 chunks overall and at least 100 header chunks.
#[test]
fn test_large_content_handling() {
    // Generate 100 numbered sections, each a header followed by one paragraph.
    let content: String = (0..100)
        .map(|i| format!("# Header {i}\n\nThis is paragraph {i} with some content.\n\n"))
        .collect();

    let node = DocumentParser::new()
        .parse_file(Path::new("large.md"), &content)
        .unwrap();

    assert!(node.chunks.len() >= 100, "Should handle large content");
    assert_eq!(node.file_size, content.len());

    // Count chunks whose metadata marks them as headers; chunks with
    // non-object metadata or no `element_type` are not counted.
    let header_count = node
        .chunks
        .iter()
        .filter(|chunk| {
            chunk
                .metadata
                .as_object()
                .and_then(|metadata| metadata.get("element_type"))
                .and_then(|kind| kind.as_str())
                == Some("header")
        })
        .count();

    assert!(header_count >= 100, "Should find many headers");
}
1557
/// Checks the span invariants of every chunk `MarkdownParser` produces for a
/// small document: non-empty byte range, ordered lines, 1-indexed start column,
/// and an end offset that stays within the input.
#[test]
fn test_content_span_calculation() {
    let parser = MarkdownParser::new();
    let content = "# Title\nSome content.";

    let chunks = parser.parse(Path::new("test.md"), content).unwrap();

    // Without this guard the loop below would pass vacuously if the parser
    // returned no chunks, and the span checks would never run.
    assert!(
        !chunks.is_empty(),
        "Should produce at least one chunk to validate spans"
    );

    for chunk in &chunks {
        let span = &chunk.span;
        assert!(span.start_byte < span.end_byte, "Start should be before end");
        assert!(
            span.start_line <= span.end_line,
            "Start line should be <= end line"
        );
        assert!(span.start_column >= 1, "Column should be 1-indexed");
        assert!(
            span.end_byte <= content.len(),
            "End should not exceed content length"
        );
    }
}
1581}