1use super::{ConfigFormat, ContentChunk, ContentNode, ContentType, DocumentFormat};
7use crate::ast::Span;
8use anyhow::{anyhow, Result};
9use regex::Regex;
10use serde_json::Value;
11use std::path::Path;
12
/// Dispatches parsing of documentation, configuration, and plain-text files
/// to the matching format-specific sub-parser.
pub struct DocumentParser {
    /// Handles Markdown documents (headers, fenced code blocks, paragraphs).
    markdown_parser: MarkdownParser,
    /// Handles configuration formats (JSON, YAML, TOML, INI, properties, env, XML).
    config_parser: ConfigParser,
    /// Handles plain text and other line-oriented document formats.
    text_parser: TextParser,
}
22
23impl DocumentParser {
24 pub fn new() -> Self {
26 Self {
27 markdown_parser: MarkdownParser::new(),
28 config_parser: ConfigParser::new(),
29 text_parser: TextParser::new(),
30 }
31 }
32
33 pub fn parse_file(&self, file_path: &Path, content: &str) -> Result<ContentNode> {
35 let content_type = self.detect_content_type(file_path)?;
36 let mut node = ContentNode::new(file_path.to_path_buf(), content_type.clone());
37
38 let chunks = match content_type {
39 ContentType::Documentation { format } => match format {
40 DocumentFormat::Markdown => self.markdown_parser.parse(file_path, content)?,
41 DocumentFormat::PlainText
42 | DocumentFormat::RestructuredText
43 | DocumentFormat::AsciiDoc
44 | DocumentFormat::Html => self.text_parser.parse(file_path, content, format)?,
45 },
46 ContentType::Configuration { format } => {
47 self.config_parser.parse(file_path, content, format)?
48 }
49 ContentType::PlainText => {
50 self.text_parser
51 .parse(file_path, content, DocumentFormat::PlainText)?
52 }
53 _ => return Err(anyhow!("Unsupported content type for document parser")),
54 };
55
56 for chunk in chunks {
57 node.add_chunk(chunk);
58 }
59 node.file_size = content.len();
60
61 Ok(node)
62 }
63
64 fn detect_content_type(&self, file_path: &Path) -> Result<ContentType> {
66 if let Some(file_name) = file_path.file_name().and_then(|n| n.to_str()) {
68 if file_name == ".env" {
69 return Ok(ContentType::Configuration {
70 format: ConfigFormat::Env,
71 });
72 }
73 }
74
75 let extension = file_path
76 .extension()
77 .and_then(|ext| ext.to_str())
78 .unwrap_or("")
79 .to_lowercase();
80
81 match extension.as_str() {
82 "md" | "markdown" => Ok(ContentType::Documentation {
83 format: DocumentFormat::Markdown,
84 }),
85 "rst" => Ok(ContentType::Documentation {
86 format: DocumentFormat::RestructuredText,
87 }),
88 "adoc" | "asciidoc" => Ok(ContentType::Documentation {
89 format: DocumentFormat::AsciiDoc,
90 }),
91 "html" | "htm" => Ok(ContentType::Documentation {
92 format: DocumentFormat::Html,
93 }),
94 "txt" | "text" => Ok(ContentType::Documentation {
95 format: DocumentFormat::PlainText,
96 }),
97 "json" => Ok(ContentType::Configuration {
98 format: ConfigFormat::Json,
99 }),
100 "yaml" | "yml" => Ok(ContentType::Configuration {
101 format: ConfigFormat::Yaml,
102 }),
103 "toml" => Ok(ContentType::Configuration {
104 format: ConfigFormat::Toml,
105 }),
106 "ini" => Ok(ContentType::Configuration {
107 format: ConfigFormat::Ini,
108 }),
109 "properties" => Ok(ContentType::Configuration {
110 format: ConfigFormat::Properties,
111 }),
112 "env" => Ok(ContentType::Configuration {
113 format: ConfigFormat::Env,
114 }),
115 "xml" => Ok(ContentType::Configuration {
116 format: ConfigFormat::Xml,
117 }),
118 _ => Ok(ContentType::PlainText),
119 }
120 }
121}
122
123impl Default for DocumentParser {
124 fn default() -> Self {
125 Self::new()
126 }
127}
128
/// Extracts headers, fenced code blocks, and paragraphs from Markdown text
/// using pre-compiled regular expressions.
pub struct MarkdownParser {
    /// Matches ATX headers (`#` through `######`) at the start of a line.
    header_regex: Regex,
    /// Matches ``` fenced code blocks with an optional language tag.
    code_block_regex: Regex,
    /// Matches inline code spans; compiled but not used by `parse` yet.
    #[allow(dead_code)]
    inline_code_regex: Regex,
    /// Matches `[text](url)` links; compiled but not used by `parse` yet.
    #[allow(dead_code)]
    link_regex: Regex,
    /// Matches `-`/`*`/`+` list items; compiled but not used by `parse` yet.
    #[allow(dead_code)]
    list_regex: Regex,
}
145
146impl MarkdownParser {
147 pub fn new() -> Self {
149 Self {
150 header_regex: Regex::new(r"(?m)^(#{1,6})\s+(.+)$").unwrap(),
151 code_block_regex: Regex::new(r"```(\w+)?\n([\s\S]*?)\n```").unwrap(),
152 inline_code_regex: Regex::new(r"`([^`]+)`").unwrap(),
153 link_regex: Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap(),
154 list_regex: Regex::new(r"(?m)^[\s]*[-*+]\s+(.+)$").unwrap(),
155 }
156 }
157
158 pub fn parse(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
160 let mut chunks = Vec::new();
161 let lines: Vec<&str> = content.lines().collect();
162 let mut _current_line = 0;
163 let mut chunk_index = 0;
164
165 for (line_idx, line) in lines.iter().enumerate() {
167 if let Some(captures) = self.header_regex.captures(line) {
168 let level = captures.get(1).unwrap().as_str().len();
169 let header_text = captures.get(2).unwrap().as_str();
170
171 let span = self.calculate_line_span(line_idx, line, content);
172 let chunk = ContentChunk::new(
173 file_path.to_path_buf(),
174 ContentType::Documentation {
175 format: DocumentFormat::Markdown,
176 },
177 header_text.to_string(),
178 span,
179 chunk_index,
180 )
181 .with_metadata(serde_json::json!({
182 "header_level": level,
183 "element_type": "header"
184 }));
185
186 chunks.push(chunk);
187 chunk_index += 1;
188 }
189 }
190
191 for captures in self.code_block_regex.captures_iter(content) {
193 let language = captures.get(1).map(|m| m.as_str()).unwrap_or("text");
194 let code_content = captures.get(2).unwrap().as_str();
195 let full_match = captures.get(0).unwrap();
196
197 let span = self.calculate_match_span(&full_match, content);
198 let chunk = ContentChunk::new(
199 file_path.to_path_buf(),
200 ContentType::Documentation {
201 format: DocumentFormat::Markdown,
202 },
203 code_content.to_string(),
204 span,
205 chunk_index,
206 )
207 .with_metadata(serde_json::json!({
208 "language": language,
209 "element_type": "code_block"
210 }));
211
212 chunks.push(chunk);
213 chunk_index += 1;
214 }
215
216 let mut paragraph_start = 0;
218 let mut in_paragraph = false;
219 let mut paragraph_lines = Vec::new();
220
221 for (line_idx, line) in lines.iter().enumerate() {
222 let line_trimmed = line.trim();
223
224 if self.header_regex.is_match(line)
226 || line_trimmed.starts_with("```")
227 || line_trimmed.is_empty()
228 {
229 if in_paragraph && !paragraph_lines.is_empty() {
231 let paragraph_text = paragraph_lines.join("\n");
232 let span =
233 self.calculate_paragraph_span(paragraph_start, line_idx - 1, content);
234
235 let chunk = ContentChunk::new(
236 file_path.to_path_buf(),
237 ContentType::Documentation {
238 format: DocumentFormat::Markdown,
239 },
240 paragraph_text,
241 span,
242 chunk_index,
243 )
244 .with_metadata(serde_json::json!({
245 "element_type": "paragraph"
246 }));
247
248 chunks.push(chunk);
249 chunk_index += 1;
250 }
251
252 in_paragraph = false;
253 paragraph_lines.clear();
254 continue;
255 }
256
257 if !in_paragraph {
259 in_paragraph = true;
260 paragraph_start = line_idx;
261 }
262 paragraph_lines.push(line_trimmed);
263 }
264
265 if in_paragraph && !paragraph_lines.is_empty() {
267 let paragraph_text = paragraph_lines.join("\n");
268 let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, content);
269
270 let chunk = ContentChunk::new(
271 file_path.to_path_buf(),
272 ContentType::Documentation {
273 format: DocumentFormat::Markdown,
274 },
275 paragraph_text,
276 span,
277 chunk_index,
278 )
279 .with_metadata(serde_json::json!({
280 "element_type": "paragraph"
281 }));
282
283 chunks.push(chunk);
284 }
285
286 Ok(chunks)
287 }
288
289 fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
291 let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
292 let start_byte = lines_before;
293 let end_byte = start_byte + line.len();
294
295 Span::new(
296 start_byte,
297 end_byte,
298 line_idx + 1,
299 line_idx + 1,
300 1,
301 line.len() + 1,
302 )
303 }
304
305 fn calculate_match_span(&self, match_obj: ®ex::Match, content: &str) -> Span {
307 let start_byte = match_obj.start();
308 let end_byte = match_obj.end();
309
310 let content_before = &content[..start_byte];
312 let start_line = content_before.lines().count();
313 let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
314
315 let match_content = match_obj.as_str();
317 let lines_in_match = match_content.lines().count();
318 let end_line = start_line + lines_in_match.saturating_sub(1);
319 let end_column = if lines_in_match > 1 {
320 match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
321 } else {
322 start_column + match_content.len()
323 };
324
325 Span::new(
326 start_byte,
327 end_byte,
328 start_line.max(1),
329 end_line.max(1),
330 start_column,
331 end_column,
332 )
333 }
334
335 fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, content: &str) -> Span {
337 let lines: Vec<&str> = content.lines().collect();
338 let start_byte: usize = lines
339 .iter()
340 .take(start_line)
341 .map(|l| l.len() + 1)
342 .sum::<usize>();
343 let end_byte: usize = lines
344 .iter()
345 .take(end_line + 1)
346 .map(|l| l.len() + 1)
347 .sum::<usize>()
348 - 1;
349
350 Span::new(
351 start_byte,
352 end_byte,
353 start_line + 1,
354 end_line + 1,
355 1,
356 lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
357 )
358 }
359}
360
361impl Default for MarkdownParser {
362 fn default() -> Self {
363 Self::new()
364 }
365}
366
/// Parses configuration files (JSON, YAML, TOML, INI, properties, env, XML)
/// into searchable key/value chunks. Stateless.
pub struct ConfigParser;
369
370impl ConfigParser {
371 pub fn new() -> Self {
373 Self
374 }
375
376 pub fn parse(
378 &self,
379 file_path: &Path,
380 content: &str,
381 format: ConfigFormat,
382 ) -> Result<Vec<ContentChunk>> {
383 match format {
384 ConfigFormat::Json => self.parse_json(file_path, content),
385 ConfigFormat::Yaml => self.parse_yaml(file_path, content),
386 ConfigFormat::Toml => self.parse_toml(file_path, content),
387 ConfigFormat::Ini => self.parse_ini(file_path, content),
388 ConfigFormat::Properties => self.parse_properties(file_path, content),
389 ConfigFormat::Env => self.parse_env(file_path, content),
390 ConfigFormat::Xml => self.parse_xml(file_path, content),
391 }
392 }
393
394 fn parse_json(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
396 let mut chunks = Vec::new();
397
398 match serde_json::from_str::<Value>(content) {
400 Ok(value) => {
401 self.extract_json_values(&value, file_path, content, &mut chunks, 0, "");
403 }
404 Err(_) => {
405 chunks.push(
407 ContentChunk::new(
408 file_path.to_path_buf(),
409 ContentType::Configuration {
410 format: ConfigFormat::Json,
411 },
412 content.to_string(),
413 Span::new(
414 0,
415 content.len(),
416 1,
417 content.lines().count(),
418 1,
419 content.lines().last().map(|l| l.len()).unwrap_or(0),
420 ),
421 0,
422 )
423 .with_metadata(serde_json::json!({
424 "parse_error": true,
425 "config_type": "json"
426 })),
427 );
428 }
429 }
430
431 Ok(chunks)
432 }
433
434 #[allow(clippy::only_used_in_recursion)]
436 fn extract_json_values(
437 &self,
438 value: &Value,
439 file_path: &Path,
440 content: &str,
441 chunks: &mut Vec<ContentChunk>,
442 chunk_index: usize,
443 key_path: &str,
444 ) {
445 match value {
446 Value::Object(map) => {
447 for (key, val) in map {
448 let new_path = if key_path.is_empty() {
449 key.clone()
450 } else {
451 format!("{}.{}", key_path, key)
452 };
453 self.extract_json_values(
454 val,
455 file_path,
456 content,
457 chunks,
458 chunks.len(),
459 &new_path,
460 );
461 }
462 }
463 Value::Array(arr) => {
464 for (index, val) in arr.iter().enumerate() {
465 let new_path = format!("{}[{}]", key_path, index);
466 self.extract_json_values(
467 val,
468 file_path,
469 content,
470 chunks,
471 chunks.len(),
472 &new_path,
473 );
474 }
475 }
476 Value::String(_) | Value::Number(_) | Value::Bool(_) => {
477 let value_str = match value {
479 Value::String(s) => s.clone(),
480 _ => value.to_string(),
481 };
482
483 let searchable_content = if key_path.is_empty() {
485 value_str.clone()
486 } else {
487 format!("{}: {}", key_path, value_str)
488 };
489
490 if let Some(position) = content.find(&value_str) {
492 let lines_before = content[..position].lines().count();
493 let line_start = content[..position].rfind('\n').map(|i| i + 1).unwrap_or(0);
494 let column = position - line_start + 1;
495
496 let span = Span::new(
497 position,
498 position + value_str.len(),
499 lines_before.max(1),
500 lines_before.max(1),
501 column,
502 column + value_str.len(),
503 );
504
505 let chunk = ContentChunk::new(
506 file_path.to_path_buf(),
507 ContentType::Configuration {
508 format: ConfigFormat::Json,
509 },
510 searchable_content,
511 span,
512 chunk_index,
513 )
514 .with_metadata(serde_json::json!({
515 "key_path": key_path,
516 "value": value_str,
517 "value_type": match value {
518 Value::String(_) => "string",
519 Value::Number(_) => "number",
520 Value::Bool(_) => "boolean",
521 _ => "unknown"
522 },
523 "config_type": "json"
524 }));
525
526 chunks.push(chunk);
527 }
528 }
529 Value::Null => {} }
531 }
532
533 fn parse_yaml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
535 let mut chunks = Vec::new();
537 let lines: Vec<&str> = content.lines().collect();
538
539 for (line_idx, line) in lines.iter().enumerate() {
540 let trimmed = line.trim();
541 if trimmed.is_empty() || trimmed.starts_with('#') {
542 continue;
543 }
544
545 if let Some(colon_pos) = trimmed.find(':') {
547 let key = trimmed[..colon_pos].trim();
548 let value = trimmed[colon_pos + 1..].trim();
549
550 if !value.is_empty() {
551 let span = self.calculate_line_span(line_idx, line, content);
552 let chunk = ContentChunk::new(
553 file_path.to_path_buf(),
554 ContentType::Configuration {
555 format: ConfigFormat::Yaml,
556 },
557 format!("{}: {}", key, value),
558 span,
559 chunks.len(),
560 )
561 .with_metadata(serde_json::json!({
562 "key": key,
563 "value": value,
564 "config_type": "yaml"
565 }));
566
567 chunks.push(chunk);
568 }
569 }
570 }
571
572 Ok(chunks)
573 }
574
575 fn parse_toml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
577 let mut chunks = Vec::new();
579 let lines: Vec<&str> = content.lines().collect();
580
581 for (line_idx, line) in lines.iter().enumerate() {
582 let trimmed = line.trim();
583 if trimmed.is_empty() || trimmed.starts_with('#') {
584 continue;
585 }
586
587 if trimmed.starts_with('[') && trimmed.ends_with(']') {
589 let section = &trimmed[1..trimmed.len() - 1];
590 let span = self.calculate_line_span(line_idx, line, content);
591 let chunk = ContentChunk::new(
592 file_path.to_path_buf(),
593 ContentType::Configuration {
594 format: ConfigFormat::Toml,
595 },
596 section.to_string(),
597 span,
598 chunks.len(),
599 )
600 .with_metadata(serde_json::json!({
601 "element_type": "section",
602 "section_name": section,
603 "config_type": "toml"
604 }));
605
606 chunks.push(chunk);
607 continue;
608 }
609
610 if let Some(eq_pos) = trimmed.find('=') {
612 let key = trimmed[..eq_pos].trim();
613 let value = trimmed[eq_pos + 1..].trim();
614
615 let span = self.calculate_line_span(line_idx, line, content);
616 let chunk = ContentChunk::new(
617 file_path.to_path_buf(),
618 ContentType::Configuration {
619 format: ConfigFormat::Toml,
620 },
621 format!("{} = {}", key, value),
622 span,
623 chunks.len(),
624 )
625 .with_metadata(serde_json::json!({
626 "key": key,
627 "value": value,
628 "config_type": "toml"
629 }));
630
631 chunks.push(chunk);
632 }
633 }
634
635 Ok(chunks)
636 }
637
638 fn parse_ini(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
640 self.parse_key_value_format(file_path, content, ConfigFormat::Ini, "ini")
642 }
643
644 fn parse_properties(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
646 self.parse_key_value_format(file_path, content, ConfigFormat::Properties, "properties")
647 }
648
649 fn parse_env(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
651 self.parse_key_value_format(file_path, content, ConfigFormat::Env, "env")
652 }
653
654 fn parse_xml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
656 let tag_regex = Regex::new(r"<([^/>]+)>([^<]+)</[^>]+>").unwrap();
658 let mut chunks = Vec::new();
659
660 for (idx, captures) in tag_regex.captures_iter(content).enumerate() {
661 let tag_name = captures.get(1).unwrap().as_str();
662 let tag_content = captures.get(2).unwrap().as_str().trim();
663
664 if !tag_content.is_empty() {
665 let full_match = captures.get(0).unwrap();
666 let span = self.calculate_match_span(&full_match, content);
667
668 let chunk = ContentChunk::new(
669 file_path.to_path_buf(),
670 ContentType::Configuration {
671 format: ConfigFormat::Xml,
672 },
673 tag_content.to_string(),
674 span,
675 idx,
676 )
677 .with_metadata(serde_json::json!({
678 "tag_name": tag_name,
679 "config_type": "xml"
680 }));
681
682 chunks.push(chunk);
683 }
684 }
685
686 Ok(chunks)
687 }
688
689 fn parse_key_value_format(
691 &self,
692 file_path: &Path,
693 content: &str,
694 format: ConfigFormat,
695 format_name: &str,
696 ) -> Result<Vec<ContentChunk>> {
697 let mut chunks = Vec::new();
698 let lines: Vec<&str> = content.lines().collect();
699
700 for (line_idx, line) in lines.iter().enumerate() {
701 let trimmed = line.trim();
702 if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with(';') {
703 continue;
704 }
705
706 if let Some(eq_pos) = trimmed.find('=') {
708 let key = trimmed[..eq_pos].trim();
709 let value = trimmed[eq_pos + 1..].trim();
710
711 let span = self.calculate_line_span(line_idx, line, content);
712 let chunk = ContentChunk::new(
713 file_path.to_path_buf(),
714 ContentType::Configuration {
715 format: format.clone(),
716 },
717 format!("{}={}", key, value),
718 span,
719 chunks.len(),
720 )
721 .with_metadata(serde_json::json!({
722 "key": key,
723 "value": value,
724 "config_type": format_name
725 }));
726
727 chunks.push(chunk);
728 }
729 }
730
731 Ok(chunks)
732 }
733
734 fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
736 let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
737 let start_byte = lines_before;
738 let end_byte = start_byte + line.len();
739
740 Span::new(
741 start_byte,
742 end_byte,
743 line_idx + 1,
744 line_idx + 1,
745 1,
746 line.len() + 1,
747 )
748 }
749
750 fn calculate_match_span(&self, match_obj: ®ex::Match, content: &str) -> Span {
752 let start_byte = match_obj.start();
753 let end_byte = match_obj.end();
754
755 let content_before = &content[..start_byte];
756 let start_line = content_before.lines().count();
757 let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;
758
759 let match_content = match_obj.as_str();
760 let lines_in_match = match_content.lines().count();
761 let end_line = start_line + lines_in_match.saturating_sub(1);
762 let end_column = if lines_in_match > 1 {
763 match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
764 } else {
765 start_column + match_content.len()
766 };
767
768 Span::new(
769 start_byte,
770 end_byte,
771 start_line.max(1),
772 end_line.max(1),
773 start_column,
774 end_column,
775 )
776 }
777}
778
779impl Default for ConfigParser {
780 fn default() -> Self {
781 Self::new()
782 }
783}
784
/// Splits plain-text documents into paragraph chunks separated by blank
/// lines. Stateless.
pub struct TextParser;
787
788impl TextParser {
789 pub fn new() -> Self {
791 Self
792 }
793
794 pub fn parse(
796 &self,
797 file_path: &Path,
798 content: &str,
799 format: DocumentFormat,
800 ) -> Result<Vec<ContentChunk>> {
801 let mut chunks = Vec::new();
802 let lines: Vec<&str> = content.lines().collect();
803
804 let mut paragraph_start = 0;
805 let mut paragraph_lines = Vec::new();
806 let mut chunk_index = 0;
807
808 for (line_idx, line) in lines.iter().enumerate() {
809 let trimmed = line.trim();
810
811 if trimmed.is_empty() {
812 if !paragraph_lines.is_empty() {
814 let paragraph_text = paragraph_lines.join("\n");
815 let span = self.calculate_paragraph_span(paragraph_start, line_idx - 1, &lines);
816
817 let chunk = ContentChunk::new(
818 file_path.to_path_buf(),
819 ContentType::Documentation {
820 format: format.clone(),
821 },
822 paragraph_text,
823 span,
824 chunk_index,
825 )
826 .with_metadata(serde_json::json!({
827 "element_type": "paragraph",
828 "line_count": paragraph_lines.len()
829 }));
830
831 chunks.push(chunk);
832 chunk_index += 1;
833 paragraph_lines.clear();
834 }
835 continue;
836 }
837
838 if paragraph_lines.is_empty() {
840 paragraph_start = line_idx;
841 }
842 paragraph_lines.push(trimmed);
843 }
844
845 if !paragraph_lines.is_empty() {
847 let paragraph_text = paragraph_lines.join("\n");
848 let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, &lines);
849
850 let chunk = ContentChunk::new(
851 file_path.to_path_buf(),
852 ContentType::Documentation { format },
853 paragraph_text,
854 span,
855 chunk_index,
856 )
857 .with_metadata(serde_json::json!({
858 "element_type": "paragraph",
859 "line_count": paragraph_lines.len()
860 }));
861
862 chunks.push(chunk);
863 }
864
865 Ok(chunks)
866 }
867
868 fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, lines: &[&str]) -> Span {
870 let start_byte: usize = lines
871 .iter()
872 .take(start_line)
873 .map(|l| l.len() + 1)
874 .sum::<usize>();
875 let end_byte: usize = lines
876 .iter()
877 .take(end_line + 1)
878 .map(|l| l.len() + 1)
879 .sum::<usize>()
880 - 1;
881
882 Span::new(
883 start_byte,
884 end_byte,
885 start_line + 1,
886 end_line + 1,
887 1,
888 lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
889 )
890 }
891}
892
893impl Default for TextParser {
894 fn default() -> Self {
895 Self::new()
896 }
897}
898
899#[cfg(test)]
900mod tests {
901 use super::*;
902
903 #[test]
904 fn test_document_parser_creation() {
905 let parser = DocumentParser::new();
906 assert!(true);
908 }
909
910 #[test]
911 fn test_content_type_detection() {
912 let parser = DocumentParser::new();
913
914 let test_cases = vec![
915 (
916 "test.md",
917 ContentType::Documentation {
918 format: DocumentFormat::Markdown,
919 },
920 ),
921 (
922 "README.markdown",
923 ContentType::Documentation {
924 format: DocumentFormat::Markdown,
925 },
926 ),
927 (
928 "doc.rst",
929 ContentType::Documentation {
930 format: DocumentFormat::RestructuredText,
931 },
932 ),
933 (
934 "manual.adoc",
935 ContentType::Documentation {
936 format: DocumentFormat::AsciiDoc,
937 },
938 ),
939 (
940 "page.html",
941 ContentType::Documentation {
942 format: DocumentFormat::Html,
943 },
944 ),
945 (
946 "notes.txt",
947 ContentType::Documentation {
948 format: DocumentFormat::PlainText,
949 },
950 ),
951 (
952 "config.json",
953 ContentType::Configuration {
954 format: ConfigFormat::Json,
955 },
956 ),
957 (
958 "config.yaml",
959 ContentType::Configuration {
960 format: ConfigFormat::Yaml,
961 },
962 ),
963 (
964 "config.yml",
965 ContentType::Configuration {
966 format: ConfigFormat::Yaml,
967 },
968 ),
969 (
970 "Cargo.toml",
971 ContentType::Configuration {
972 format: ConfigFormat::Toml,
973 },
974 ),
975 (
976 "settings.ini",
977 ContentType::Configuration {
978 format: ConfigFormat::Ini,
979 },
980 ),
981 (
982 "app.properties",
983 ContentType::Configuration {
984 format: ConfigFormat::Properties,
985 },
986 ),
987 (
988 ".env",
989 ContentType::Configuration {
990 format: ConfigFormat::Env,
991 },
992 ),
993 (
994 "config.xml",
995 ContentType::Configuration {
996 format: ConfigFormat::Xml,
997 },
998 ),
999 ("unknown.xyz", ContentType::PlainText),
1000 ];
1001
1002 for (filename, expected_type) in test_cases {
1003 let path = Path::new(filename);
1004 let detected_type = parser.detect_content_type(path).unwrap();
1005 assert_eq!(
1006 std::mem::discriminant(&detected_type),
1007 std::mem::discriminant(&expected_type),
1008 "Failed for file: {}",
1009 filename
1010 );
1011 }
1012 }
1013
1014 #[test]
1015 fn test_markdown_parser_headers() {
1016 let parser = MarkdownParser::new();
1017 let content = r#"# Main Title
1018Some content here.
1019
1020## Secondary Title
1021More content.
1022
1023### Subsection
1024Even more content.
1025
1026#### Level 4
1027Content at level 4.
1028
1029##### Level 5
1030Content at level 5.
1031
1032###### Level 6
1033Content at level 6."#;
1034
1035 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1036
1037 let headers: Vec<_> = chunks
1039 .iter()
1040 .filter(|chunk| {
1041 if let Some(metadata) = chunk.metadata.as_object() {
1042 metadata.get("element_type").and_then(|v| v.as_str()) == Some("header")
1043 } else {
1044 false
1045 }
1046 })
1047 .collect();
1048
1049 assert_eq!(headers.len(), 6, "Should find 6 headers");
1050
1051 let header_levels: Vec<_> = headers
1053 .iter()
1054 .filter_map(|chunk| {
1055 chunk
1056 .metadata
1057 .as_object()
1058 .and_then(|m| m.get("header_level"))
1059 .and_then(|v| v.as_u64())
1060 })
1061 .collect();
1062
1063 assert_eq!(header_levels, vec![1, 2, 3, 4, 5, 6]);
1064 assert_eq!(headers[0].content, "Main Title");
1065 assert_eq!(headers[1].content, "Secondary Title");
1066 assert_eq!(headers[2].content, "Subsection");
1067 }
1068
1069 #[test]
1070 fn test_markdown_parser_code_blocks() {
1071 let parser = MarkdownParser::new();
1072 let content = r#"Here is some Python code:
1073
1074```python
1075def hello_world():
1076 print("Hello, World!")
1077 return "success"
1078```
1079
1080And here is some JavaScript:
1081
1082```javascript
1083function greet(name) {
1084 console.log(`Hello, ${name}!`);
1085}
1086```
1087
1088And a generic code block:
1089
1090```
1091generic code here
1092no language specified
1093```"#;
1094
1095 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1096
1097 let code_blocks: Vec<_> = chunks
1098 .iter()
1099 .filter(|chunk| {
1100 if let Some(metadata) = chunk.metadata.as_object() {
1101 metadata.get("element_type").and_then(|v| v.as_str()) == Some("code_block")
1102 } else {
1103 false
1104 }
1105 })
1106 .collect();
1107
1108 assert_eq!(code_blocks.len(), 3, "Should find 3 code blocks");
1109
1110 assert!(code_blocks[0].content.contains("def hello_world"));
1112 assert!(code_blocks[0].content.contains("print(\"Hello, World!\")"));
1113 let python_lang = code_blocks[0]
1114 .metadata
1115 .as_object()
1116 .unwrap()
1117 .get("language")
1118 .unwrap()
1119 .as_str()
1120 .unwrap();
1121 assert_eq!(python_lang, "python");
1122
1123 assert!(code_blocks[1].content.contains("function greet"));
1125 let js_lang = code_blocks[1]
1126 .metadata
1127 .as_object()
1128 .unwrap()
1129 .get("language")
1130 .unwrap()
1131 .as_str()
1132 .unwrap();
1133 assert_eq!(js_lang, "javascript");
1134
1135 assert!(code_blocks[2].content.contains("generic code here"));
1137 let generic_lang = code_blocks[2]
1138 .metadata
1139 .as_object()
1140 .unwrap()
1141 .get("language")
1142 .unwrap()
1143 .as_str()
1144 .unwrap();
1145 assert_eq!(generic_lang, "text");
1146 }
1147
1148 #[test]
1149 fn test_markdown_parser_paragraphs() {
1150 let parser = MarkdownParser::new();
1151 let content = r#"This is the first paragraph with some content.
1152It spans multiple lines.
1153
1154This is the second paragraph.
1155
1156# A Header
1157
1158This is a paragraph after a header.
1159
1160Another paragraph here."#;
1161
1162 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1163
1164 let paragraphs: Vec<_> = chunks
1165 .iter()
1166 .filter(|chunk| {
1167 if let Some(metadata) = chunk.metadata.as_object() {
1168 metadata.get("element_type").and_then(|v| v.as_str()) == Some("paragraph")
1169 } else {
1170 false
1171 }
1172 })
1173 .collect();
1174
1175 assert!(paragraphs.len() >= 3, "Should find at least 3 paragraphs");
1176 assert!(paragraphs[0].content.contains("first paragraph"));
1177 assert!(paragraphs[1].content.contains("second paragraph"));
1178 }
1179
1180 #[test]
1181 fn test_json_config_parser() {
1182 let parser = ConfigParser::new();
1183 let content = r#"{
1184 "database": {
1185 "host": "localhost",
1186 "port": 5432,
1187 "name": "myapp"
1188 },
1189 "features": ["auth", "logging", "metrics"],
1190 "debug": true,
1191 "version": "1.0.0"
1192}"#;
1193
1194 let chunks = parser
1195 .parse(Path::new("config.json"), content, ConfigFormat::Json)
1196 .unwrap();
1197
1198 assert!(!chunks.is_empty(), "Should extract chunks from JSON");
1199
1200 let string_chunks: Vec<_> = chunks
1202 .iter()
1203 .filter(|chunk| {
1204 if let Some(metadata) = chunk.metadata.as_object() {
1205 metadata.get("value_type").and_then(|v| v.as_str()) == Some("string")
1206 } else {
1207 false
1208 }
1209 })
1210 .collect();
1211
1212 let boolean_chunks: Vec<_> = chunks
1213 .iter()
1214 .filter(|chunk| {
1215 if let Some(metadata) = chunk.metadata.as_object() {
1216 metadata.get("value_type").and_then(|v| v.as_str()) == Some("boolean")
1217 } else {
1218 false
1219 }
1220 })
1221 .collect();
1222
1223 assert!(!string_chunks.is_empty(), "Should find string values");
1224 assert!(!boolean_chunks.is_empty(), "Should find boolean values");
1225 }
1226
1227 #[test]
1228 fn test_yaml_config_parser() {
1229 let parser = ConfigParser::new();
1230 let content = r#"database:
1231 host: localhost
1232 port: 5432
1233 name: myapp
1234
1235features:
1236 - auth
1237 - logging
1238 - metrics
1239
1240debug: true
1241version: "1.0.0"
1242"#;
1243
1244 let chunks = parser
1245 .parse(Path::new("config.yaml"), content, ConfigFormat::Yaml)
1246 .unwrap();
1247
1248 assert!(!chunks.is_empty(), "Should extract chunks from YAML");
1249
1250 let has_database = chunks
1252 .iter()
1253 .any(|chunk| chunk.content.contains("host: localhost"));
1254 let has_debug = chunks
1255 .iter()
1256 .any(|chunk| chunk.content.contains("debug: true"));
1257
1258 assert!(has_database, "Should find database configuration");
1259 assert!(has_debug, "Should find debug setting");
1260 }
1261
1262 #[test]
1263 fn test_toml_config_parser() {
1264 let parser = ConfigParser::new();
1265 let content = r#"[database]
1266host = "localhost"
1267port = 5432
1268name = "myapp"
1269
1270[features]
1271auth = true
1272logging = true
1273metrics = false
1274
1275debug = true
1276version = "1.0.0"
1277"#;
1278
1279 let chunks = parser
1280 .parse(Path::new("Cargo.toml"), content, ConfigFormat::Toml)
1281 .unwrap();
1282
1283 assert!(!chunks.is_empty(), "Should extract chunks from TOML");
1284
1285 let sections: Vec<_> = chunks
1287 .iter()
1288 .filter(|chunk| {
1289 if let Some(metadata) = chunk.metadata.as_object() {
1290 metadata.get("element_type").and_then(|v| v.as_str()) == Some("section")
1291 } else {
1292 false
1293 }
1294 })
1295 .collect();
1296
1297 assert!(sections.len() >= 2, "Should find at least 2 sections");
1298 assert!(sections.iter().any(|s| s.content == "database"));
1299 assert!(sections.iter().any(|s| s.content == "features"));
1300
1301 let key_values: Vec<_> = chunks
1302 .iter()
1303 .filter(|chunk| chunk.content.contains(" = "))
1304 .collect();
1305
1306 assert!(!key_values.is_empty(), "Should find key-value pairs");
1307 }
1308
1309 #[test]
1310 fn test_ini_config_parser() {
1311 let parser = ConfigParser::new();
1312 let content = r#"[database]
1313host=localhost
1314port=5432
1315name=myapp
1316
1317[logging]
1318level=info
1319file=/var/log/app.log
1320
1321debug=true
1322"#;
1323
1324 let chunks = parser
1325 .parse(Path::new("config.ini"), content, ConfigFormat::Ini)
1326 .unwrap();
1327
1328 assert!(!chunks.is_empty(), "Should extract chunks from INI");
1329
1330 let key_values: Vec<_> = chunks
1331 .iter()
1332 .filter(|chunk| chunk.content.contains("="))
1333 .collect();
1334
1335 assert!(
1336 key_values.len() >= 5,
1337 "Should find multiple key-value pairs"
1338 );
1339 assert!(key_values
1340 .iter()
1341 .any(|kv| kv.content.contains("host=localhost")));
1342 assert!(key_values
1343 .iter()
1344 .any(|kv| kv.content.contains("level=info")));
1345 }
1346
1347 #[test]
1348 fn test_properties_config_parser() {
1349 let parser = ConfigParser::new();
1350 let content = r#"# Application configuration
1351database.host=localhost
1352database.port=5432
1353database.name=myapp
1354
1355# Logging configuration
1356logging.level=info
1357logging.file=/var/log/app.log
1358
1359debug=true
1360"#;
1361
1362 let chunks = parser
1363 .parse(
1364 Path::new("app.properties"),
1365 content,
1366 ConfigFormat::Properties,
1367 )
1368 .unwrap();
1369
1370 assert!(!chunks.is_empty(), "Should extract chunks from properties");
1371
1372 let properties: Vec<_> = chunks
1373 .iter()
1374 .filter(|chunk| chunk.content.contains("="))
1375 .collect();
1376
1377 assert!(properties.len() >= 5, "Should find multiple properties");
1378 assert!(properties
1379 .iter()
1380 .any(|p| p.content.contains("database.host=localhost")));
1381 assert!(properties
1382 .iter()
1383 .any(|p| p.content.contains("logging.level=info")));
1384 }
1385
1386 #[test]
1387 fn test_env_config_parser() {
1388 let parser = ConfigParser::new();
1389 let content = r#"DATABASE_HOST=localhost
1390DATABASE_PORT=5432
1391DATABASE_NAME=myapp
1392DEBUG=true
1393SECRET_KEY=abc123xyz
1394"#;
1395
1396 let chunks = parser
1397 .parse(Path::new(".env"), content, ConfigFormat::Env)
1398 .unwrap();
1399
1400 assert!(!chunks.is_empty(), "Should extract chunks from env file");
1401
1402 let env_vars: Vec<_> = chunks
1403 .iter()
1404 .filter(|chunk| chunk.content.contains("="))
1405 .collect();
1406
1407 assert_eq!(env_vars.len(), 5, "Should find 5 environment variables");
1408 assert!(env_vars
1409 .iter()
1410 .any(|var| var.content.contains("DATABASE_HOST=localhost")));
1411 assert!(env_vars
1412 .iter()
1413 .any(|var| var.content.contains("DEBUG=true")));
1414 }
1415
1416 #[test]
1417 fn test_xml_config_parser() {
1418 let parser = ConfigParser::new();
1419 let content = r#"<configuration>
1420 <database>
1421 <host>localhost</host>
1422 <port>5432</port>
1423 <name>myapp</name>
1424 </database>
1425 <features>
1426 <auth>true</auth>
1427 <logging>true</logging>
1428 </features>
1429 <debug>true</debug>
1430</configuration>"#;
1431
1432 let chunks = parser
1433 .parse(Path::new("config.xml"), content, ConfigFormat::Xml)
1434 .unwrap();
1435
1436 assert!(!chunks.is_empty(), "Should extract chunks from XML");
1437
1438 let tag_contents: Vec<_> = chunks
1440 .iter()
1441 .filter(|chunk| !chunk.content.trim().is_empty())
1442 .collect();
1443
1444 assert!(!tag_contents.is_empty(), "Should find tag contents");
1445 assert!(tag_contents.iter().any(|tag| tag.content == "localhost"));
1446 assert!(tag_contents.iter().any(|tag| tag.content == "5432"));
1447 assert!(tag_contents.iter().any(|tag| tag.content == "true"));
1448 }
1449
1450 #[test]
1451 fn test_text_parser_paragraphs() {
1452 let parser = TextParser::new();
1453 let content = r#"This is the first paragraph.
1454It has multiple lines.
1455
1456This is the second paragraph.
1457
1458This is the third paragraph.
1459It also has multiple lines.
1460And even more lines."#;
1461
1462 let chunks = parser
1463 .parse(
1464 Path::new("document.txt"),
1465 content,
1466 DocumentFormat::PlainText,
1467 )
1468 .unwrap();
1469
1470 assert_eq!(chunks.len(), 3, "Should find 3 paragraphs");
1471
1472 assert!(chunks[0].content.contains("first paragraph"));
1473 assert!(chunks[1].content.contains("second paragraph"));
1474 assert!(chunks[2].content.contains("third paragraph"));
1475
1476 for chunk in &chunks {
1478 let metadata = chunk.metadata.as_object().unwrap();
1479 assert_eq!(
1480 metadata.get("element_type").unwrap().as_str().unwrap(),
1481 "paragraph"
1482 );
1483 assert!(metadata.get("line_count").unwrap().as_u64().unwrap() >= 1);
1484 }
1485 }
1486
1487 #[test]
1488 fn test_invalid_json_handling() {
1489 let parser = ConfigParser::new();
1490 let invalid_json = r#"{ invalid json content here"#;
1491
1492 let chunks = parser
1493 .parse(Path::new("bad.json"), invalid_json, ConfigFormat::Json)
1494 .unwrap();
1495
1496 assert_eq!(
1497 chunks.len(),
1498 1,
1499 "Should create a single chunk for invalid JSON"
1500 );
1501 assert_eq!(chunks[0].content, invalid_json);
1502
1503 let metadata = chunks[0].metadata.as_object().unwrap();
1504 assert_eq!(
1505 metadata.get("parse_error").unwrap().as_bool().unwrap(),
1506 true
1507 );
1508 assert_eq!(
1509 metadata.get("config_type").unwrap().as_str().unwrap(),
1510 "json"
1511 );
1512 }
1513
1514 #[test]
1515 fn test_empty_content_handling() {
1516 let parser = DocumentParser::new();
1517
1518 let empty_md = "";
1519 let node = parser.parse_file(Path::new("empty.md"), empty_md).unwrap();
1520
1521 assert_eq!(
1522 node.chunks.len(),
1523 0,
1524 "Empty content should produce no chunks"
1525 );
1526 assert_eq!(node.file_size, 0);
1527 }
1528
1529 #[test]
1530 fn test_large_content_handling() {
1531 let parser = DocumentParser::new();
1532
1533 let mut content = String::new();
1535 for i in 0..100 {
1536 content.push_str(&format!(
1537 "# Header {}\n\nThis is paragraph {} with some content.\n\n",
1538 i, i
1539 ));
1540 }
1541
1542 let node = parser.parse_file(Path::new("large.md"), &content).unwrap();
1543
1544 assert!(node.chunks.len() >= 100, "Should handle large content");
1545 assert_eq!(node.file_size, content.len());
1546
1547 let headers = node
1549 .chunks
1550 .iter()
1551 .filter(|chunk| {
1552 if let Some(metadata) = chunk.metadata.as_object() {
1553 metadata.get("element_type").and_then(|v| v.as_str()) == Some("header")
1554 } else {
1555 false
1556 }
1557 })
1558 .count();
1559
1560 assert!(headers >= 100, "Should find many headers");
1561 }
1562
1563 #[test]
1564 fn test_content_span_calculation() {
1565 let parser = MarkdownParser::new();
1566 let content = "# Title\nSome content.";
1567
1568 let chunks = parser.parse(Path::new("test.md"), content).unwrap();
1569
1570 for chunk in chunks {
1571 assert!(
1572 chunk.span.start_byte < chunk.span.end_byte,
1573 "Start should be before end"
1574 );
1575 assert!(
1576 chunk.span.start_line <= chunk.span.end_line,
1577 "Start line should be <= end line"
1578 );
1579 assert!(chunk.span.start_column >= 1, "Column should be 1-indexed");
1580 assert!(
1581 chunk.span.end_byte <= content.len(),
1582 "End should not exceed content length"
1583 );
1584 }
1585 }
1586}