codeprism_core/content/parsers.rs

//! Content parsers for documentation and configuration files
//!
//! This module provides parsers for various non-code file formats including
//! markdown, configuration files, and plain text documents.
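//!
//! # Example
//!
//! A minimal usage sketch (ignored doc test; illustrative only, relying on the
//! `ContentNode` API as used in this module):
//!
//! ```ignore
//! use std::path::Path;
//!
//! let parser = DocumentParser::new();
//! let node = parser.parse_file(Path::new("README.md"), "# Title\n\nSome text.")?;
//! assert!(!node.chunks.is_empty());
//! ```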

use super::{ConfigFormat, ContentChunk, ContentNode, ContentType, DocumentFormat};
use crate::ast::Span;
use anyhow::{anyhow, Result};
use regex::Regex;
use serde_json::Value;
use std::path::Path;

/// Document parser for various file formats
pub struct DocumentParser {
    /// Markdown parser
    markdown_parser: MarkdownParser,
    /// Configuration file parser
    config_parser: ConfigParser,
    /// Plain text parser
    text_parser: TextParser,
}

impl DocumentParser {
    /// Create a new document parser
    pub fn new() -> Self {
        Self {
            markdown_parser: MarkdownParser::new(),
            config_parser: ConfigParser::new(),
            text_parser: TextParser::new(),
        }
    }

    /// Parse a file based on its extension
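    ///
    /// A minimal usage sketch (ignored doc test; illustrative only, relying on
    /// the `ContentNode` fields used elsewhere in this module):
    ///
    /// ```ignore
    /// let parser = DocumentParser::new();
    /// let node = parser.parse_file(Path::new("config.yaml"), "debug: true")?;
    /// assert_eq!(node.file_size, "debug: true".len());
    /// ```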
    pub fn parse_file(&self, file_path: &Path, content: &str) -> Result<ContentNode> {
        let content_type = self.detect_content_type(file_path)?;
        let mut node = ContentNode::new(file_path.to_path_buf(), content_type.clone());

        let chunks = match content_type {
            ContentType::Documentation { format } => match format {
                DocumentFormat::Markdown => self.markdown_parser.parse(file_path, content)?,
                DocumentFormat::PlainText
                | DocumentFormat::RestructuredText
                | DocumentFormat::AsciiDoc
                | DocumentFormat::Html => self.text_parser.parse(file_path, content, format)?,
            },
            ContentType::Configuration { format } => {
                self.config_parser.parse(file_path, content, format)?
            }
            ContentType::PlainText => {
                self.text_parser
                    .parse(file_path, content, DocumentFormat::PlainText)?
            }
            _ => return Err(anyhow!("Unsupported content type for document parser")),
        };

        for chunk in chunks {
            node.add_chunk(chunk);
        }
        node.file_size = content.len();

        Ok(node)
    }

    /// Detect content type from file extension
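    ///
    /// Sketch of the extension mapping (ignored doc test; illustrative only):
    ///
    /// ```ignore
    /// let parser = DocumentParser::new();
    /// assert!(matches!(
    ///     parser.detect_content_type(Path::new("notes.md"))?,
    ///     ContentType::Documentation { format: DocumentFormat::Markdown }
    /// ));
    /// ```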
    fn detect_content_type(&self, file_path: &Path) -> Result<ContentType> {
        // Handle special files without extensions first
        if let Some(file_name) = file_path.file_name().and_then(|n| n.to_str()) {
            if file_name == ".env" {
                return Ok(ContentType::Configuration {
                    format: ConfigFormat::Env,
                });
            }
        }

        let extension = file_path
            .extension()
            .and_then(|ext| ext.to_str())
            .unwrap_or("")
            .to_lowercase();

        match extension.as_str() {
            "md" | "markdown" => Ok(ContentType::Documentation {
                format: DocumentFormat::Markdown,
            }),
            "rst" => Ok(ContentType::Documentation {
                format: DocumentFormat::RestructuredText,
            }),
            "adoc" | "asciidoc" => Ok(ContentType::Documentation {
                format: DocumentFormat::AsciiDoc,
            }),
            "html" | "htm" => Ok(ContentType::Documentation {
                format: DocumentFormat::Html,
            }),
            "txt" | "text" => Ok(ContentType::Documentation {
                format: DocumentFormat::PlainText,
            }),
            "json" => Ok(ContentType::Configuration {
                format: ConfigFormat::Json,
            }),
            "yaml" | "yml" => Ok(ContentType::Configuration {
                format: ConfigFormat::Yaml,
            }),
            "toml" => Ok(ContentType::Configuration {
                format: ConfigFormat::Toml,
            }),
            "ini" => Ok(ContentType::Configuration {
                format: ConfigFormat::Ini,
            }),
            "properties" => Ok(ContentType::Configuration {
                format: ConfigFormat::Properties,
            }),
            "env" => Ok(ContentType::Configuration {
                format: ConfigFormat::Env,
            }),
            "xml" => Ok(ContentType::Configuration {
                format: ConfigFormat::Xml,
            }),
            _ => Ok(ContentType::PlainText),
        }
    }
}

impl Default for DocumentParser {
    fn default() -> Self {
        Self::new()
    }
}

/// Markdown document parser
pub struct MarkdownParser {
    /// Regex for headers
    header_regex: Regex,
    /// Regex for code blocks
    code_block_regex: Regex,
    /// Regex for inline code
    #[allow(dead_code)] // Will be used for inline code extraction
    inline_code_regex: Regex,
    /// Regex for links
    #[allow(dead_code)] // Will be used for link extraction
    link_regex: Regex,
    /// Regex for lists
    #[allow(dead_code)] // Will be used for list extraction
    list_regex: Regex,
}

impl MarkdownParser {
    /// Create a new markdown parser
    pub fn new() -> Self {
        Self {
            header_regex: Regex::new(r"(?m)^(#{1,6})\s+(.+)$").unwrap(),
            code_block_regex: Regex::new(r"```(\w+)?\n([\s\S]*?)\n```").unwrap(),
            inline_code_regex: Regex::new(r"`([^`]+)`").unwrap(),
            link_regex: Regex::new(r"\[([^\]]+)\]\(([^)]+)\)").unwrap(),
            list_regex: Regex::new(r"(?m)^[\s]*[-*+]\s+(.+)$").unwrap(),
        }
    }

    /// Parse markdown content into chunks
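    ///
    /// Illustrative sketch (ignored doc test; assumes the `ContentChunk` fields
    /// used in the tests below):
    ///
    /// ```ignore
    /// let parser = MarkdownParser::new();
    /// let chunks = parser.parse(Path::new("doc.md"), "# Title\n\nBody text.")?;
    /// assert!(chunks.iter().any(|c| c.content == "Title"));
    /// ```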
    pub fn parse(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let lines: Vec<&str> = content.lines().collect();
        let mut chunk_index = 0;

        // Parse headers
        for (line_idx, line) in lines.iter().enumerate() {
            if let Some(captures) = self.header_regex.captures(line) {
                let level = captures.get(1).unwrap().as_str().len();
                let header_text = captures.get(2).unwrap().as_str();

                let span = self.calculate_line_span(line_idx, line, content);
                let chunk = ContentChunk::new(
                    file_path.to_path_buf(),
                    ContentType::Documentation {
                        format: DocumentFormat::Markdown,
                    },
                    header_text.to_string(),
                    span,
                    chunk_index,
                )
                .with_metadata(serde_json::json!({
                    "header_level": level,
                    "element_type": "header"
                }));

                chunks.push(chunk);
                chunk_index += 1;
            }
        }

        // Parse code blocks
        for captures in self.code_block_regex.captures_iter(content) {
            let language = captures.get(1).map(|m| m.as_str()).unwrap_or("text");
            let code_content = captures.get(2).unwrap().as_str();
            let full_match = captures.get(0).unwrap();

            let span = self.calculate_match_span(&full_match, content);
            let chunk = ContentChunk::new(
                file_path.to_path_buf(),
                ContentType::Documentation {
                    format: DocumentFormat::Markdown,
                },
                code_content.to_string(),
                span,
                chunk_index,
            )
            .with_metadata(serde_json::json!({
                "language": language,
                "element_type": "code_block"
            }));

            chunks.push(chunk);
            chunk_index += 1;
        }

        // Parse regular paragraphs (non-header, non-code-block content)
        let mut paragraph_start = 0;
        let mut in_paragraph = false;
        let mut in_code_block = false;
        let mut paragraph_lines = Vec::new();

        for (line_idx, line) in lines.iter().enumerate() {
            let line_trimmed = line.trim();
            let is_code_fence = line_trimmed.starts_with("```");

            // Headers, code fences, code block bodies, and blank lines end the current paragraph
            if self.header_regex.is_match(line)
                || is_code_fence
                || in_code_block
                || line_trimmed.is_empty()
            {
                // End current paragraph if we have one
                if in_paragraph && !paragraph_lines.is_empty() {
                    let paragraph_text = paragraph_lines.join("\n");
                    let span =
                        self.calculate_paragraph_span(paragraph_start, line_idx - 1, content);

                    let chunk = ContentChunk::new(
                        file_path.to_path_buf(),
                        ContentType::Documentation {
                            format: DocumentFormat::Markdown,
                        },
                        paragraph_text,
                        span,
                        chunk_index,
                    )
                    .with_metadata(serde_json::json!({
                        "element_type": "paragraph"
                    }));

                    chunks.push(chunk);
                    chunk_index += 1;
                }

                if is_code_fence {
                    in_code_block = !in_code_block;
                }
                in_paragraph = false;
                paragraph_lines.clear();
                continue;
            }

            // Start or continue paragraph
            if !in_paragraph {
                in_paragraph = true;
                paragraph_start = line_idx;
            }
            paragraph_lines.push(line_trimmed);
        }

        // Handle final paragraph
        if in_paragraph && !paragraph_lines.is_empty() {
            let paragraph_text = paragraph_lines.join("\n");
            let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, content);

            let chunk = ContentChunk::new(
                file_path.to_path_buf(),
                ContentType::Documentation {
                    format: DocumentFormat::Markdown,
                },
                paragraph_text,
                span,
                chunk_index,
            )
            .with_metadata(serde_json::json!({
                "element_type": "paragraph"
            }));

            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Calculate span for a single line
    fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
        let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
        let start_byte = lines_before;
        let end_byte = start_byte + line.len();

        Span::new(
            start_byte,
            end_byte,
            line_idx + 1,
            line_idx + 1,
            1,
            line.len() + 1,
        )
    }

    /// Calculate span for a regex match
    fn calculate_match_span(&self, match_obj: &regex::Match, content: &str) -> Span {
        let start_byte = match_obj.start();
        let end_byte = match_obj.end();

        // Count lines up to start
        let content_before = &content[..start_byte];
        let start_line = content_before.lines().count();
        let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;

        // Count lines in match
        let match_content = match_obj.as_str();
        let lines_in_match = match_content.lines().count();
        let end_line = start_line + lines_in_match.saturating_sub(1);
        let end_column = if lines_in_match > 1 {
            match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
        } else {
            start_column + match_content.len()
        };

        Span::new(
            start_byte,
            end_byte,
            start_line.max(1),
            end_line.max(1),
            start_column,
            end_column,
        )
    }

    /// Calculate span for a paragraph
    fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, content: &str) -> Span {
        let lines: Vec<&str> = content.lines().collect();
        let start_byte: usize = lines
            .iter()
            .take(start_line)
            .map(|l| l.len() + 1)
            .sum::<usize>();
        let end_byte: usize = lines
            .iter()
            .take(end_line + 1)
            .map(|l| l.len() + 1)
            .sum::<usize>()
            - 1;

        Span::new(
            start_byte,
            end_byte,
            start_line + 1,
            end_line + 1,
            1,
            lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
        )
    }
}

impl Default for MarkdownParser {
    fn default() -> Self {
        Self::new()
    }
}

/// Configuration file parser
pub struct ConfigParser;

impl ConfigParser {
    /// Create a new configuration parser
    pub fn new() -> Self {
        Self
    }

    /// Parse configuration file content
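    ///
    /// A small dispatch sketch (ignored doc test; illustrative only):
    ///
    /// ```ignore
    /// let parser = ConfigParser::new();
    /// let chunks = parser.parse(Path::new(".env"), "DEBUG=true", ConfigFormat::Env)?;
    /// assert!(chunks.iter().any(|c| c.content == "DEBUG=true"));
    /// ```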
    pub fn parse(
        &self,
        file_path: &Path,
        content: &str,
        format: ConfigFormat,
    ) -> Result<Vec<ContentChunk>> {
        match format {
            ConfigFormat::Json => self.parse_json(file_path, content),
            ConfigFormat::Yaml => self.parse_yaml(file_path, content),
            ConfigFormat::Toml => self.parse_toml(file_path, content),
            ConfigFormat::Ini => self.parse_ini(file_path, content),
            ConfigFormat::Properties => self.parse_properties(file_path, content),
            ConfigFormat::Env => self.parse_env(file_path, content),
            ConfigFormat::Xml => self.parse_xml(file_path, content),
        }
    }

    /// Parse JSON configuration
    fn parse_json(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();

        // Try to parse as JSON to validate structure
        match serde_json::from_str::<Value>(content) {
            Ok(value) => {
                // Extract key-value pairs and create chunks
                self.extract_json_values(&value, file_path, content, &mut chunks, 0, "");
            }
            Err(_) => {
                // If JSON is invalid, treat as plain text
                chunks.push(
                    ContentChunk::new(
                        file_path.to_path_buf(),
                        ContentType::Configuration {
                            format: ConfigFormat::Json,
                        },
                        content.to_string(),
                        Span::new(
                            0,
                            content.len(),
                            1,
                            content.lines().count(),
                            1,
                            content.lines().last().map(|l| l.len()).unwrap_or(0),
                        ),
                        0,
                    )
                    .with_metadata(serde_json::json!({
                        "parse_error": true,
                        "config_type": "json"
                    })),
                );
            }
        }

        Ok(chunks)
    }

    /// Extract values from JSON recursively
    #[allow(clippy::only_used_in_recursion)] // Method is used recursively by design
    fn extract_json_values(
        &self,
        value: &Value,
        file_path: &Path,
        content: &str,
        chunks: &mut Vec<ContentChunk>,
        chunk_index: usize,
        key_path: &str,
    ) {
        match value {
            Value::Object(map) => {
                for (key, val) in map {
                    let new_path = if key_path.is_empty() {
                        key.clone()
                    } else {
                        format!("{key_path}.{key}")
                    };
                    self.extract_json_values(
                        val,
                        file_path,
                        content,
                        chunks,
                        chunks.len(),
                        &new_path,
                    );
                }
            }
            Value::Array(arr) => {
                for (index, val) in arr.iter().enumerate() {
                    let new_path = format!("{key_path}[{index}]");
                    self.extract_json_values(
                        val,
                        file_path,
                        content,
                        chunks,
                        chunks.len(),
                        &new_path,
                    );
                }
            }
            Value::String(_) | Value::Number(_) | Value::Bool(_) => {
                // Create a chunk for this key-value pair
                let value_str = match value {
                    Value::String(s) => s.clone(),
                    _ => value.to_string(),
                };

                // Include key in the searchable content
                let searchable_content = if key_path.is_empty() {
                    value_str.clone()
                } else {
                    format!("{key_path}: {value_str}")
                };

                // Try to find the approximate location in the original content
                if let Some(position) = content.find(&value_str) {
                    let lines_before = content[..position].lines().count();
                    let line_start = content[..position].rfind('\n').map(|i| i + 1).unwrap_or(0);
                    let column = position - line_start + 1;

                    let span = Span::new(
                        position,
                        position + value_str.len(),
                        lines_before.max(1),
                        lines_before.max(1),
                        column,
                        column + value_str.len(),
                    );

                    let chunk = ContentChunk::new(
                        file_path.to_path_buf(),
                        ContentType::Configuration {
                            format: ConfigFormat::Json,
                        },
                        searchable_content,
                        span,
                        chunk_index,
                    )
                    .with_metadata(serde_json::json!({
                        "key_path": key_path,
                        "value": value_str,
                        "value_type": match value {
                            Value::String(_) => "string",
                            Value::Number(_) => "number",
                            Value::Bool(_) => "boolean",
                            _ => "unknown"
                        },
                        "config_type": "json"
                    }));

                    chunks.push(chunk);
                }
            }
            Value::Null => {} // Skip null values
        }
    }

    /// Parse YAML configuration (simplified)
    fn parse_yaml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        // Simple line-by-line parsing for YAML
        let mut chunks = Vec::new();
        let lines: Vec<&str> = content.lines().collect();

        for (line_idx, line) in lines.iter().enumerate() {
            let trimmed = line.trim();
            if trimmed.is_empty() || trimmed.starts_with('#') {
                continue;
            }

            // Look for key-value pairs
            if let Some(colon_pos) = trimmed.find(':') {
                let key = trimmed[..colon_pos].trim();
                let value = trimmed[colon_pos + 1..].trim();

                if !value.is_empty() {
                    let span = self.calculate_line_span(line_idx, line, content);
                    let chunk = ContentChunk::new(
                        file_path.to_path_buf(),
                        ContentType::Configuration {
                            format: ConfigFormat::Yaml,
                        },
                        format!("{key}: {value}"),
                        span,
                        chunks.len(),
                    )
                    .with_metadata(serde_json::json!({
                        "key": key,
                        "value": value,
                        "config_type": "yaml"
                    }));

                    chunks.push(chunk);
                }
            }
        }

        Ok(chunks)
    }

    /// Parse TOML configuration (simplified)
    fn parse_toml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        // Similar to YAML but with different syntax
        let mut chunks = Vec::new();
        let lines: Vec<&str> = content.lines().collect();

        for (line_idx, line) in lines.iter().enumerate() {
            let trimmed = line.trim();
            if trimmed.is_empty() || trimmed.starts_with('#') {
                continue;
            }

            // Handle sections
            if trimmed.starts_with('[') && trimmed.ends_with(']') {
                let section = &trimmed[1..trimmed.len() - 1];
                let span = self.calculate_line_span(line_idx, line, content);
                let chunk = ContentChunk::new(
                    file_path.to_path_buf(),
                    ContentType::Configuration {
                        format: ConfigFormat::Toml,
                    },
                    section.to_string(),
                    span,
                    chunks.len(),
                )
                .with_metadata(serde_json::json!({
                    "element_type": "section",
                    "section_name": section,
                    "config_type": "toml"
                }));

                chunks.push(chunk);
                continue;
            }

            // Handle key-value pairs
            if let Some(eq_pos) = trimmed.find('=') {
                let key = trimmed[..eq_pos].trim();
                let value = trimmed[eq_pos + 1..].trim();

                let span = self.calculate_line_span(line_idx, line, content);
                let chunk = ContentChunk::new(
                    file_path.to_path_buf(),
                    ContentType::Configuration {
                        format: ConfigFormat::Toml,
                    },
                    format!("{key} = {value}"),
                    span,
                    chunks.len(),
                )
                .with_metadata(serde_json::json!({
                    "key": key,
                    "value": value,
                    "config_type": "toml"
                }));

                chunks.push(chunk);
            }
        }

        Ok(chunks)
    }

    /// Parse INI configuration
    fn parse_ini(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        // Similar pattern to TOML
        self.parse_key_value_format(file_path, content, ConfigFormat::Ini, "ini")
    }

    /// Parse properties configuration
    fn parse_properties(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        self.parse_key_value_format(file_path, content, ConfigFormat::Properties, "properties")
    }

    /// Parse environment file
    fn parse_env(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        self.parse_key_value_format(file_path, content, ConfigFormat::Env, "env")
    }

    /// Parse XML configuration (simplified)
    fn parse_xml(&self, file_path: &Path, content: &str) -> Result<Vec<ContentChunk>> {
        // Simple XML tag extraction without backreferences
        let tag_regex = Regex::new(r"<([^/>]+)>([^<]+)</[^>]+>").unwrap();
        let mut chunks = Vec::new();

        for (idx, captures) in tag_regex.captures_iter(content).enumerate() {
            let tag_name = captures.get(1).unwrap().as_str();
            let tag_content = captures.get(2).unwrap().as_str().trim();

            if !tag_content.is_empty() {
                let full_match = captures.get(0).unwrap();
                let span = self.calculate_match_span(&full_match, content);

                let chunk = ContentChunk::new(
                    file_path.to_path_buf(),
                    ContentType::Configuration {
                        format: ConfigFormat::Xml,
                    },
                    tag_content.to_string(),
                    span,
                    idx,
                )
                .with_metadata(serde_json::json!({
                    "tag_name": tag_name,
                    "config_type": "xml"
                }));

                chunks.push(chunk);
            }
        }

        Ok(chunks)
    }

    /// Generic key-value format parser
    fn parse_key_value_format(
        &self,
        file_path: &Path,
        content: &str,
        format: ConfigFormat,
        format_name: &str,
    ) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let lines: Vec<&str> = content.lines().collect();

        for (line_idx, line) in lines.iter().enumerate() {
            let trimmed = line.trim();
            if trimmed.is_empty() || trimmed.starts_with('#') || trimmed.starts_with(';') {
                continue;
            }

            // Look for key=value pattern
            if let Some(eq_pos) = trimmed.find('=') {
                let key = trimmed[..eq_pos].trim();
                let value = trimmed[eq_pos + 1..].trim();

                let span = self.calculate_line_span(line_idx, line, content);
                let chunk = ContentChunk::new(
                    file_path.to_path_buf(),
                    ContentType::Configuration {
                        format: format.clone(),
                    },
                    format!("{key}={value}"),
                    span,
                    chunks.len(),
                )
                .with_metadata(serde_json::json!({
                    "key": key,
                    "value": value,
                    "config_type": format_name
                }));

                chunks.push(chunk);
            }
        }

        Ok(chunks)
    }

    /// Calculate span for a line
    fn calculate_line_span(&self, line_idx: usize, line: &str, content: &str) -> Span {
        let lines_before: usize = content.lines().take(line_idx).map(|l| l.len() + 1).sum();
        let start_byte = lines_before;
        let end_byte = start_byte + line.len();

        Span::new(
            start_byte,
            end_byte,
            line_idx + 1,
            line_idx + 1,
            1,
            line.len() + 1,
        )
    }

    /// Calculate span for a regex match
    fn calculate_match_span(&self, match_obj: &regex::Match, content: &str) -> Span {
        let start_byte = match_obj.start();
        let end_byte = match_obj.end();

        let content_before = &content[..start_byte];
        let start_line = content_before.lines().count();
        let start_column = content_before.lines().last().map(|l| l.len()).unwrap_or(0) + 1;

        let match_content = match_obj.as_str();
        let lines_in_match = match_content.lines().count();
        let end_line = start_line + lines_in_match.saturating_sub(1);
        let end_column = if lines_in_match > 1 {
            match_content.lines().last().map(|l| l.len()).unwrap_or(0) + 1
        } else {
            start_column + match_content.len()
        };

        Span::new(
            start_byte,
            end_byte,
            start_line.max(1),
            end_line.max(1),
            start_column,
            end_column,
        )
    }
}

impl Default for ConfigParser {
    fn default() -> Self {
        Self::new()
    }
}

/// Plain text parser
pub struct TextParser;

impl TextParser {
    /// Create a new text parser
    pub fn new() -> Self {
        Self
    }

    /// Parse plain text into chunks (paragraph-based)
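    ///
    /// Illustrative sketch (ignored doc test): blank lines delimit paragraphs.
    ///
    /// ```ignore
    /// let parser = TextParser::new();
    /// let chunks = parser.parse(
    ///     Path::new("notes.txt"),
    ///     "First paragraph.\n\nSecond paragraph.",
    ///     DocumentFormat::PlainText,
    /// )?;
    /// assert_eq!(chunks.len(), 2);
    /// ```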
    pub fn parse(
        &self,
        file_path: &Path,
        content: &str,
        format: DocumentFormat,
    ) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let lines: Vec<&str> = content.lines().collect();

        let mut paragraph_start = 0;
        let mut paragraph_lines = Vec::new();
        let mut chunk_index = 0;

        for (line_idx, line) in lines.iter().enumerate() {
            let trimmed = line.trim();

            if trimmed.is_empty() {
                // End current paragraph
                if !paragraph_lines.is_empty() {
                    let paragraph_text = paragraph_lines.join("\n");
                    let span = self.calculate_paragraph_span(paragraph_start, line_idx - 1, &lines);

                    let chunk = ContentChunk::new(
                        file_path.to_path_buf(),
                        ContentType::Documentation {
                            format: format.clone(),
                        },
                        paragraph_text,
                        span,
                        chunk_index,
                    )
                    .with_metadata(serde_json::json!({
                        "element_type": "paragraph",
                        "line_count": paragraph_lines.len()
                    }));

                    chunks.push(chunk);
                    chunk_index += 1;
                    paragraph_lines.clear();
                }
                continue;
            }

            // Start new paragraph or continue existing one
            if paragraph_lines.is_empty() {
                paragraph_start = line_idx;
            }
            paragraph_lines.push(trimmed);
        }

        // Handle final paragraph
        if !paragraph_lines.is_empty() {
            let paragraph_text = paragraph_lines.join("\n");
            let span = self.calculate_paragraph_span(paragraph_start, lines.len() - 1, &lines);

            let chunk = ContentChunk::new(
                file_path.to_path_buf(),
                ContentType::Documentation { format },
                paragraph_text,
                span,
                chunk_index,
            )
            .with_metadata(serde_json::json!({
                "element_type": "paragraph",
                "line_count": paragraph_lines.len()
            }));

            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Calculate span for a paragraph
    fn calculate_paragraph_span(&self, start_line: usize, end_line: usize, lines: &[&str]) -> Span {
        let start_byte: usize = lines
            .iter()
            .take(start_line)
            .map(|l| l.len() + 1)
            .sum::<usize>();
        let end_byte: usize = lines
            .iter()
            .take(end_line + 1)
            .map(|l| l.len() + 1)
            .sum::<usize>()
            - 1;

        Span::new(
            start_byte,
            end_byte,
            start_line + 1,
            end_line + 1,
            1,
            lines.get(end_line).map(|l| l.len()).unwrap_or(0) + 1,
        )
    }
}

impl Default for TextParser {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_document_parser_creation() {
        // Construction alone should not panic
        let _parser = DocumentParser::new();
    }

    #[test]
    fn test_content_type_detection() {
        let parser = DocumentParser::new();

        let test_cases = vec![
            (
                "test.md",
                ContentType::Documentation {
                    format: DocumentFormat::Markdown,
                },
            ),
            (
                "README.markdown",
                ContentType::Documentation {
                    format: DocumentFormat::Markdown,
                },
            ),
            (
                "doc.rst",
                ContentType::Documentation {
                    format: DocumentFormat::RestructuredText,
                },
            ),
            (
                "manual.adoc",
                ContentType::Documentation {
                    format: DocumentFormat::AsciiDoc,
                },
            ),
            (
                "page.html",
                ContentType::Documentation {
                    format: DocumentFormat::Html,
                },
            ),
            (
                "notes.txt",
                ContentType::Documentation {
                    format: DocumentFormat::PlainText,
                },
            ),
            (
                "config.json",
                ContentType::Configuration {
                    format: ConfigFormat::Json,
                },
            ),
            (
                "config.yaml",
                ContentType::Configuration {
                    format: ConfigFormat::Yaml,
                },
            ),
            (
                "config.yml",
                ContentType::Configuration {
                    format: ConfigFormat::Yaml,
                },
            ),
            (
                "Cargo.toml",
                ContentType::Configuration {
                    format: ConfigFormat::Toml,
                },
            ),
            (
                "settings.ini",
                ContentType::Configuration {
                    format: ConfigFormat::Ini,
                },
            ),
            (
                "app.properties",
                ContentType::Configuration {
                    format: ConfigFormat::Properties,
                },
            ),
            (
                ".env",
                ContentType::Configuration {
                    format: ConfigFormat::Env,
                },
            ),
            (
                "config.xml",
                ContentType::Configuration {
                    format: ConfigFormat::Xml,
                },
            ),
            ("unknown.xyz", ContentType::PlainText),
        ];

        for (filename, expected_type) in test_cases {
            let path = Path::new(filename);
            let detected_type = parser.detect_content_type(path).unwrap();
            assert_eq!(
                std::mem::discriminant(&detected_type),
                std::mem::discriminant(&expected_type),
                "Failed for file: {filename}"
            );
        }
    }

    #[test]
    fn test_markdown_parser_headers() {
        let parser = MarkdownParser::new();
        let content = r#"# Main Title
Some content here.

## Secondary Title
More content.

### Subsection
Even more content.

#### Level 4
Content at level 4.

##### Level 5
Content at level 5.

###### Level 6
Content at level 6."#;

        let chunks = parser.parse(Path::new("test.md"), content).unwrap();

        // Should extract all headers
        let headers: Vec<_> = chunks
            .iter()
            .filter(|chunk| {
                if let Some(metadata) = chunk.metadata.as_object() {
                    metadata.get("element_type").and_then(|v| v.as_str()) == Some("header")
                } else {
                    false
                }
            })
            .collect();

        assert_eq!(headers.len(), 6, "Should find 6 headers");

        // Test header levels
        let header_levels: Vec<_> = headers
            .iter()
            .filter_map(|chunk| {
                chunk
                    .metadata
                    .as_object()
                    .and_then(|m| m.get("header_level"))
                    .and_then(|v| v.as_u64())
            })
            .collect();

        assert_eq!(header_levels, vec![1, 2, 3, 4, 5, 6]);
        assert_eq!(headers[0].content, "Main Title");
        assert_eq!(headers[1].content, "Secondary Title");
        assert_eq!(headers[2].content, "Subsection");
    }

    #[test]
    fn test_markdown_parser_code_blocks() {
        let parser = MarkdownParser::new();
        let content = r#"Here is some Python code:

```python
def hello_world():
    print("Hello, World!")
    return "success"
```

And here is some JavaScript:

```javascript
function greet(name) {
    console.log(`Hello, ${name}!`);
}
```

And a generic code block:

```
generic code here
no language specified
```"#;

        let chunks = parser.parse(Path::new("test.md"), content).unwrap();

        let code_blocks: Vec<_> = chunks
            .iter()
            .filter(|chunk| {
                if let Some(metadata) = chunk.metadata.as_object() {
                    metadata.get("element_type").and_then(|v| v.as_str()) == Some("code_block")
                } else {
                    false
                }
            })
            .collect();

        assert_eq!(code_blocks.len(), 3, "Should find 3 code blocks");

        // Test Python code block
        assert!(code_blocks[0].content.contains("def hello_world"));
        assert!(code_blocks[0].content.contains("print(\"Hello, World!\")"));
        let python_lang = code_blocks[0]
            .metadata
            .as_object()
            .unwrap()
            .get("language")
            .unwrap()
            .as_str()
            .unwrap();
        assert_eq!(python_lang, "python");

        // Test JavaScript code block
        assert!(code_blocks[1].content.contains("function greet"));
        let js_lang = code_blocks[1]
            .metadata
            .as_object()
            .unwrap()
            .get("language")
            .unwrap()
            .as_str()
            .unwrap();
        assert_eq!(js_lang, "javascript");

        // Test generic code block
        assert!(code_blocks[2].content.contains("generic code here"));
        let generic_lang = code_blocks[2]
            .metadata
            .as_object()
            .unwrap()
            .get("language")
            .unwrap()
            .as_str()
            .unwrap();
        assert_eq!(generic_lang, "text");
    }

    #[test]
    fn test_markdown_parser_paragraphs() {
        let parser = MarkdownParser::new();
        let content = r#"This is the first paragraph with some content.
It spans multiple lines.

This is the second paragraph.

# A Header

This is a paragraph after a header.

Another paragraph here."#;

        let chunks = parser.parse(Path::new("test.md"), content).unwrap();

        let paragraphs: Vec<_> = chunks
            .iter()
            .filter(|chunk| {
                if let Some(metadata) = chunk.metadata.as_object() {
                    metadata.get("element_type").and_then(|v| v.as_str()) == Some("paragraph")
                } else {
                    false
                }
            })
            .collect();

        assert!(paragraphs.len() >= 3, "Should find at least 3 paragraphs");
        assert!(paragraphs[0].content.contains("first paragraph"));
        assert!(paragraphs[1].content.contains("second paragraph"));
    }

    #[test]
    fn test_json_config_parser() {
        let parser = ConfigParser::new();
        let content = r#"{
  "database": {
    "host": "localhost",
    "port": 5432,
    "name": "myapp"
  },
  "features": ["auth", "logging", "metrics"],
  "debug": true,
  "version": "1.0.0"
}"#;

        let chunks = parser
            .parse(Path::new("config.json"), content, ConfigFormat::Json)
            .unwrap();

        assert!(!chunks.is_empty(), "Should extract chunks from JSON");

        // Should find various value types
        let string_chunks: Vec<_> = chunks
            .iter()
            .filter(|chunk| {
                if let Some(metadata) = chunk.metadata.as_object() {
                    metadata.get("value_type").and_then(|v| v.as_str()) == Some("string")
                } else {
                    false
                }
            })
            .collect();

        let boolean_chunks: Vec<_> = chunks
            .iter()
            .filter(|chunk| {
                if let Some(metadata) = chunk.metadata.as_object() {
                    metadata.get("value_type").and_then(|v| v.as_str()) == Some("boolean")
                } else {
                    false
                }
            })
            .collect();

        assert!(!string_chunks.is_empty(), "Should find string values");
        assert!(!boolean_chunks.is_empty(), "Should find boolean values");
    }

    #[test]
    fn test_yaml_config_parser() {
        let parser = ConfigParser::new();
        let content = r#"database:
  host: localhost
  port: 5432
  name: myapp

features:
  - auth
  - logging
  - metrics

debug: true
version: "1.0.0"
"#;

        let chunks = parser
            .parse(Path::new("config.yaml"), content, ConfigFormat::Yaml)
            .unwrap();

        assert!(!chunks.is_empty(), "Should extract chunks from YAML");

        // Should find key-value pairs
        let has_database = chunks
            .iter()
            .any(|chunk| chunk.content.contains("host: localhost"));
        let has_debug = chunks
            .iter()
            .any(|chunk| chunk.content.contains("debug: true"));

        assert!(has_database, "Should find database configuration");
        assert!(has_debug, "Should find debug setting");
    }

    #[test]
    fn test_toml_config_parser() {
        let parser = ConfigParser::new();
        let content = r#"[database]
host = "localhost"
port = 5432
name = "myapp"

[features]
auth = true
logging = true
metrics = false

debug = true
version = "1.0.0"
"#;

        let chunks = parser
            .parse(Path::new("Cargo.toml"), content, ConfigFormat::Toml)
            .unwrap();

        assert!(!chunks.is_empty(), "Should extract chunks from TOML");

        // Should find sections and key-value pairs
        let sections: Vec<_> = chunks
            .iter()
            .filter(|chunk| {
                if let Some(metadata) = chunk.metadata.as_object() {
                    metadata.get("element_type").and_then(|v| v.as_str()) == Some("section")
                } else {
                    false
                }
            })
            .collect();

        assert!(sections.len() >= 2, "Should find at least 2 sections");
        assert!(sections.iter().any(|s| s.content == "database"));
        assert!(sections.iter().any(|s| s.content == "features"));

        let key_values: Vec<_> = chunks
            .iter()
            .filter(|chunk| chunk.content.contains(" = "))
            .collect();

        assert!(!key_values.is_empty(), "Should find key-value pairs");
    }

    #[test]
    fn test_ini_config_parser() {
        let parser = ConfigParser::new();
        let content = r#"[database]
host=localhost
port=5432
name=myapp

[logging]
level=info
file=/var/log/app.log

debug=true
"#;

        let chunks = parser
            .parse(Path::new("config.ini"), content, ConfigFormat::Ini)
            .unwrap();

        assert!(!chunks.is_empty(), "Should extract chunks from INI");

        let key_values: Vec<_> = chunks
            .iter()
            .filter(|chunk| chunk.content.contains("="))
            .collect();

        assert!(
            key_values.len() >= 5,
            "Should find multiple key-value pairs"
        );
        assert!(key_values
            .iter()
            .any(|kv| kv.content.contains("host=localhost")));
        assert!(key_values
            .iter()
            .any(|kv| kv.content.contains("level=info")));
    }

    #[test]
    fn test_properties_config_parser() {
        let parser = ConfigParser::new();
        let content = r#"# Application configuration
database.host=localhost
database.port=5432
database.name=myapp

# Logging configuration
logging.level=info
logging.file=/var/log/app.log

debug=true
"#;

        let chunks = parser
            .parse(
                Path::new("app.properties"),
                content,
                ConfigFormat::Properties,
            )
            .unwrap();

        assert!(!chunks.is_empty(), "Should extract chunks from properties");

        let properties: Vec<_> = chunks
            .iter()
            .filter(|chunk| chunk.content.contains("="))
            .collect();

        assert!(properties.len() >= 5, "Should find multiple properties");
        assert!(properties
            .iter()
            .any(|p| p.content.contains("database.host=localhost")));
        assert!(properties
            .iter()
            .any(|p| p.content.contains("logging.level=info")));
    }

    #[test]
    fn test_env_config_parser() {
        let parser = ConfigParser::new();
        let content = r#"DATABASE_HOST=localhost
DATABASE_PORT=5432
DATABASE_NAME=myapp
DEBUG=true
SECRET_KEY=abc123xyz
"#;

        let chunks = parser
            .parse(Path::new(".env"), content, ConfigFormat::Env)
            .unwrap();

        assert!(!chunks.is_empty(), "Should extract chunks from env file");

        let env_vars: Vec<_> = chunks
            .iter()
            .filter(|chunk| chunk.content.contains("="))
            .collect();

        assert_eq!(env_vars.len(), 5, "Should find 5 environment variables");
        assert!(env_vars
            .iter()
            .any(|var| var.content.contains("DATABASE_HOST=localhost")));
        assert!(env_vars
            .iter()
            .any(|var| var.content.contains("DEBUG=true")));
    }

    #[test]
    fn test_xml_config_parser() {
        let parser = ConfigParser::new();
        let content = r#"<configuration>
  <database>
    <host>localhost</host>
    <port>5432</port>
    <name>myapp</name>
  </database>
  <features>
    <auth>true</auth>
    <logging>true</logging>
  </features>
  <debug>true</debug>
</configuration>"#;

        let chunks = parser
            .parse(Path::new("config.xml"), content, ConfigFormat::Xml)
            .unwrap();

        assert!(!chunks.is_empty(), "Should extract chunks from XML");

        // Should find tag contents
        let tag_contents: Vec<_> = chunks
            .iter()
            .filter(|chunk| !chunk.content.trim().is_empty())
            .collect();

        assert!(!tag_contents.is_empty(), "Should find tag contents");
        assert!(tag_contents.iter().any(|tag| tag.content == "localhost"));
        assert!(tag_contents.iter().any(|tag| tag.content == "5432"));
        assert!(tag_contents.iter().any(|tag| tag.content == "true"));
    }

    #[test]
    fn test_text_parser_paragraphs() {
        let parser = TextParser::new();
        let content = r#"This is the first paragraph.
It has multiple lines.

This is the second paragraph.

This is the third paragraph.
It also has multiple lines.
And even more lines."#;

        let chunks = parser
            .parse(
                Path::new("document.txt"),
                content,
                DocumentFormat::PlainText,
            )
            .unwrap();

        assert_eq!(chunks.len(), 3, "Should find 3 paragraphs");

        assert!(chunks[0].content.contains("first paragraph"));
        assert!(chunks[1].content.contains("second paragraph"));
        assert!(chunks[2].content.contains("third paragraph"));

        // Check metadata
        for chunk in &chunks {
            let metadata = chunk.metadata.as_object().unwrap();
            assert_eq!(
                metadata.get("element_type").unwrap().as_str().unwrap(),
                "paragraph"
            );
            assert!(metadata.get("line_count").unwrap().as_u64().unwrap() >= 1);
        }
    }

    #[test]
    fn test_invalid_json_handling() {
        let parser = ConfigParser::new();
        let invalid_json = r#"{ invalid json content here"#;

        let chunks = parser
            .parse(Path::new("bad.json"), invalid_json, ConfigFormat::Json)
            .unwrap();

        assert_eq!(
            chunks.len(),
            1,
            "Should create a single chunk for invalid JSON"
        );
        assert_eq!(chunks[0].content, invalid_json);

        let metadata = chunks[0].metadata.as_object().unwrap();
        assert!(metadata.get("parse_error").unwrap().as_bool().unwrap());
        assert_eq!(
            metadata.get("config_type").unwrap().as_str().unwrap(),
            "json"
        );
    }

    #[test]
    fn test_empty_content_handling() {
        let parser = DocumentParser::new();

        let empty_md = "";
        let node = parser.parse_file(Path::new("empty.md"), empty_md).unwrap();

        assert_eq!(
            node.chunks.len(),
            0,
            "Empty content should produce no chunks"
        );
        assert_eq!(node.file_size, 0);
    }

    #[test]
    fn test_large_content_handling() {
        let parser = DocumentParser::new();

        // Create a large markdown document
        let mut content = String::new();
        for i in 0..100 {
            content.push_str(&format!(
                "# Header {i}\n\nThis is paragraph {i} with some content.\n\n"
            ));
        }

        let node = parser.parse_file(Path::new("large.md"), &content).unwrap();

        assert!(node.chunks.len() >= 100, "Should handle large content");
        assert_eq!(node.file_size, content.len());

        // Should find headers and paragraphs
        let headers = node
            .chunks
            .iter()
            .filter(|chunk| {
                if let Some(metadata) = chunk.metadata.as_object() {
                    metadata.get("element_type").and_then(|v| v.as_str()) == Some("header")
                } else {
                    false
                }
            })
            .count();

        assert!(headers >= 100, "Should find many headers");
    }

    #[test]
    fn test_content_span_calculation() {
        let parser = MarkdownParser::new();
        let content = "# Title\nSome content.";

        let chunks = parser.parse(Path::new("test.md"), content).unwrap();

        for chunk in chunks {
            assert!(
                chunk.span.start_byte < chunk.span.end_byte,
                "Start should be before end"
            );
            assert!(
                chunk.span.start_line <= chunk.span.end_line,
                "Start line should be <= end line"
            );
            assert!(chunk.span.start_column >= 1, "Column should be 1-indexed");
            assert!(
                chunk.span.end_byte <= content.len(),
                "End should not exceed content length"
            );
        }
    }
}