Skip to main content

cp_parser/
parsers.rs

1use cp_core::{CPError, Result, text};
2use std::path::Path;
3
4/// Trait for document parsers
5pub trait Parser: Send + Sync {
6    /// Parse a file and return its text content
7    fn parse(&self, path: &Path) -> Result<String>;
8    
9    /// Get supported file extensions
10    fn supported_extensions(&self) -> &[&str];
11}
12
13/// Registry of available parsers
14pub struct ParserRegistry {
15    parsers: Vec<Box<dyn Parser>>,
16}
17
18impl Default for ParserRegistry {
19    fn default() -> Self {
20        Self::new()
21    }
22}
23
24impl ParserRegistry {
25    /// Create a new registry with default parsers
26    pub fn new() -> Self {
27        Self {
28            parsers: vec![
29                Box::new(MarkdownParser),
30                Box::new(TextParser),
31                Box::new(PdfParser),
32            ],
33        }
34    }
35
36    /// Find a parser for the given file extension
37    pub fn find_parser(&self, extension: &str) -> Option<&dyn Parser> {
38        for parser in &self.parsers {
39            if parser
40                .supported_extensions()
41                .iter()
42                .any(|e| e.eq_ignore_ascii_case(extension))
43            {
44                return Some(parser.as_ref());
45            }
46        }
47        None
48    }
49}
50
51/// Parse a file using the appropriate parser
52pub fn parse_file(path: &Path) -> Result<String> {
53    let registry = ParserRegistry::new();
54    
55    let extension = path
56        .extension()
57        .and_then(|e| e.to_str())
58        .ok_or_else(|| CPError::Parse("No file extension".into()))?;
59
60    let parser = registry
61        .find_parser(extension)
62        .ok_or_else(|| CPError::Parse(format!("No parser for extension: {}", extension)))?;
63
64    parser.parse(path)
65}
66
67/// Markdown parser using pulldown-cmark
68struct MarkdownParser;
69
70impl Parser for MarkdownParser {
71    fn parse(&self, path: &Path) -> Result<String> {
72        let content = std::fs::read_to_string(path)?;
73        
74        // Convert markdown to plain text
75        let parser = pulldown_cmark::Parser::new(&content);
76        let mut text = String::new();
77        
78        for event in parser {
79            match event {
80                pulldown_cmark::Event::Start(pulldown_cmark::Tag::Heading { level, .. }) => {
81                    text.push('\n');
82                    let level_str = match level {
83                        pulldown_cmark::HeadingLevel::H1 => "# ",
84                        pulldown_cmark::HeadingLevel::H2 => "## ",
85                        pulldown_cmark::HeadingLevel::H3 => "### ",
86                        pulldown_cmark::HeadingLevel::H4 => "#### ",
87                        pulldown_cmark::HeadingLevel::H5 => "##### ",
88                        pulldown_cmark::HeadingLevel::H6 => "###### ",
89                    };
90                    text.push_str(level_str);
91                }
92                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_)) => {
93                    text.push('\n');
94                }
95                pulldown_cmark::Event::Text(t) 
96                | pulldown_cmark::Event::Code(t) => {
97                    text.push_str(&t);
98                }
99                pulldown_cmark::Event::SoftBreak 
100                | pulldown_cmark::Event::HardBreak => {
101                    text.push('\n');
102                }
103                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph) => {
104                    text.push_str("\n\n");
105                }
106                _ => {}
107            }
108        }
109        
110        Ok(text::normalize(&text))
111    }
112
113    fn supported_extensions(&self) -> &[&str] {
114        &["md", "markdown"]
115    }
116}
117
118/// Plain text parser
119struct TextParser;
120
121impl Parser for TextParser {
122    fn parse(&self, path: &Path) -> Result<String> {
123        let content = std::fs::read_to_string(path)?;
124        Ok(text::normalize(&content))
125    }
126
127    fn supported_extensions(&self) -> &[&str] {
128        &["txt", "text"]
129    }
130}
131
132/// PDF parser using pdf-extract
133struct PdfParser;
134
135impl Parser for PdfParser {
136    fn parse(&self, path: &Path) -> Result<String> {
137        let bytes = std::fs::read(path)?;
138        let text = pdf_extract::extract_text_from_mem(&bytes)
139            .map_err(|e| CPError::Parse(format!("PDF extraction failed: {}", e)))?;
140        Ok(text::normalize(&text))
141    }
142
143    fn supported_extensions(&self) -> &[&str] {
144        &["pdf"]
145    }
146}
147
#[cfg(test)]
mod tests {
    use super::*;

    /// Registry with the default parsers, shared by every test below.
    fn default_registry() -> ParserRegistry {
        ParserRegistry::new()
    }

    /// Both Markdown extensions resolve to a parser.
    #[test]
    fn test_registry_finds_markdown() {
        let registry = default_registry();
        for ext in ["md", "markdown"] {
            assert!(registry.find_parser(ext).is_some());
        }
    }

    /// The PDF extension resolves to a parser.
    #[test]
    fn test_registry_finds_pdf() {
        let lookup = default_registry();
        assert!(lookup.find_parser("pdf").is_some());
    }

    /// An extension nobody registered yields no parser.
    #[test]
    fn test_unknown_extension() {
        let lookup = default_registry();
        assert!(lookup.find_parser("xyz").is_none());
    }
}