Skip to main content

cp_parser/
parsers.rs

1use cp_core::{text, CPError, Result};
2use std::path::Path;
3
4/// Trait for document parsers
5pub trait Parser: Send + Sync {
6    /// Parse a file and return its text content
7    fn parse(&self, path: &Path) -> Result<String>;
8
9    /// Get supported file extensions
10    fn supported_extensions(&self) -> &[&str];
11}
12
13/// Registry of available parsers
14pub struct ParserRegistry {
15    parsers: Vec<Box<dyn Parser>>,
16}
17
18impl Default for ParserRegistry {
19    fn default() -> Self {
20        Self::new()
21    }
22}
23
24impl ParserRegistry {
25    /// Create a new registry with default parsers
26    pub fn new() -> Self {
27        Self {
28            parsers: vec![
29                Box::new(MarkdownParser),
30                Box::new(TextParser),
31                Box::new(PdfParser),
32                Box::new(DocxParser),
33            ],
34        }
35    }
36
37    /// Find a parser for the given file extension
38    pub fn find_parser(&self, extension: &str) -> Option<&dyn Parser> {
39        for parser in &self.parsers {
40            if parser
41                .supported_extensions()
42                .iter()
43                .any(|e| e.eq_ignore_ascii_case(extension))
44            {
45                return Some(parser.as_ref());
46            }
47        }
48        None
49    }
50}
51
52/// Parse a file using the appropriate parser
53pub fn parse_file(path: &Path) -> Result<String> {
54    let registry = ParserRegistry::new();
55
56    let extension = path
57        .extension()
58        .and_then(|e| e.to_str())
59        .ok_or_else(|| CPError::Parse("No file extension".into()))?;
60
61    let parser = registry
62        .find_parser(extension)
63        .ok_or_else(|| CPError::Parse(format!("No parser for extension: {extension}")))?;
64
65    parser.parse(path)
66}
67
68/// Markdown parser using pulldown-cmark
69struct MarkdownParser;
70
71impl Parser for MarkdownParser {
72    fn parse(&self, path: &Path) -> Result<String> {
73        let content = std::fs::read_to_string(path)?;
74
75        // Convert markdown to plain text
76        let parser = pulldown_cmark::Parser::new(&content);
77        let mut text = String::new();
78
79        for event in parser {
80            match event {
81                pulldown_cmark::Event::Start(pulldown_cmark::Tag::Heading { level, .. }) => {
82                    text.push('\n');
83                    let level_str = match level {
84                        pulldown_cmark::HeadingLevel::H1 => "# ",
85                        pulldown_cmark::HeadingLevel::H2 => "## ",
86                        pulldown_cmark::HeadingLevel::H3 => "### ",
87                        pulldown_cmark::HeadingLevel::H4 => "#### ",
88                        pulldown_cmark::HeadingLevel::H5 => "##### ",
89                        pulldown_cmark::HeadingLevel::H6 => "###### ",
90                    };
91                    text.push_str(level_str);
92                }
93                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
94                | pulldown_cmark::Event::SoftBreak
95                | pulldown_cmark::Event::HardBreak => {
96                    text.push('\n');
97                }
98                pulldown_cmark::Event::Text(t) | pulldown_cmark::Event::Code(t) => {
99                    text.push_str(&t);
100                }
101                pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph) => {
102                    text.push_str("\n\n");
103                }
104                _ => {}
105            }
106        }
107
108        Ok(text::normalize(&text))
109    }
110
111    fn supported_extensions(&self) -> &[&str] {
112        &["md", "markdown"]
113    }
114}
115
116/// Plain text parser
117struct TextParser;
118
119impl Parser for TextParser {
120    fn parse(&self, path: &Path) -> Result<String> {
121        let content = std::fs::read_to_string(path)?;
122        Ok(text::normalize(&content))
123    }
124
125    fn supported_extensions(&self) -> &[&str] {
126        &["txt", "text"]
127    }
128}
129
130/// PDF parser using pdf-extract
131struct PdfParser;
132
133impl Parser for PdfParser {
134    fn parse(&self, path: &Path) -> Result<String> {
135        let bytes = std::fs::read(path)?;
136        let text = pdf_extract::extract_text_from_mem(&bytes)
137            .map_err(|e| CPError::Parse(format!("PDF extraction failed: {e}")))?;
138        Ok(text::normalize(&text))
139    }
140
141    fn supported_extensions(&self) -> &[&str] {
142        &["pdf"]
143    }
144}
145
146/// Word document parser (.docx) using dotext
147struct DocxParser;
148
149impl Parser for DocxParser {
150    fn parse(&self, path: &Path) -> Result<String> {
151        use dotext::MsDoc;
152        use std::io::Read;
153
154        let mut doc: dotext::Docx = dotext::Docx::open(path)
155            .map_err(|e| CPError::Parse(format!("DOCX open failed: {e}")))?;
156        let mut content = String::new();
157        doc.read_to_string(&mut content)
158            .map_err(|e| CPError::Parse(format!("DOCX read failed: {e}")))?;
159        Ok(text::normalize(&content))
160    }
161
162    fn supported_extensions(&self) -> &[&str] {
163        &["docx"]
164    }
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// Both Markdown extensions resolve to a parser.
    #[test]
    fn test_registry_finds_markdown() {
        let registry = ParserRegistry::new();
        assert!(registry.find_parser("md").is_some());
        assert!(registry.find_parser("markdown").is_some());
    }

    /// The PDF extension resolves to a parser.
    #[test]
    fn test_registry_finds_pdf() {
        let registry = ParserRegistry::new();
        assert!(registry.find_parser("pdf").is_some());
    }

    /// The plain-text and DOCX parsers are registered by default as well.
    #[test]
    fn test_registry_finds_text_and_docx() {
        let registry = ParserRegistry::new();
        assert!(registry.find_parser("txt").is_some());
        assert!(registry.find_parser("text").is_some());
        assert!(registry.find_parser("docx").is_some());
    }

    /// Extension matching ignores ASCII case.
    #[test]
    fn test_registry_matches_case_insensitively() {
        let registry = ParserRegistry::new();
        assert!(registry.find_parser("MD").is_some());
        assert!(registry.find_parser("Pdf").is_some());
        assert!(registry.find_parser("DOCX").is_some());
    }

    /// Extensions nobody registered, including the empty string, yield `None`.
    #[test]
    fn test_unknown_extension() {
        let registry = ParserRegistry::new();
        assert!(registry.find_parser("xyz").is_none());
        assert!(registry.find_parser("").is_none());
    }

    /// `parse_file` rejects paths it cannot dispatch before doing any I/O,
    /// so these paths do not need to exist on disk.
    #[test]
    fn test_parse_file_rejects_undispatchable_paths() {
        assert!(parse_file(Path::new("README")).is_err());
        assert!(parse_file(Path::new("archive.xyz")).is_err());
    }
}