1use cp_core::{text, CPError, Result};
2use std::path::Path;
3
/// A file-format parser that turns a document on disk into a text string.
///
/// Implementors must be `Send + Sync`; the registry in this file stores them
/// as boxed trait objects.
pub trait Parser: Send + Sync {
    /// Parses the file at `path` and returns its textual content.
    ///
    /// # Errors
    ///
    /// Returns an error when the implementation cannot read or decode the
    /// file; the exact conditions are specific to each implementor.
    fn parse(&self, path: &Path) -> Result<String>;

    /// Returns the file extensions this parser handles, written without a
    /// leading dot (for example `"md"`).
    fn supported_extensions(&self) -> &[&str];
}
12
13pub struct ParserRegistry {
15 parsers: Vec<Box<dyn Parser>>,
16}
17
18impl Default for ParserRegistry {
19 fn default() -> Self {
20 Self::new()
21 }
22}
23
24impl ParserRegistry {
25 pub fn new() -> Self {
27 Self {
28 parsers: vec![
29 Box::new(MarkdownParser),
30 Box::new(TextParser),
31 Box::new(PdfParser),
32 Box::new(DocxParser),
33 ],
34 }
35 }
36
37 pub fn find_parser(&self, extension: &str) -> Option<&dyn Parser> {
39 for parser in &self.parsers {
40 if parser
41 .supported_extensions()
42 .iter()
43 .any(|e| e.eq_ignore_ascii_case(extension))
44 {
45 return Some(parser.as_ref());
46 }
47 }
48 None
49 }
50}
51
52pub fn parse_file(path: &Path) -> Result<String> {
54 let registry = ParserRegistry::new();
55
56 let extension = path
57 .extension()
58 .and_then(|e| e.to_str())
59 .ok_or_else(|| CPError::Parse("No file extension".into()))?;
60
61 let parser = registry
62 .find_parser(extension)
63 .ok_or_else(|| CPError::Parse(format!("No parser for extension: {extension}")))?;
64
65 parser.parse(path)
66}
67
68struct MarkdownParser;
70
71impl Parser for MarkdownParser {
72 fn parse(&self, path: &Path) -> Result<String> {
73 let content = std::fs::read_to_string(path)?;
74
75 let parser = pulldown_cmark::Parser::new(&content);
77 let mut text = String::new();
78
79 for event in parser {
80 match event {
81 pulldown_cmark::Event::Start(pulldown_cmark::Tag::Heading { level, .. }) => {
82 text.push('\n');
83 let level_str = match level {
84 pulldown_cmark::HeadingLevel::H1 => "# ",
85 pulldown_cmark::HeadingLevel::H2 => "## ",
86 pulldown_cmark::HeadingLevel::H3 => "### ",
87 pulldown_cmark::HeadingLevel::H4 => "#### ",
88 pulldown_cmark::HeadingLevel::H5 => "##### ",
89 pulldown_cmark::HeadingLevel::H6 => "###### ",
90 };
91 text.push_str(level_str);
92 }
93 pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_))
94 | pulldown_cmark::Event::SoftBreak
95 | pulldown_cmark::Event::HardBreak => {
96 text.push('\n');
97 }
98 pulldown_cmark::Event::Text(t) | pulldown_cmark::Event::Code(t) => {
99 text.push_str(&t);
100 }
101 pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph) => {
102 text.push_str("\n\n");
103 }
104 _ => {}
105 }
106 }
107
108 Ok(text::normalize(&text))
109 }
110
111 fn supported_extensions(&self) -> &[&str] {
112 &["md", "markdown"]
113 }
114}
115
116struct TextParser;
118
119impl Parser for TextParser {
120 fn parse(&self, path: &Path) -> Result<String> {
121 let content = std::fs::read_to_string(path)?;
122 Ok(text::normalize(&content))
123 }
124
125 fn supported_extensions(&self) -> &[&str] {
126 &["txt", "text"]
127 }
128}
129
130struct PdfParser;
132
133impl Parser for PdfParser {
134 fn parse(&self, path: &Path) -> Result<String> {
135 let bytes = std::fs::read(path)?;
136 let text = pdf_extract::extract_text_from_mem(&bytes)
137 .map_err(|e| CPError::Parse(format!("PDF extraction failed: {e}")))?;
138 Ok(text::normalize(&text))
139 }
140
141 fn supported_extensions(&self) -> &[&str] {
142 &["pdf"]
143 }
144}
145
146struct DocxParser;
148
149impl Parser for DocxParser {
150 fn parse(&self, path: &Path) -> Result<String> {
151 use dotext::MsDoc;
152 use std::io::Read;
153
154 let mut doc: dotext::Docx = dotext::Docx::open(path)
155 .map_err(|e| CPError::Parse(format!("DOCX open failed: {e}")))?;
156 let mut content = String::new();
157 doc.read_to_string(&mut content)
158 .map_err(|e| CPError::Parse(format!("DOCX read failed: {e}")))?;
159 Ok(text::normalize(&content))
160 }
161
162 fn supported_extensions(&self) -> &[&str] {
163 &["docx"]
164 }
165}
166
#[cfg(test)]
mod tests {
    use super::*;

    /// Both Markdown extensions resolve to a parser.
    #[test]
    fn test_registry_finds_markdown() {
        let registry = ParserRegistry::new();
        for ext in ["md", "markdown"] {
            assert!(registry.find_parser(ext).is_some());
        }
    }

    /// The PDF extension resolves to a parser.
    #[test]
    fn test_registry_finds_pdf() {
        let registry = ParserRegistry::new();
        let lookup = registry.find_parser("pdf");
        assert!(lookup.is_some());
    }

    /// An extension nobody registered yields no parser.
    #[test]
    fn test_unknown_extension() {
        let registry = ParserRegistry::new();
        let lookup = registry.find_parser("xyz");
        assert!(lookup.is_none());
    }
}