1use cp_core::{CPError, Result, text};
2use std::path::Path;
3
/// A file-to-text parser that can be stored in a [`ParserRegistry`].
///
/// The `Send + Sync` supertraits are required of every implementor, so boxed
/// parsers can be shared between threads.
pub trait Parser: Send + Sync {
    /// Reads the file at `path` and returns its textual content.
    ///
    /// # Errors
    ///
    /// Returns an error when the file cannot be read or its content cannot be
    /// converted to text by the implementation.
    fn parse(&self, path: &Path) -> Result<String>;

    /// Lists the file extensions this parser handles.
    ///
    /// `ParserRegistry::find_parser` compares these against the requested
    /// extension while ignoring ASCII case.
    fn supported_extensions(&self) -> &[&str];
}
12
/// Collection of the available [`Parser`] implementations, looked up by
/// file extension.
pub struct ParserRegistry {
    /// Registered parsers; `find_parser` scans them in order and returns the
    /// first one whose extension list matches.
    parsers: Vec<Box<dyn Parser>>,
}
17
18impl Default for ParserRegistry {
19 fn default() -> Self {
20 Self::new()
21 }
22}
23
24impl ParserRegistry {
25 pub fn new() -> Self {
27 Self {
28 parsers: vec![
29 Box::new(MarkdownParser),
30 Box::new(TextParser),
31 Box::new(PdfParser),
32 ],
33 }
34 }
35
36 pub fn find_parser(&self, extension: &str) -> Option<&dyn Parser> {
38 for parser in &self.parsers {
39 if parser
40 .supported_extensions()
41 .iter()
42 .any(|e| e.eq_ignore_ascii_case(extension))
43 {
44 return Some(parser.as_ref());
45 }
46 }
47 None
48 }
49}
50
51pub fn parse_file(path: &Path) -> Result<String> {
53 let registry = ParserRegistry::new();
54
55 let extension = path
56 .extension()
57 .and_then(|e| e.to_str())
58 .ok_or_else(|| CPError::Parse("No file extension".into()))?;
59
60 let parser = registry
61 .find_parser(extension)
62 .ok_or_else(|| CPError::Parse(format!("No parser for extension: {}", extension)))?;
63
64 parser.parse(path)
65}
66
67struct MarkdownParser;
69
70impl Parser for MarkdownParser {
71 fn parse(&self, path: &Path) -> Result<String> {
72 let content = std::fs::read_to_string(path)?;
73
74 let parser = pulldown_cmark::Parser::new(&content);
76 let mut text = String::new();
77
78 for event in parser {
79 match event {
80 pulldown_cmark::Event::Start(pulldown_cmark::Tag::Heading { level, .. }) => {
81 text.push('\n');
82 let level_str = match level {
83 pulldown_cmark::HeadingLevel::H1 => "# ",
84 pulldown_cmark::HeadingLevel::H2 => "## ",
85 pulldown_cmark::HeadingLevel::H3 => "### ",
86 pulldown_cmark::HeadingLevel::H4 => "#### ",
87 pulldown_cmark::HeadingLevel::H5 => "##### ",
88 pulldown_cmark::HeadingLevel::H6 => "###### ",
89 };
90 text.push_str(level_str);
91 }
92 pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Heading(_)) => {
93 text.push('\n');
94 }
95 pulldown_cmark::Event::Text(t)
96 | pulldown_cmark::Event::Code(t) => {
97 text.push_str(&t);
98 }
99 pulldown_cmark::Event::SoftBreak
100 | pulldown_cmark::Event::HardBreak => {
101 text.push('\n');
102 }
103 pulldown_cmark::Event::End(pulldown_cmark::TagEnd::Paragraph) => {
104 text.push_str("\n\n");
105 }
106 _ => {}
107 }
108 }
109
110 Ok(text::normalize(&text))
111 }
112
113 fn supported_extensions(&self) -> &[&str] {
114 &["md", "markdown"]
115 }
116}
117
118struct TextParser;
120
121impl Parser for TextParser {
122 fn parse(&self, path: &Path) -> Result<String> {
123 let content = std::fs::read_to_string(path)?;
124 Ok(text::normalize(&content))
125 }
126
127 fn supported_extensions(&self) -> &[&str] {
128 &["txt", "text"]
129 }
130}
131
132struct PdfParser;
134
135impl Parser for PdfParser {
136 fn parse(&self, path: &Path) -> Result<String> {
137 let bytes = std::fs::read(path)?;
138 let text = pdf_extract::extract_text_from_mem(&bytes)
139 .map_err(|e| CPError::Parse(format!("PDF extraction failed: {}", e)))?;
140 Ok(text::normalize(&text))
141 }
142
143 fn supported_extensions(&self) -> &[&str] {
144 &["pdf"]
145 }
146}
147
#[cfg(test)]
mod tests {
    use super::*;

    /// Both Markdown extensions must resolve to a parser.
    #[test]
    fn test_registry_finds_markdown() {
        let registry = ParserRegistry::new();
        for ext in ["md", "markdown"] {
            assert!(
                registry.find_parser(ext).is_some(),
                "expected a parser for extension {ext:?}"
            );
        }
    }

    /// The PDF extension must resolve to a parser.
    #[test]
    fn test_registry_finds_pdf() {
        let lookup = ParserRegistry::new();
        let found = lookup.find_parser("pdf");
        assert!(found.is_some(), "expected a parser for \"pdf\"");
    }

    /// An extension nobody registered must yield no parser.
    #[test]
    fn test_unknown_extension() {
        let lookup = ParserRegistry::new();
        let found = lookup.find_parser("xyz");
        assert!(found.is_none(), "did not expect a parser for \"xyz\"");
    }
}