1use std::path::Path;
22
/// The result of successfully parsing one file with [`parse_file`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedDocument {
    /// Text extracted from the file. `parse_file` never returns a document
    /// whose text is empty or whitespace-only; that case is an error instead.
    pub text: String,
    /// MIME type selected from the file's extension (not sniffed from content).
    pub mime_type: String,
    /// Size of the file in bytes as reported by filesystem metadata. This is
    /// the on-disk size, not the length of `text`.
    pub byte_size: u64,
}
30
/// Everything that can go wrong while turning a file into a [`ParsedDocument`].
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
    /// The file's extension is not one this module handles. The payload is the
    /// lowercased extension, or the literal `(no extension)` when `parse_file`
    /// could not obtain one from the path.
    #[error("unsupported extension: {0}")]
    UnsupportedExtension(String),

    /// The file's bytes could not be decoded as UTF-8. Converts automatically
    /// from `FromUtf8Error` via `?`.
    #[error("file is not valid UTF-8: {0}")]
    InvalidUtf8(#[from] std::string::FromUtf8Error),

    /// A filesystem operation failed. Converts automatically from
    /// `std::io::Error` via `?`.
    #[error("io error: {0}")]
    Io(#[from] std::io::Error),

    /// PDF text extraction failed; the payload is the extractor's error
    /// rendered as a string.
    #[error("PDF parse error: {0}")]
    Pdf(String),

    /// HTML-to-text conversion failed; the payload is the converter's error
    /// rendered as a string.
    #[error("HTML parse error: {0}")]
    Html(String),

    /// Parsing succeeded but the extracted text was empty or consisted only of
    /// whitespace.
    #[error("file is empty")]
    Empty,
}
52
/// Accepted file extensions paired with the MIME type reported for each.
///
/// Keys are lowercase and have no leading dot; `parse_file` lowercases the
/// path's extension before looking it up here, so matching is effectively
/// ASCII-case-insensitive. Several extensions may map to the same MIME type.
///
/// `parse_file` also dispatches on the MIME type: `application/pdf` and
/// `text/html` go to dedicated extractors, every other entry is read as UTF-8
/// text.
pub(crate) const ALLOWED: &[(&str, &str)] = &[
    ("md", "text/markdown"),
    ("markdown", "text/markdown"),
    ("txt", "text/plain"),
    ("rs", "text/x-rust"),
    ("py", "text/x-python"),
    ("toml", "application/toml"),
    ("yaml", "application/yaml"),
    ("yml", "application/yaml"),
    ("json", "application/json"),
    ("pdf", "application/pdf"),
    ("html", "text/html"),
    ("htm", "text/html"),
];
75
76pub fn parse_file(path: &Path) -> Result<ParsedDocument, ParseError> {
80 let ext = path
81 .extension()
82 .and_then(|e| e.to_str())
83 .map(|s| s.to_ascii_lowercase())
84 .ok_or_else(|| ParseError::UnsupportedExtension(String::from("(no extension)")))?;
85
86 let mime = ALLOWED
87 .iter()
88 .find(|(e, _)| *e == ext)
89 .map(|(_, m)| *m)
90 .ok_or_else(|| ParseError::UnsupportedExtension(ext.clone()))?;
91
92 let byte_size = std::fs::metadata(path)?.len();
93
94 let text = match mime {
95 "application/pdf" => parse_pdf(path)?,
96 "text/html" => parse_html(path)?,
97 _ => parse_plaintext(path)?,
98 };
99
100 if text.trim().is_empty() {
101 return Err(ParseError::Empty);
102 }
103
104 Ok(ParsedDocument {
105 text,
106 mime_type: mime.to_string(),
107 byte_size,
108 })
109}
110
111fn parse_plaintext(path: &Path) -> Result<String, ParseError> {
112 let bytes = std::fs::read(path)?;
113 Ok(String::from_utf8(bytes)?)
114}
115
116fn parse_pdf(path: &Path) -> Result<String, ParseError> {
117 pdf_extract::extract_text(path).map_err(|e| ParseError::Pdf(format!("{e}")))
118}
119
120fn parse_html(path: &Path) -> Result<String, ParseError> {
121 let html = std::fs::read_to_string(path)?;
122 html2text::from_read(html.as_bytes(), 80_000).map_err(|e| ParseError::Html(format!("{e}")))
127}
128
// Unit tests for `parse_file`. Each test writes a fixture into a fresh
// temporary directory and parses it through the public entry point.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Creates `name` inside `dir`, writes `body` to it, syncs the file, and
    /// returns the file's full path.
    fn write_file(dir: &TempDir, name: &str, body: &[u8]) -> std::path::PathBuf {
        let path = dir.path().join(name);
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(body).unwrap();
        // Presumably so the size later read via `fs::metadata` reflects the
        // complete body.
        f.sync_all().unwrap();
        path
    }

    /// A `.md` file comes back verbatim with the markdown MIME type and a
    /// byte size equal to the body length.
    #[test]
    fn parse_markdown_file_returns_text() {
        let tmp = TempDir::new().unwrap();
        let body = "# Hello\n\nThis is a markdown file.";
        let path = write_file(&tmp, "note.md", body.as_bytes());

        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/markdown");
        assert_eq!(out.byte_size, body.len() as u64);
    }

    /// A `.txt` file comes back verbatim as `text/plain`.
    #[test]
    fn parse_plain_text_file() {
        let tmp = TempDir::new().unwrap();
        let body = "Hello world.\n";
        let path = write_file(&tmp, "x.txt", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/plain");
    }

    /// Source code is treated as plain text: returned verbatim, tagged
    /// `text/x-rust`.
    #[test]
    fn parse_rust_source() {
        let tmp = TempDir::new().unwrap();
        let body = "fn main() {\n println!(\"hi\");\n}\n";
        let path = write_file(&tmp, "main.rs", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/x-rust");
    }

    /// Extension matching ignores ASCII case: `.MD` resolves like `.md`.
    #[test]
    fn parse_uppercase_extension_is_accepted() {
        let tmp = TempDir::new().unwrap();
        let body = "# upper";
        let path = write_file(&tmp, "README.MD", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.mime_type, "text/markdown");
    }

    /// HTML is rendered to text: paragraph content survives, `<script>`
    /// content does not.
    #[test]
    fn parse_html_strips_tags() {
        let tmp = TempDir::new().unwrap();
        let body = "<html><body><p>hello world</p><script>var x = 'nope';</script></body></html>";
        let path = write_file(&tmp, "page.html", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert!(
            out.text.contains("hello world"),
            "expected 'hello world' in: {:?}",
            out.text
        );
        assert!(
            !out.text.contains("nope"),
            "script body should not appear in text: {:?}",
            out.text
        );
        assert_eq!(out.mime_type, "text/html");
    }

    /// An extension missing from the allow-list is rejected, and the error
    /// carries that extension.
    #[test]
    fn parse_unsupported_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "blob.bin", b"\x00\x01\x02");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "bin"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    /// A path with no extension is rejected with the `(no extension)`
    /// placeholder as payload.
    #[test]
    fn parse_file_without_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "noext", b"hello");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "(no extension)"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    /// A zero-byte text file yields `ParseError::Empty`.
    #[test]
    fn parse_empty_file_errors_with_empty_variant() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "empty.txt", b"");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    /// A file containing only whitespace is also treated as empty.
    #[test]
    fn parse_whitespace_only_file_errors_with_empty_variant() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "ws.txt", b" \n\t\n \n");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    /// `byte_size` equals the number of bytes written to the file.
    #[test]
    fn parse_returns_byte_size_correctly() {
        let tmp = TempDir::new().unwrap();
        let body = b"abcdefghij";
        let path = write_file(&tmp, "sized.txt", body);
        let out = parse_file(&path).unwrap();
        assert_eq!(out.byte_size, 10);
    }

    /// Bytes that are not valid UTF-8 in a text file yield
    /// `ParseError::InvalidUtf8`.
    #[test]
    fn parse_invalid_utf8_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "bad.txt", &[0xff, 0xfe, 0xfd]);
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::InvalidUtf8(_)), "got: {err:?}");
    }

    /// Assembles a one-page PDF in memory and returns its bytes.
    ///
    /// The document has five objects (catalog, page tree, page, Helvetica font,
    /// and a content stream that draws "Hello PDF"), followed by a
    /// cross-reference table and trailer. Object offsets and the `startxref`
    /// value are taken from the buffer length as it is built, so the order of
    /// the statements below matters.
    fn minimal_pdf() -> Vec<u8> {
        // NOTE(review): the bytes between `stream\n` and `endstream` in object 5
        // appear to total 41 as written here, while the dictionary declares
        // `/Length 44` — confirm whether the mismatch is intentional.
        let objects: [&str; 5] = [
            "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
            "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
             /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>\nendobj\n",
            "4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
            "5 0 obj\n<< /Length 44 >>\nstream\nBT\n/F1 24 Tf\n72 720 Td\n(Hello PDF) Tj\nET\nendstream\nendobj\n",
        ];

        let mut buf = Vec::new();
        // Version header followed by a comment line made of bytes >= 0x80.
        buf.extend_from_slice(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n");
        let mut offsets: Vec<usize> = Vec::with_capacity(objects.len());
        for obj in &objects {
            // Record where this object starts *before* appending it.
            offsets.push(buf.len());
            buf.extend_from_slice(obj.as_bytes());
        }
        // The xref table begins here; the trailer's `startxref` points at it.
        let xref_offset = buf.len();
        // Entry count is the five objects plus the leading free entry.
        buf.extend_from_slice(format!("xref\n0 {}\n", objects.len() + 1).as_bytes());
        buf.extend_from_slice(b"0000000000 65535 f \n");
        for off in &offsets {
            // One in-use entry per object: offset zero-padded to ten digits.
            buf.extend_from_slice(format!("{:010} 00000 n \n", off).as_bytes());
        }
        buf.extend_from_slice(
            format!(
                "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
                objects.len() + 1,
                xref_offset
            )
            .as_bytes(),
        );
        buf
    }

    /// Checks that a `.pdf` is routed to the PDF extractor.
    ///
    /// Deliberately lenient: it passes when text containing "hello" is
    /// extracted, and also when the outcome is `ParseError::Empty` or
    /// `ParseError::Pdf` (both are logged to stderr). Any other error variant
    /// fails the test.
    #[test]
    fn parse_pdf_extracts_known_text() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "hello.pdf", &minimal_pdf());

        match parse_file(&path) {
            Ok(out) => {
                assert_eq!(out.mime_type, "application/pdf");
                assert!(
                    out.text.to_lowercase().contains("hello"),
                    "extracted text missing 'hello': {:?}",
                    out.text
                );
            }
            Err(ParseError::Empty) => {
                eprintln!("parse_pdf: extracted text was empty (acceptable for minimal fixture)");
            }
            Err(ParseError::Pdf(msg)) => {
                eprintln!(
                    "parse_pdf: pdf-extract rejected minimal fixture (acceptable): {msg}"
                );
            }
            Err(other) => panic!("parse_pdf: unexpected error variant: {other:?}"),
        }
    }
}