use std::path::Path;
/// Result of successfully parsing a file with [`parse_file`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedDocument {
    /// Text content extracted from the file.
    pub text: String,
    /// MIME type selected from the file's extension.
    pub mime_type: String,
    /// Size of the file in bytes, as reported by filesystem metadata.
    pub byte_size: u64,
}
/// Errors that can be returned while parsing a file.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
    /// The extension is missing or is not one of the supported ones; carries
    /// the offending extension (or a placeholder when there is none).
    #[error("unsupported extension: {0}")]
    UnsupportedExtension(String),

    /// The file's bytes could not be decoded as UTF-8.
    #[error("file is not valid UTF-8: {0}")]
    InvalidUtf8(#[from] std::string::FromUtf8Error),

    /// An underlying filesystem operation failed.
    #[error("io error: {0}")]
    Io(#[from] std::io::Error),

    /// PDF text extraction failed; carries the extractor's message.
    #[error("PDF parse error: {0}")]
    Pdf(String),

    /// HTML-to-text conversion failed; carries the converter's message.
    #[error("HTML parse error: {0}")]
    Html(String),

    /// The file produced no text.
    #[error("file is empty")]
    Empty,
}
/// Supported file extensions (lowercase, without the dot) paired with the
/// MIME type reported for them.
pub(crate) const ALLOWED: &[(&str, &str)] = &[
    // Prose
    ("md", "text/markdown"),
    ("markdown", "text/markdown"),
    ("txt", "text/plain"),
    // Source code
    ("rs", "text/x-rust"),
    ("py", "text/x-python"),
    // Structured data / configuration
    ("toml", "application/toml"),
    ("yaml", "application/yaml"),
    ("yml", "application/yaml"),
    ("json", "application/json"),
    // Formats that need a dedicated text extractor
    ("pdf", "application/pdf"),
    ("html", "text/html"),
    ("htm", "text/html"),
];
/// Parses the file at `path` into a [`ParsedDocument`].
///
/// The file type is decided from the extension alone (compared ASCII
/// case-insensitively) by looking it up in [`ALLOWED`]. PDF and HTML files go
/// through their dedicated extractors; every other supported type is read as
/// UTF-8 text.
///
/// # Errors
///
/// * [`ParseError::UnsupportedExtension`] — the path has no extension that can
///   be read as UTF-8 (payload `"(no extension)"`), or the lowercased
///   extension is not listed in [`ALLOWED`] (payload is that extension).
/// * [`ParseError::Io`] — reading the file's metadata or contents failed.
/// * [`ParseError::InvalidUtf8`], [`ParseError::Pdf`], [`ParseError::Html`] —
///   propagated from the type-specific parser.
/// * [`ParseError::Empty`] — the extracted text is empty or whitespace only.
pub fn parse_file(path: &Path) -> Result<ParsedDocument, ParseError> {
    let ext = match path.extension().and_then(|raw| raw.to_str()) {
        Some(raw) => raw.to_ascii_lowercase(),
        None => {
            return Err(ParseError::UnsupportedExtension(String::from(
                "(no extension)",
            )));
        }
    };

    let entry = ALLOWED.iter().find(|(known, _)| *known == ext);
    let mime = match entry {
        Some(&(_, mime)) => mime,
        None => return Err(ParseError::UnsupportedExtension(ext)),
    };

    // The size comes from filesystem metadata, queried before the content is read.
    let byte_size = std::fs::metadata(path)?.len();

    let text = if mime == "application/pdf" {
        parse_pdf(path)?
    } else if mime == "text/html" {
        parse_html(path)?
    } else {
        parse_plaintext(path)?
    };

    if text.trim().is_empty() {
        Err(ParseError::Empty)
    } else {
        Ok(ParsedDocument {
            text,
            mime_type: mime.to_owned(),
            byte_size,
        })
    }
}
fn parse_plaintext(path: &Path) -> Result<String, ParseError> {
let bytes = std::fs::read(path)?;
Ok(String::from_utf8(bytes)?)
}
/// Extracts the text of the PDF at `path` via `pdf_extract`.
///
/// # Errors
///
/// Any extractor failure is returned as [`ParseError::Pdf`] carrying the
/// error's `Display` text.
fn parse_pdf(path: &Path) -> Result<String, ParseError> {
    match pdf_extract::extract_text(path) {
        Ok(text) => Ok(text),
        Err(err) => Err(ParseError::Pdf(err.to_string())),
    }
}
/// Reads the HTML file at `path` and converts it to plain text via `html2text`.
///
/// # Errors
///
/// * [`ParseError::Io`] — the file could not be read.
/// * [`ParseError::InvalidUtf8`] — the file's bytes are not valid UTF-8.
/// * [`ParseError::Html`] — `html2text` failed; carries the error's `Display` text.
fn parse_html(path: &Path) -> Result<String, ParseError> {
    // Read raw bytes and decode explicitly: `fs::read_to_string` would report
    // bad encoding as an `io::Error`, hiding it behind `ParseError::Io` instead
    // of the dedicated `InvalidUtf8` variant used for plain-text files.
    let html = String::from_utf8(std::fs::read(path)?)?;
    // NOTE(review): 80_000 is passed as html2text's width argument, presumably
    // large enough that output lines are effectively never wrapped — confirm.
    html2text::from_read(html.as_bytes(), 80_000).map_err(|e| ParseError::Html(e.to_string()))
}
/// Unit tests for [`parse_file`], driven through real files in a temp directory.
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Creates `name` inside `dir`, writes `body` to it, syncs it to disk and
    /// returns the full path.
    fn write_file(dir: &TempDir, name: &str, body: &[u8]) -> std::path::PathBuf {
        let path = dir.path().join(name);
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(body).unwrap();
        f.sync_all().unwrap();
        path
    }

    #[test]
    fn parse_markdown_file_returns_text() {
        let tmp = TempDir::new().unwrap();
        let body = "# Hello\n\nThis is a markdown file.";
        let path = write_file(&tmp, "note.md", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/markdown");
        assert_eq!(out.byte_size, body.len() as u64);
    }

    #[test]
    fn parse_plain_text_file() {
        let tmp = TempDir::new().unwrap();
        let body = "Hello world.\n";
        let path = write_file(&tmp, "x.txt", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/plain");
    }

    #[test]
    fn parse_rust_source() {
        let tmp = TempDir::new().unwrap();
        let body = "fn main() {\n println!(\"hi\");\n}\n";
        let path = write_file(&tmp, "main.rs", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/x-rust");
    }

    #[test]
    fn parse_uppercase_extension_is_accepted() {
        let tmp = TempDir::new().unwrap();
        let body = "# upper";
        let path = write_file(&tmp, "README.MD", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.mime_type, "text/markdown");
    }

    #[test]
    fn parse_html_strips_tags() {
        let tmp = TempDir::new().unwrap();
        let body = "<html><body><p>hello world</p><script>var x = 'nope';</script></body></html>";
        let path = write_file(&tmp, "page.html", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert!(
            out.text.contains("hello world"),
            "expected 'hello world' in: {:?}",
            out.text
        );
        assert!(
            !out.text.contains("nope"),
            "script body should not appear in text: {:?}",
            out.text
        );
        assert_eq!(out.mime_type, "text/html");
    }

    #[test]
    fn parse_unsupported_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "blob.bin", b"\x00\x01\x02");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "bin"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    #[test]
    fn parse_file_without_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "noext", b"hello");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "(no extension)"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    #[test]
    fn parse_empty_file_errors_with_empty_variant() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "empty.txt", b"");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    #[test]
    fn parse_whitespace_only_file_errors_with_empty_variant() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "ws.txt", b" \n\t\n \n");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    #[test]
    fn parse_returns_byte_size_correctly() {
        let tmp = TempDir::new().unwrap();
        let body = b"abcdefghij";
        let path = write_file(&tmp, "sized.txt", body);
        let out = parse_file(&path).unwrap();
        assert_eq!(out.byte_size, 10);
    }

    #[test]
    fn parse_invalid_utf8_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "bad.txt", &[0xff, 0xfe, 0xfd]);
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::InvalidUtf8(_)), "got: {err:?}");
    }

    /// Builds a small single-page PDF in memory whose content stream draws the
    /// string "Hello PDF", with a cross-reference table computed from the
    /// actual byte offsets of the objects.
    fn minimal_pdf() -> Vec<u8> {
        // Stream data only: the EOL written before `endstream` is not counted
        // in /Length.
        let content = "BT\n/F1 24 Tf\n72 720 Td\n(Hello PDF) Tj\nET";
        // Derive /Length from the data instead of hard-coding it, so the
        // declared length can never disagree with the bytes actually written.
        let content_obj = format!(
            "5 0 obj\n<< /Length {} >>\nstream\n{}\nendstream\nendobj\n",
            content.len(),
            content
        );
        let objects: [&str; 5] = [
            "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
            "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
             /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>\nendobj\n",
            "4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
            content_obj.as_str(),
        ];

        let mut buf = Vec::new();
        // Header plus a comment line of high-bit bytes.
        buf.extend_from_slice(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n");

        // Record where each object starts; the xref table needs these offsets.
        let mut offsets: Vec<usize> = Vec::with_capacity(objects.len());
        for obj in &objects {
            offsets.push(buf.len());
            buf.extend_from_slice(obj.as_bytes());
        }

        let xref_offset = buf.len();
        // The table has one more entry than there are objects: entry 0 is the
        // free-list head.
        buf.extend_from_slice(format!("xref\n0 {}\n", objects.len() + 1).as_bytes());
        buf.extend_from_slice(b"0000000000 65535 f \n");
        for off in &offsets {
            buf.extend_from_slice(format!("{:010} 00000 n \n", off).as_bytes());
        }
        buf.extend_from_slice(
            format!(
                "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
                objects.len() + 1,
                xref_offset
            )
            .as_bytes(),
        );
        buf
    }

    #[test]
    fn parse_pdf_extracts_known_text() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "hello.pdf", &minimal_pdf());
        // The hand-built fixture is deliberately tiny, so extractor rejection
        // or empty output is tolerated; only an unrelated error variant or
        // wrong extracted text fails the test.
        match parse_file(&path) {
            Ok(out) => {
                assert_eq!(out.mime_type, "application/pdf");
                assert!(
                    out.text.to_lowercase().contains("hello"),
                    "extracted text missing 'hello': {:?}",
                    out.text
                );
            }
            Err(ParseError::Empty) => {
                eprintln!("parse_pdf: extracted text was empty (acceptable for minimal fixture)");
            }
            Err(ParseError::Pdf(msg)) => {
                eprintln!(
                    "parse_pdf: pdf-extract rejected minimal fixture (acceptable): {msg}"
                );
            }
            Err(other) => panic!("parse_pdf: unexpected error variant: {other:?}"),
        }
    }
}