Skip to main content

solo_storage/document/
parse.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Parse a document file by path: detect format, return normalized UTF-8 text.
4//!
5//! Format detection is by file extension only (no magic-byte sniffing). The
6//! allow-list in [`ALLOWED`] is the source of truth for which file types
7//! Solo accepts via `solo ingest`; anything outside it errors with
8//! [`ParseError::UnsupportedExtension`].
9//!
10//! ## Backends
11//!
//! - Plaintext / markdown / source code → `std::fs::read` followed by
//!   `String::from_utf8` (must be valid UTF-8; latin-1 / shift-jis / etc. are
//!   rejected, matching the storage layer's UTF-8-only invariant).
15//! - PDF → [`pdf_extract::extract_text`] (pure-Rust, no C deps; quality is
16//!   acceptable for text-bearing PDFs but degrades on scanned / image-only
17//!   PDFs — see ADR-0003 / risk #1 in 0083).
18//! - HTML → [`html2text::from_read`] with a deliberately huge wrap width
19//!   (80 000 cols) so the chunker isn't fed artificial line-breaks.
20
21use std::path::Path;
22
/// Successful result of [`parse_file`].
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct ParsedDocument {
    /// Text produced by the format-specific backend (plaintext, PDF or HTML).
    pub text: String,
    /// MIME type looked up from the file's extension.
    pub mime_type: String,
    /// Size of the source file in bytes, as reported by filesystem metadata.
    /// This is the on-disk size, not the length of `text`.
    pub byte_size: u64,
}
30
31/// Errors surfaced from [`parse_file`].
32#[derive(Debug, thiserror::Error)]
33pub enum ParseError {
34    #[error("unsupported extension: {0}")]
35    UnsupportedExtension(String),
36
37    #[error("file is not valid UTF-8: {0}")]
38    InvalidUtf8(#[from] std::string::FromUtf8Error),
39
40    #[error("io error: {0}")]
41    Io(#[from] std::io::Error),
42
43    #[error("PDF parse error: {0}")]
44    Pdf(String),
45
46    #[error("HTML parse error: {0}")]
47    Html(String),
48
49    #[error("file is empty")]
50    Empty,
51}
52
/// Extension → MIME type allow-list.
///
/// Each entry is `(lower-case extension without the dot, MIME type)`. An
/// extension that is missing from this table is rejected with
/// [`ParseError::UnsupportedExtension`]. Callers lower-case the extension
/// before the lookup, so `README.MD` and `Doc.PDF` both match.
///
/// Keep this in sync with `default_allowed_extensions()` in
/// `crate::config::DocumentConfig`.
pub(crate) const ALLOWED: &[(&str, &str)] = &[
    // Prose
    ("md", "text/markdown"),
    ("markdown", "text/markdown"),
    ("txt", "text/plain"),
    // Source code
    ("rs", "text/x-rust"),
    ("py", "text/x-python"),
    // Structured data / configuration
    ("toml", "application/toml"),
    ("yaml", "application/yaml"),
    ("yml", "application/yaml"),
    ("json", "application/json"),
    // Formats handled by dedicated backends
    ("pdf", "application/pdf"),
    ("html", "text/html"),
    ("htm", "text/html"),
];
75
76/// Parse a file at `path`. Returns the normalized text + mime_type + raw byte
77/// size of the source file (which is NOT the same as `text.len()` for PDF /
78/// HTML — those backends transform the input).
79pub fn parse_file(path: &Path) -> Result<ParsedDocument, ParseError> {
80    let ext = path
81        .extension()
82        .and_then(|e| e.to_str())
83        .map(|s| s.to_ascii_lowercase())
84        .ok_or_else(|| ParseError::UnsupportedExtension(String::from("(no extension)")))?;
85
86    let mime = ALLOWED
87        .iter()
88        .find(|(e, _)| *e == ext)
89        .map(|(_, m)| *m)
90        .ok_or_else(|| ParseError::UnsupportedExtension(ext.clone()))?;
91
92    let byte_size = std::fs::metadata(path)?.len();
93
94    let text = match mime {
95        "application/pdf" => parse_pdf(path)?,
96        "text/html" => parse_html(path)?,
97        _ => parse_plaintext(path)?,
98    };
99
100    if text.trim().is_empty() {
101        return Err(ParseError::Empty);
102    }
103
104    Ok(ParsedDocument {
105        text,
106        mime_type: mime.to_string(),
107        byte_size,
108    })
109}
110
111fn parse_plaintext(path: &Path) -> Result<String, ParseError> {
112    let bytes = std::fs::read(path)?;
113    Ok(String::from_utf8(bytes)?)
114}
115
116fn parse_pdf(path: &Path) -> Result<String, ParseError> {
117    pdf_extract::extract_text(path).map_err(|e| ParseError::Pdf(format!("{e}")))
118}
119
120fn parse_html(path: &Path) -> Result<String, ParseError> {
121    let html = std::fs::read_to_string(path)?;
122    // html2text wraps lines at `width` columns; pick a huge width so the
123    // chunker isn't fed artificial line breaks that distort paragraph
124    // boundaries. `from_read(input, width) -> Result<String, Error>` since
125    // html2text 0.13.
126    html2text::from_read(html.as_bytes(), 80_000).map_err(|e| ParseError::Html(format!("{e}")))
127}
128
129// ---------------------------------------------------------------------------
130// Tests
131// ---------------------------------------------------------------------------
132
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Create `name` inside `dir` with contents `body`, sync it to disk, and
    /// return the full path.
    fn write_file(dir: &TempDir, name: &str, body: &[u8]) -> std::path::PathBuf {
        let path = dir.path().join(name);
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(body).unwrap();
        f.sync_all().unwrap();
        path
    }

    #[test]
    fn parse_markdown_file_returns_text() {
        let tmp = TempDir::new().unwrap();
        let body = "# Hello\n\nThis is a markdown file.";
        let path = write_file(&tmp, "note.md", body.as_bytes());

        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/markdown");
        assert_eq!(out.byte_size, body.len() as u64);
    }

    #[test]
    fn parse_plain_text_file() {
        let tmp = TempDir::new().unwrap();
        let body = "Hello world.\n";
        let path = write_file(&tmp, "x.txt", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/plain");
    }

    #[test]
    fn parse_rust_source() {
        let tmp = TempDir::new().unwrap();
        let body = "fn main() {\n    println!(\"hi\");\n}\n";
        let path = write_file(&tmp, "main.rs", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/x-rust");
    }

    #[test]
    fn parse_uppercase_extension_is_accepted() {
        // README.MD should work — extension matching is case-insensitive.
        let tmp = TempDir::new().unwrap();
        let body = "# upper";
        let path = write_file(&tmp, "README.MD", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.mime_type, "text/markdown");
    }

    #[test]
    fn parse_html_strips_tags() {
        let tmp = TempDir::new().unwrap();
        // html2text strips `<script>` content by default; the visible "hello"
        // must survive but the script body must not appear as text.
        let body = "<html><body><p>hello world</p><script>var x = 'nope';</script></body></html>";
        let path = write_file(&tmp, "page.html", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert!(
            out.text.contains("hello world"),
            "expected 'hello world' in: {:?}",
            out.text
        );
        assert!(
            !out.text.contains("nope"),
            "script body should not appear in text: {:?}",
            out.text
        );
        assert_eq!(out.mime_type, "text/html");
    }

    #[test]
    fn parse_unsupported_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "blob.bin", b"\x00\x01\x02");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "bin"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    #[test]
    fn parse_file_without_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "noext", b"hello");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "(no extension)"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    #[test]
    fn parse_empty_file_errors_with_empty_variant() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "empty.txt", b"");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    #[test]
    fn parse_whitespace_only_file_errors_with_empty_variant() {
        // A file containing only whitespace should also be treated as empty —
        // there's nothing to chunk or embed.
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "ws.txt", b"   \n\t\n  \n");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    #[test]
    fn parse_returns_byte_size_correctly() {
        let tmp = TempDir::new().unwrap();
        let body = b"abcdefghij"; // 10 bytes
        let path = write_file(&tmp, "sized.txt", body);
        let out = parse_file(&path).unwrap();
        assert_eq!(out.byte_size, 10);
    }

    #[test]
    fn parse_invalid_utf8_errors() {
        // Non-UTF8 bytes in a .txt file should surface as InvalidUtf8, not
        // panic and not silently lossy-decode.
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "bad.txt", &[0xff, 0xfe, 0xfd]);
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::InvalidUtf8(_)), "got: {err:?}");
    }

    /// A minimal PDF 1.4 byte stream containing the literal text
    /// "Hello PDF". Generated inline (no binary fixture commit) so the
    /// test is self-contained and reproducible across platforms.
    ///
    /// Structure: 1 catalog, 1 pages, 1 page object, 1 font, 1 content
    /// stream. Nothing here is hand-counted: the content stream's `/Length`
    /// is taken from the content string, and the xref offsets and
    /// `startxref` position are recorded while the buffer is assembled, so
    /// object bodies can be edited without recomputing any numbers.
    fn minimal_pdf() -> Vec<u8> {
        // Page content stream. The newline written before `endstream` is the
        // end-of-line marker that separates data from the keyword; it is not
        // part of the stream data and therefore not counted in `/Length`.
        let content = "BT\n/F1 24 Tf\n72 720 Td\n(Hello PDF) Tj\nET";
        let content_obj = format!(
            "5 0 obj\n<< /Length {} >>\nstream\n{}\nendstream\nendobj\n",
            content.len(),
            content
        );

        let objects: [&str; 5] = [
            "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
            "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
             /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>\nendobj\n",
            "4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
            content_obj.as_str(),
        ];

        let mut buf = Vec::new();
        buf.extend_from_slice(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"); // header + binary marker

        // Record each object's byte offset as it is appended; these feed the
        // xref table below.
        let mut offsets: Vec<usize> = Vec::with_capacity(objects.len());
        for obj in &objects {
            offsets.push(buf.len());
            buf.extend_from_slice(obj.as_bytes());
        }

        let xref_offset = buf.len();
        buf.extend_from_slice(format!("xref\n0 {}\n", objects.len() + 1).as_bytes());
        // Entry 0 is the mandatory head of the free list.
        buf.extend_from_slice(b"0000000000 65535 f \n");
        for off in &offsets {
            buf.extend_from_slice(format!("{:010} 00000 n \n", off).as_bytes());
        }
        buf.extend_from_slice(
            format!(
                "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
                objects.len() + 1,
                xref_offset
            )
            .as_bytes(),
        );
        buf
    }

    #[test]
    fn parse_pdf_extracts_known_text() {
        // Generate a tiny synthetic PDF at test time so we don't commit a
        // binary fixture. pdf-extract's text extraction is best-effort; if
        // it fails on this platform we log + continue (don't fail the
        // build). The build-blocking guarantee is that `parse_file` does
        // not panic and routes through `parse_pdf`.
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "hello.pdf", &minimal_pdf());

        match parse_file(&path) {
            Ok(out) => {
                assert_eq!(out.mime_type, "application/pdf");
                // The fixture's content is the literal string "Hello PDF".
                // pdf-extract may return the text with or without surrounding
                // whitespace; be lenient.
                assert!(
                    out.text.to_lowercase().contains("hello"),
                    "extracted text missing 'hello': {:?}",
                    out.text
                );
            }
            Err(ParseError::Empty) => {
                // pdf-extract found no extractable text in our minimal PDF —
                // acceptable for this hand-crafted fixture. Log + pass.
                eprintln!("parse_pdf: extracted text was empty (acceptable for minimal fixture)");
            }
            Err(ParseError::Pdf(msg)) => {
                eprintln!(
                    "parse_pdf: pdf-extract rejected minimal fixture (acceptable): {msg}"
                );
            }
            Err(other) => panic!("parse_pdf: unexpected error variant: {other:?}"),
        }
    }
}