solo-storage 0.10.2

// SPDX-License-Identifier: Apache-2.0

//! Parse a document file by path: detect format, return normalized UTF-8 text.
//!
//! Format detection is by file extension only (no magic-byte sniffing). The
//! allow-list in [`ALLOWED`] is the source of truth for which file types
//! Solo accepts via `solo ingest`; anything outside it errors with
//! [`ParseError::UnsupportedExtension`].
//!
//! ## Backends
//!
//! - Plaintext / markdown / source code → `std::fs::read` followed by
//!   `String::from_utf8` (must be valid UTF-8; latin-1 / shift-jis / etc. are
//!   rejected with [`ParseError::InvalidUtf8`], matching the storage layer's
//!   UTF-8-only invariant).
//! - PDF → [`pdf_extract::extract_text`] (pure-Rust, no C deps; quality is
//!   acceptable for text-bearing PDFs but degrades on scanned / image-only
//!   PDFs — see ADR-0003 / risk #1 in 0083).
//! - HTML → [`html2text::from_read`] with a deliberately huge wrap width
//!   (80 000 cols) so the chunker isn't fed artificial line-breaks.

use std::path::Path;

/// What [`parse_file`] returns on success.
///
/// Every field is populated by [`parse_file`]; the struct carries no
/// behavior of its own.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedDocument {
    /// Text produced by the format-specific backend: the file contents for
    /// plaintext-class files, the extracted text for PDF / HTML.
    /// [`parse_file`] rejects results that are empty or whitespace-only, so
    /// a document it returns always has some non-whitespace text.
    pub text: String,
    /// MIME type associated with the file's extension in the `ALLOWED`
    /// table.
    pub mime_type: String,
    /// Size in bytes of the source file as reported by filesystem metadata.
    /// This is the on-disk size, not the length of `text`.
    pub byte_size: u64,
}

/// Errors surfaced from [`parse_file`].
///
/// `InvalidUtf8` and `Io` are built automatically by `?` through the
/// `#[from]` conversions; the remaining variants are constructed explicitly
/// by the parsing code.
#[derive(Debug, thiserror::Error)]
pub enum ParseError {
    /// The file's extension is not in the allow-list. Carries the
    /// lower-cased extension that was rejected, or the placeholder
    /// `(no extension)` when the path has none.
    #[error("unsupported extension: {0}")]
    UnsupportedExtension(String),

    /// The file's bytes were not valid UTF-8. Wraps the error returned by
    /// `String::from_utf8`.
    #[error("file is not valid UTF-8: {0}")]
    InvalidUtf8(#[from] std::string::FromUtf8Error),

    /// A filesystem operation (reading metadata or file contents) failed.
    #[error("io error: {0}")]
    Io(#[from] std::io::Error),

    /// The PDF backend failed; carries the backend error rendered as text.
    #[error("PDF parse error: {0}")]
    Pdf(String),

    /// The HTML backend failed; carries the backend error rendered as text.
    #[error("HTML parse error: {0}")]
    Html(String),

    /// Parsing succeeded but the resulting text was empty or contained only
    /// whitespace.
    #[error("file is empty")]
    Empty,
}

/// Allow-listed extensions and their MIME types.
///
/// Anything outside this list returns [`ParseError::UnsupportedExtension`].
/// Extension matching is case-insensitive (lower-cased before lookup) so
/// `README.MD` and `Doc.PDF` work. Because the lookup compares against the
/// lower-cased extension, every key in this table must itself be lower-case
/// or it can never match.
///
/// The MIME type also selects the backend in [`parse_file`]:
/// `application/pdf` and `text/html` have dedicated extractors, every other
/// entry is read as UTF-8 text.
///
/// Keep this in sync with `default_allowed_extensions()` in
/// `crate::config::DocumentConfig`.
pub(crate) const ALLOWED: &[(&str, &str)] = &[
    // Read verbatim as UTF-8 text.
    ("md", "text/markdown"),
    ("markdown", "text/markdown"),
    ("txt", "text/plain"),
    ("rs", "text/x-rust"),
    ("py", "text/x-python"),
    ("toml", "application/toml"),
    ("yaml", "application/yaml"),
    ("yml", "application/yaml"),
    ("json", "application/json"),
    // Routed to the PDF text extractor.
    ("pdf", "application/pdf"),
    // Routed to the HTML-to-text converter.
    ("html", "text/html"),
    ("htm", "text/html"),
];

/// Parse the file at `path` into normalized UTF-8 text.
///
/// The format is chosen purely from the file extension, matched
/// case-insensitively against the `ALLOWED` table. On success the result
/// holds the normalized text, the MIME type, and the raw byte size of the
/// source file (which is NOT the same as `text.len()` for PDF / HTML — those
/// backends transform the input).
///
/// # Errors
///
/// - [`ParseError::UnsupportedExtension`] — the path has no extension, or
///   its extension is not allow-listed. The payload is the lower-cased
///   extension (lossily converted if it is not valid Unicode) or the
///   placeholder `(no extension)`.
/// - [`ParseError::Io`] — reading the file's metadata or contents failed.
/// - [`ParseError::InvalidUtf8`] — the file's bytes are not valid UTF-8.
/// - [`ParseError::Pdf`] / [`ParseError::Html`] — the format backend failed.
/// - [`ParseError::Empty`] — the parsed text is empty or whitespace-only.
pub fn parse_file(path: &Path) -> Result<ParsedDocument, ParseError> {
    let ext = match path.extension() {
        // Lossy conversion on purpose: an extension that is not valid Unicode
        // can never match the (ASCII) allow-list, but it should still be
        // reported as the extension it is rather than as "(no extension)".
        Some(raw) => raw.to_string_lossy().to_ascii_lowercase(),
        None => {
            return Err(ParseError::UnsupportedExtension(String::from("(no extension)")));
        }
    };

    let lookup = ALLOWED
        .iter()
        .find(|(known, _)| *known == ext)
        .map(|&(_, mime)| mime);
    let mime = match lookup {
        Some(mime) => mime,
        None => return Err(ParseError::UnsupportedExtension(ext)),
    };

    // Size of the file on disk, taken before any backend transforms the data.
    let byte_size = std::fs::metadata(path)?.len();

    // The MIME type doubles as the backend selector; everything without a
    // dedicated extractor is treated as UTF-8 text.
    let text = match mime {
        "application/pdf" => parse_pdf(path)?,
        "text/html" => parse_html(path)?,
        _ => parse_plaintext(path)?,
    };

    // Nothing to chunk or embed in a document with no visible text.
    if text.trim().is_empty() {
        return Err(ParseError::Empty);
    }

    Ok(ParsedDocument {
        text,
        mime_type: mime.to_string(),
        byte_size,
    })
}

fn parse_plaintext(path: &Path) -> Result<String, ParseError> {
    let bytes = std::fs::read(path)?;
    Ok(String::from_utf8(bytes)?)
}

/// Extract the text of the PDF at `path` with `pdf_extract`.
///
/// Any extraction failure is rendered to its display message and returned as
/// [`ParseError::Pdf`].
fn parse_pdf(path: &Path) -> Result<String, ParseError> {
    match pdf_extract::extract_text(path) {
        Ok(text) => Ok(text),
        Err(cause) => Err(ParseError::Pdf(cause.to_string())),
    }
}

/// Convert the HTML file at `path` to plain text with `html2text`.
///
/// The file must be valid UTF-8. The bytes are read first and validated
/// afterwards so that a decoding failure is reported as
/// [`ParseError::InvalidUtf8`] — the same variant the plaintext path uses —
/// instead of being folded into a generic [`ParseError::Io`]. Conversion
/// failures are returned as [`ParseError::Html`] carrying the backend's
/// message.
fn parse_html(path: &Path) -> Result<String, ParseError> {
    let html = String::from_utf8(std::fs::read(path)?)?;
    // html2text wraps lines at `width` columns; pick a huge width so the
    // chunker isn't fed artificial line breaks that distort paragraph
    // boundaries. `from_read(input, width) -> Result<String, Error>` since
    // html2text 0.13.
    html2text::from_read(html.as_bytes(), 80_000).map_err(|e| ParseError::Html(e.to_string()))
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Create `name` inside `dir`, write `body` to it, flush it to disk and
    /// return the full path.
    fn write_file(dir: &TempDir, name: &str, body: &[u8]) -> std::path::PathBuf {
        let path = dir.path().join(name);
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(body).unwrap();
        f.sync_all().unwrap();
        path
    }

    #[test]
    fn parse_markdown_file_returns_text() {
        let tmp = TempDir::new().unwrap();
        let body = "# Hello\n\nThis is a markdown file.";
        let path = write_file(&tmp, "note.md", body.as_bytes());

        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/markdown");
        assert_eq!(out.byte_size, body.len() as u64);
    }

    #[test]
    fn parse_plain_text_file() {
        let tmp = TempDir::new().unwrap();
        let body = "Hello world.\n";
        let path = write_file(&tmp, "x.txt", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/plain");
    }

    #[test]
    fn parse_rust_source() {
        let tmp = TempDir::new().unwrap();
        let body = "fn main() {\n    println!(\"hi\");\n}\n";
        let path = write_file(&tmp, "main.rs", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.text, body);
        assert_eq!(out.mime_type, "text/x-rust");
    }

    #[test]
    fn parse_uppercase_extension_is_accepted() {
        // README.MD should work — extension matching is case-insensitive.
        let tmp = TempDir::new().unwrap();
        let body = "# upper";
        let path = write_file(&tmp, "README.MD", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert_eq!(out.mime_type, "text/markdown");
    }

    #[test]
    fn parse_html_strips_tags() {
        let tmp = TempDir::new().unwrap();
        // html2text strips `<script>` content by default; the visible "hello"
        // must survive but the script body must not appear as text.
        let body = "<html><body><p>hello world</p><script>var x = 'nope';</script></body></html>";
        let path = write_file(&tmp, "page.html", body.as_bytes());
        let out = parse_file(&path).unwrap();
        assert!(
            out.text.contains("hello world"),
            "expected 'hello world' in: {:?}",
            out.text
        );
        assert!(
            !out.text.contains("nope"),
            "script body should not appear in text: {:?}",
            out.text
        );
        assert_eq!(out.mime_type, "text/html");
    }

    #[test]
    fn parse_unsupported_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "blob.bin", b"\x00\x01\x02");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "bin"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    #[test]
    fn parse_file_without_extension_errors() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "noext", b"hello");
        let err = parse_file(&path).unwrap_err();
        match err {
            ParseError::UnsupportedExtension(ext) => assert_eq!(ext, "(no extension)"),
            other => panic!("expected UnsupportedExtension, got {other:?}"),
        }
    }

    #[test]
    fn parse_empty_file_errors_with_empty_variant() {
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "empty.txt", b"");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    #[test]
    fn parse_whitespace_only_file_errors_with_empty_variant() {
        // A file containing only whitespace should also be treated as empty —
        // there's nothing to chunk or embed.
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "ws.txt", b"   \n\t\n  \n");
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::Empty), "got: {err:?}");
    }

    #[test]
    fn parse_returns_byte_size_correctly() {
        let tmp = TempDir::new().unwrap();
        let body = b"abcdefghij"; // 10 bytes
        let path = write_file(&tmp, "sized.txt", body);
        let out = parse_file(&path).unwrap();
        assert_eq!(out.byte_size, 10);
    }

    #[test]
    fn parse_invalid_utf8_errors() {
        // Non-UTF8 bytes in a .txt file should surface as InvalidUtf8, not
        // panic and not silently lossy-decode.
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "bad.txt", &[0xff, 0xfe, 0xfd]);
        let err = parse_file(&path).unwrap_err();
        assert!(matches!(err, ParseError::InvalidUtf8(_)), "got: {err:?}");
    }

    /// A minimal PDF 1.4 byte stream containing the literal text
    /// "Hello PDF". Generated inline (no binary fixture commit) so the
    /// test is self-contained and reproducible across platforms.
    ///
    /// Structure: 1 catalog, 1 pages, 1 page object, 1 font, 1 content
    /// stream. Nothing is hand-counted: the content stream's `/Length` and
    /// every xref offset are derived from the bytes actually written, so
    /// editing an object body cannot leave the fixture inconsistent.
    fn minimal_pdf() -> Vec<u8> {
        // Page content, deliberately without a trailing newline: the EOL
        // written just before `endstream` is a separator and is not counted
        // in `/Length`.
        let content = "BT\n/F1 24 Tf\n72 720 Td\n(Hello PDF) Tj\nET";
        let content_obj = format!(
            "5 0 obj\n<< /Length {} >>\nstream\n{content}\nendstream\nendobj\n",
            content.len()
        );

        // Build the body first; xref offsets are computed post-hoc.
        let objects: [&str; 5] = [
            "1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
            "2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
            "3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] \
             /Resources << /Font << /F1 4 0 R >> >> /Contents 5 0 R >>\nendobj\n",
            "4 0 obj\n<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>\nendobj\n",
            content_obj.as_str(),
        ];

        let mut buf = Vec::new();
        buf.extend_from_slice(b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"); // header + binary marker
        let mut offsets: Vec<usize> = Vec::with_capacity(objects.len());
        for obj in &objects {
            // Record where each object starts so the xref table can point at it.
            offsets.push(buf.len());
            buf.extend_from_slice(obj.as_bytes());
        }
        let xref_offset = buf.len();
        // The table has one extra entry: object 0, the head of the free list.
        buf.extend_from_slice(format!("xref\n0 {}\n", objects.len() + 1).as_bytes());
        buf.extend_from_slice(b"0000000000 65535 f \n");
        for off in &offsets {
            buf.extend_from_slice(format!("{:010} 00000 n \n", off).as_bytes());
        }
        buf.extend_from_slice(
            format!(
                "trailer\n<< /Size {} /Root 1 0 R >>\nstartxref\n{}\n%%EOF\n",
                objects.len() + 1,
                xref_offset
            )
            .as_bytes(),
        );
        buf
    }

    #[test]
    fn parse_pdf_extracts_known_text() {
        // Generate a tiny synthetic PDF at test time so we don't commit a
        // binary fixture. pdf-extract's text extraction is best-effort; if
        // it fails on this platform we log + continue (don't fail the
        // build). The build-blocking guarantee is that `parse_file` does
        // not panic and routes through `parse_pdf`.
        let tmp = TempDir::new().unwrap();
        let path = write_file(&tmp, "hello.pdf", &minimal_pdf());

        match parse_file(&path) {
            Ok(out) => {
                assert_eq!(out.mime_type, "application/pdf");
                // The fixture's content is the literal string "Hello PDF".
                // pdf-extract may return the text with or without surrounding
                // whitespace; be lenient.
                assert!(
                    out.text.to_lowercase().contains("hello"),
                    "extracted text missing 'hello': {:?}",
                    out.text
                );
            }
            Err(ParseError::Empty) => {
                // pdf-extract found no extractable text in our minimal PDF —
                // acceptable for this hand-crafted fixture. Log + pass.
                eprintln!("parse_pdf: extracted text was empty (acceptable for minimal fixture)");
            }
            Err(ParseError::Pdf(msg)) => {
                eprintln!(
                    "parse_pdf: pdf-extract rejected minimal fixture (acceptable): {msg}"
                );
            }
            Err(other) => panic!("parse_pdf: unexpected error variant: {other:?}"),
        }
    }
}