Skip to main content

mailrs_attachment_extract/
lib.rs

1#![doc = include_str!("../README.md")]
2#![deny(missing_docs)]
3#![deny(rustdoc::broken_intra_doc_links)]
4
5use std::io::Write;
6use std::process::Command;
7
8use serde::Serialize;
9
10/// Result of an extraction attempt — text content + provenance metadata
11/// (language hint, confidence, page count) suitable for indexing or
12/// embedding generation downstream.
13#[derive(Debug, Clone, Serialize)]
14pub struct ExtractionResult {
15    /// Extracted text content. Empty when the input was unsupported
16    /// or extraction produced nothing.
17    pub text: String,
18    /// BCP-47-ish language hint (`"eng"`, `"jpn+eng"`, ...) if known.
19    /// `None` for embedded PDF text (could be anything).
20    pub language: Option<String>,
21    /// 0.0–1.0 confidence. `1.0` for embedded PDF text (exact);
22    /// ~0.85 for successful OCR; `0.0` for failed extraction.
23    pub confidence: f64,
24    /// Page count when known (PDFs).
25    pub page_count: Option<u32>,
26    /// Free-form JSON metadata about the extraction method
27    /// (`{"method": "pdf_text"}` or `{"method": "ocr", "langs": "eng"}`).
28    pub metadata: serde_json::Value,
29}
30
31impl ExtractionResult {
32    /// Empty / failed-extraction sentinel with `text = ""` and
33    /// `confidence = 0.0`.
34    pub fn empty() -> Self {
35        Self {
36            text: String::new(),
37            language: None,
38            confidence: 0.0,
39            page_count: None,
40            metadata: serde_json::json!({}),
41        }
42    }
43}
44
/// Which extraction backend applies to a given `Content-Type`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractionMethod {
    /// `application/pdf` — try embedded text first, OCR fallback for scans.
    PdfText,
    /// `image/{png,jpeg,webp,tiff,bmp,gif}` — OCR via tesseract.
    ImageOcr,
    /// Anything else — caller should skip extraction.
    Unsupported,
}

/// Choose an [`ExtractionMethod`] from a `Content-Type` value.
///
/// Only the `type/subtype` part is considered: anything after the first
/// `;` (parameters such as `name="scan.pdf"` or `charset=...`) and any
/// surrounding whitespace is ignored, so a raw header value can be
/// passed as-is. Matching is ASCII case-insensitive. Unknown or empty
/// types fall through to [`ExtractionMethod::Unsupported`].
pub fn extraction_method(content_type: &str) -> ExtractionMethod {
    // Image subtypes that are handed to the OCR backend.
    const OCR_IMAGE_TYPES: [&str; 6] = [
        "image/png",
        "image/jpeg",
        "image/webp",
        "image/tiff",
        "image/bmp",
        "image/gif",
    ];

    // `split` always yields at least one item, so the fallback is never
    // hit; it just avoids an `unwrap`.
    let essence = content_type.split(';').next().unwrap_or("").trim();

    if essence.eq_ignore_ascii_case("application/pdf") {
        ExtractionMethod::PdfText
    } else if OCR_IMAGE_TYPES
        .iter()
        .any(|known| essence.eq_ignore_ascii_case(known))
    {
        ExtractionMethod::ImageOcr
    } else {
        ExtractionMethod::Unsupported
    }
}
73
74/// Extract embedded text from a PDF (pure Rust via `pdf-extract`).
75/// Confidence is `1.0` because embedded text is exact, not OCR'd.
76/// `page_count` is approximated by counting form-feed (`\u{000C}`)
77/// page-break markers; off-by-one is possible for malformed PDFs.
78pub fn extract_pdf_text(data: &[u8]) -> Result<ExtractionResult, String> {
79    let text = pdf_extract::extract_text_from_mem(data).map_err(|e| format!("pdf parse: {e}"))?;
80    let trimmed = text.trim().to_string();
81    let page_count = text.matches('\u{000C}').count() as u32 + 1;
82    Ok(ExtractionResult {
83        text: trimmed,
84        language: None,
85        confidence: 1.0,
86        page_count: Some(page_count),
87        metadata: serde_json::json!({ "method": "pdf_text" }),
88    })
89}
90
/// Check whether a working `tesseract` CLI binary is on `PATH`.
///
/// Spawns `tesseract --version` and returns `true` only when the
/// process could be started **and** exited successfully. Any spawn
/// error (binary missing, not executable, ...) or non-zero exit status
/// yields `false`. The child's output is captured and discarded.
///
/// There is no caching: every call spawns a process. If you'll call
/// this on a hot path, cache the result yourself.
pub fn tesseract_available() -> bool {
    Command::new("tesseract")
        .arg("--version")
        .output()
        .map(|out| out.status.success())
        .unwrap_or(false)
}
100
101/// OCR an image via the `tesseract` CLI subprocess.
102///
103/// `langs` is the tesseract `-l` value (e.g. `"eng"`, `"jpn+eng"`).
104/// Writes `data` to a temp file (`tesseract` can't read stdin for
105/// image data), runs `tesseract <tmp> stdout -l <langs> --psm 3`,
106/// captures stdout as the extracted text. Confidence is heuristic
107/// (0.85 default, 0.0 on "Empty page" stderr signal).
108pub fn ocr_image(data: &[u8], langs: &str) -> Result<ExtractionResult, String> {
109    let mut tmp = tempfile::Builder::new()
110        .suffix(".img")
111        .tempfile()
112        .map_err(|e| format!("tempfile: {e}"))?;
113    tmp.write_all(data)
114        .map_err(|e| format!("write temp: {e}"))?;
115    tmp.flush().map_err(|e| format!("flush temp: {e}"))?;
116
117    let output = Command::new("tesseract")
118        .arg(tmp.path())
119        .arg("stdout")
120        .arg("-l")
121        .arg(langs)
122        .arg("--psm")
123        .arg("3")
124        .output()
125        .map_err(|e| format!("tesseract exec: {e}"))?;
126
127    if !output.status.success() {
128        let stderr = String::from_utf8_lossy(&output.stderr);
129        return Err(format!("tesseract failed: {stderr}"));
130    }
131
132    let text = String::from_utf8_lossy(&output.stdout).trim().to_string();
133    let confidence = parse_tesseract_confidence(&output.stderr);
134
135    Ok(ExtractionResult {
136        text,
137        language: Some(langs.to_string()),
138        confidence,
139        page_count: None,
140        metadata: serde_json::json!({ "method": "ocr", "langs": langs }),
141    })
142}
143
/// Heuristic OCR confidence derived from tesseract's stderr bytes:
/// `0.0` when the output contains the (case-sensitive) marker
/// `Empty page`, `0.85` otherwise.
fn parse_tesseract_confidence(stderr: &[u8]) -> f64 {
    const EMPTY_PAGE_MARKER: &[u8] = b"Empty page";

    // The marker is pure ASCII, so scanning the raw bytes finds exactly
    // the same occurrences as searching a lossily decoded string would.
    let saw_empty_page = stderr
        .windows(EMPTY_PAGE_MARKER.len())
        .any(|window| window == EMPTY_PAGE_MARKER);

    if saw_empty_page {
        0.0
    } else {
        0.85
    }
}
151
152/// Auto-dispatch: pick the right extractor for `content_type` and run.
153///
154/// PDF path: try embedded text first; if the result is shorter than
155/// 50 chars (heuristic for "scanned PDF with no embedded text"),
156/// fall back to OCR on the raw bytes. Image path: OCR directly.
157/// Unsupported types return [`ExtractionResult::empty`] (not an
158/// error — caller should skip indexing but not log a failure).
159pub fn extract_content(
160    data: &[u8],
161    content_type: &str,
162    ocr_langs: &str,
163) -> Result<ExtractionResult, String> {
164    match extraction_method(content_type) {
165        ExtractionMethod::PdfText => {
166            let result = extract_pdf_text(data)?;
167            if result.text.len() < 50 && tesseract_available() {
168                match ocr_image(data, ocr_langs) {
169                    Ok(ocr_result) if !ocr_result.text.is_empty() => Ok(ocr_result),
170                    _ => Ok(result),
171                }
172            } else {
173                Ok(result)
174            }
175        }
176        ExtractionMethod::ImageOcr => {
177            if !tesseract_available() {
178                return Err("tesseract not installed".to_string());
179            }
180            ocr_image(data, ocr_langs)
181        }
182        ExtractionMethod::Unsupported => Ok(ExtractionResult::empty()),
183    }
184}
185
/// Suggested ceiling, in bytes, on the input passed to
/// [`extract_content`]: 50 MiB. This is advisory only — nothing in this
/// crate enforces it, because the appropriate limit depends on the
/// deployment (an archive-grade system may want 500 MiB, a mobile MTA
/// may want 5).
pub const MAX_EXTRACT_SIZE: usize = 50 << 20;
191
#[cfg(test)]
mod tests {
    use super::*;

    // --- Content-Type dispatch -------------------------------------------

    #[test]
    fn method_pdf() {
        assert_eq!(extraction_method("application/pdf"), ExtractionMethod::PdfText);
    }

    #[test]
    fn method_pdf_case_insensitive() {
        assert_eq!(extraction_method("Application/PDF"), ExtractionMethod::PdfText);
    }

    #[test]
    fn method_png() {
        assert_eq!(extraction_method("image/png"), ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_jpeg() {
        assert_eq!(extraction_method("image/jpeg"), ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_webp() {
        assert_eq!(extraction_method("image/webp"), ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_tiff() {
        assert_eq!(extraction_method("image/tiff"), ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_bmp() {
        assert_eq!(extraction_method("image/bmp"), ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_gif() {
        assert_eq!(extraction_method("image/gif"), ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_image_case_insensitive() {
        assert_eq!(extraction_method("IMAGE/PNG"), ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_svg_unsupported() {
        assert_eq!(extraction_method("image/svg+xml"), ExtractionMethod::Unsupported);
    }

    #[test]
    fn method_word_unsupported() {
        assert_eq!(
            extraction_method(
                "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
            ),
            ExtractionMethod::Unsupported
        );
    }

    #[test]
    fn method_text_unsupported() {
        assert_eq!(extraction_method("text/plain"), ExtractionMethod::Unsupported);
    }

    #[test]
    fn method_empty_unsupported() {
        assert_eq!(extraction_method(""), ExtractionMethod::Unsupported);
    }

    // --- Extraction entry points -----------------------------------------

    #[test]
    fn extract_unsupported_returns_empty() {
        let result = extract_content(b"hello", "text/plain", "eng").unwrap();
        assert!(result.text.is_empty());
        assert_eq!(result.confidence, 0.0);
        assert!(result.language.is_none());
        assert!(result.page_count.is_none());
    }

    #[test]
    fn extract_pdf_text_invalid_data() {
        let result = extract_pdf_text(b"not a pdf");
        assert!(result.is_err());
    }

    #[test]
    fn extract_pdf_text_minimal() {
        let pdf_bytes = create_minimal_pdf("Hello World");
        // The fixture is hand-written and only approximately well-formed,
        // so the parser may reject it. When it is accepted, the result
        // must carry the metadata the PDF path sets.
        if let Ok(result) = extract_pdf_text(&pdf_bytes) {
            assert_eq!(result.confidence, 1.0);
            assert!(result.language.is_none());
            assert!(matches!(result.page_count, Some(n) if n >= 1));
            assert_eq!(result.metadata["method"], "pdf_text");
        }
    }

    #[test]
    fn ocr_image_no_tesseract_graceful() {
        // Only meaningful on machines without tesseract: the image path
        // must report an error rather than panic.
        if !tesseract_available() {
            let result = extract_content(b"\x89PNG", "image/png", "eng");
            assert!(result.is_err());
        }
    }

    #[test]
    fn ocr_image_with_tesseract() {
        if !tesseract_available() {
            return;
        }
        // White canvas with a black vertical band: a valid PNG that
        // tesseract can open, with no expectation about recognised text.
        let img = image::RgbImage::from_fn(200, 50, |x, _y| {
            if x > 50 && x < 150 {
                image::Rgb([0u8, 0, 0])
            } else {
                image::Rgb([255u8, 255, 255])
            }
        });
        let mut buf = Vec::new();
        let mut cursor = std::io::Cursor::new(&mut buf);
        img.write_to(&mut cursor, image::ImageFormat::Png).unwrap();

        let result = ocr_image(&buf, "eng").expect("tesseract should process a valid PNG");
        assert_eq!(result.language.as_deref(), Some("eng"));
        assert!(result.page_count.is_none());
        assert_eq!(result.metadata["method"], "ocr");
        assert_eq!(result.metadata["langs"], "eng");
    }

    // --- Small helpers ---------------------------------------------------

    #[test]
    fn empty_result() {
        let r = ExtractionResult::empty();
        assert!(r.text.is_empty());
        assert!(r.language.is_none());
        assert_eq!(r.confidence, 0.0);
        assert!(r.page_count.is_none());
    }

    #[test]
    fn confidence_empty_page() {
        assert_eq!(parse_tesseract_confidence(b"Empty page"), 0.0);
    }

    #[test]
    fn confidence_empty_page_embedded() {
        assert_eq!(parse_tesseract_confidence(b"Warning: Empty page!!\n"), 0.0);
    }

    #[test]
    fn confidence_default() {
        assert_eq!(parse_tesseract_confidence(b"some output"), 0.85);
    }

    /// Build a tiny single-page PDF whose content stream draws `text`.
    ///
    /// NOTE(review): the stream `/Length` (`text.len() + 45`) and the
    /// xref offsets are fixed literals rather than values computed from
    /// the generated bytes, and do not appear to match them — so parsers
    /// may reject this fixture. Tests must not assume it parses.
    fn create_minimal_pdf(text: &str) -> Vec<u8> {
        format!(
            "%PDF-1.0\n\
            1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
            2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
            3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1 5 0 R>>>>>>endobj\n\
            4 0 obj<</Length {}>>stream\nBT /F1 12 Tf 100 700 Td ({}) Tj ET\nendstream\nendobj\n\
            5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj\n\
            xref\n0 6\n\
            0000000000 65535 f \n\
            0000000009 00000 n \n\
            0000000058 00000 n \n\
            0000000115 00000 n \n\
            0000000266 00000 n \n\
            0000000400 00000 n \n\
            trailer<</Size 6/Root 1 0 R>>\nstartxref\n474\n%%EOF",
            text.len() + 45,
            text
        )
        .into_bytes()
    }
}