mailrs_attachment_extract/
lib.rs1#![doc = include_str!("../README.md")]
2#![deny(missing_docs)]
3#![deny(rustdoc::broken_intra_doc_links)]
4
5use std::io::Write;
6use std::process::Command;
7
8use serde::Serialize;
9
10#[derive(Debug, Clone, Serialize)]
14pub struct ExtractionResult {
15 pub text: String,
18 pub language: Option<String>,
21 pub confidence: f64,
24 pub page_count: Option<u32>,
26 pub metadata: serde_json::Value,
29}
30
31impl ExtractionResult {
32 pub fn empty() -> Self {
35 Self {
36 text: String::new(),
37 language: None,
38 confidence: 0.0,
39 page_count: None,
40 metadata: serde_json::json!({}),
41 }
42 }
43}
44
/// Strategy used to pull text out of an attachment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ExtractionMethod {
    /// Read the text embedded in a PDF document.
    PdfText,
    /// Run optical character recognition on a raster image.
    ImageOcr,
    /// The content type is not handled; no extraction is attempted.
    Unsupported,
}

/// Chooses the extraction strategy for a MIME content type.
///
/// Only the media type itself is considered: any parameters after the first `;`
/// (for example `; name="scan.pdf"` or `; charset=binary`) and surrounding
/// whitespace are ignored, and the comparison is ASCII case-insensitive.
///
/// Returns [`ExtractionMethod::PdfText`] for `application/pdf`,
/// [`ExtractionMethod::ImageOcr`] for PNG, JPEG, WebP, TIFF, BMP and GIF images,
/// and [`ExtractionMethod::Unsupported`] for everything else (including the
/// empty string and `image/svg+xml`).
pub fn extraction_method(content_type: &str) -> ExtractionMethod {
    // `split` always yields at least one item, so the fallback is never taken;
    // it only avoids an `unwrap`.
    let media_type = content_type
        .split(';')
        .next()
        .unwrap_or(content_type)
        .trim()
        .to_ascii_lowercase();

    match media_type.as_str() {
        "application/pdf" => ExtractionMethod::PdfText,
        "image/png" | "image/jpeg" | "image/webp" | "image/tiff" | "image/bmp" | "image/gif" => {
            ExtractionMethod::ImageOcr
        }
        _ => ExtractionMethod::Unsupported,
    }
}
73
74pub fn extract_pdf_text(data: &[u8]) -> Result<ExtractionResult, String> {
79 let text = pdf_extract::extract_text_from_mem(data).map_err(|e| format!("pdf parse: {e}"))?;
80 let trimmed = text.trim().to_string();
81 let page_count = text.matches('\u{000C}').count() as u32 + 1;
82 Ok(ExtractionResult {
83 text: trimmed,
84 language: None,
85 confidence: 1.0,
86 page_count: Some(page_count),
87 metadata: serde_json::json!({ "method": "pdf_text" }),
88 })
89}
90
/// Reports whether a working `tesseract` executable can be run.
///
/// Runs `tesseract --version` and returns `true` only when the process could be
/// started *and* exited successfully. A binary that is present but fails to run
/// (non-zero exit status) is reported as unavailable.
///
/// Every call spawns a process; nothing is cached.
pub fn tesseract_available() -> bool {
    Command::new("tesseract")
        .arg("--version")
        .output()
        .map(|probe| probe.status.success())
        .unwrap_or(false)
}
100
101pub fn ocr_image(data: &[u8], langs: &str) -> Result<ExtractionResult, String> {
109 let mut tmp = tempfile::Builder::new()
110 .suffix(".img")
111 .tempfile()
112 .map_err(|e| format!("tempfile: {e}"))?;
113 tmp.write_all(data)
114 .map_err(|e| format!("write temp: {e}"))?;
115 tmp.flush().map_err(|e| format!("flush temp: {e}"))?;
116
117 let output = Command::new("tesseract")
118 .arg(tmp.path())
119 .arg("stdout")
120 .arg("-l")
121 .arg(langs)
122 .arg("--psm")
123 .arg("3")
124 .output()
125 .map_err(|e| format!("tesseract exec: {e}"))?;
126
127 if !output.status.success() {
128 let stderr = String::from_utf8_lossy(&output.stderr);
129 return Err(format!("tesseract failed: {stderr}"));
130 }
131
132 let text = String::from_utf8_lossy(&output.stdout).trim().to_string();
133 let confidence = parse_tesseract_confidence(&output.stderr);
134
135 Ok(ExtractionResult {
136 text,
137 language: Some(langs.to_string()),
138 confidence,
139 page_count: None,
140 metadata: serde_json::json!({ "method": "ocr", "langs": langs }),
141 })
142}
143
/// Derives a coarse confidence score from Tesseract's standard error output.
///
/// Returns `0.0` when the (lossily decoded) output contains `Empty page`,
/// and a fixed `0.85` otherwise. No per-word confidence is parsed.
fn parse_tesseract_confidence(stderr: &[u8]) -> f64 {
    let diagnostics = String::from_utf8_lossy(stderr);
    let page_was_empty = diagnostics.contains("Empty page");
    if page_was_empty {
        0.0
    } else {
        0.85
    }
}
151
152pub fn extract_content(
160 data: &[u8],
161 content_type: &str,
162 ocr_langs: &str,
163) -> Result<ExtractionResult, String> {
164 match extraction_method(content_type) {
165 ExtractionMethod::PdfText => {
166 let result = extract_pdf_text(data)?;
167 if result.text.len() < 50 && tesseract_available() {
168 match ocr_image(data, ocr_langs) {
169 Ok(ocr_result) if !ocr_result.text.is_empty() => Ok(ocr_result),
170 _ => Ok(result),
171 }
172 } else {
173 Ok(result)
174 }
175 }
176 ExtractionMethod::ImageOcr => {
177 if !tesseract_available() {
178 return Err("tesseract not installed".to_string());
179 }
180 ocr_image(data, ocr_langs)
181 }
182 ExtractionMethod::Unsupported => Ok(ExtractionResult::empty()),
183 }
184}
185
/// Size limit for extraction input, in bytes: 50 MiB (52,428,800).
///
/// NOTE(review): nothing in this file checks the constant; presumably callers
/// compare an attachment's size against it before extracting. Confirm.
pub const MAX_EXTRACT_SIZE: usize = 50 << 20;
191
#[cfg(test)]
mod tests {
    use super::*;

    // ---- extraction_method: content-type dispatch ----

    #[test]
    fn method_pdf() {
        let picked = extraction_method("application/pdf");
        assert_eq!(picked, ExtractionMethod::PdfText);
    }

    #[test]
    fn method_pdf_case_insensitive() {
        let picked = extraction_method("Application/PDF");
        assert_eq!(picked, ExtractionMethod::PdfText);
    }

    #[test]
    fn method_png() {
        let picked = extraction_method("image/png");
        assert_eq!(picked, ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_jpeg() {
        let picked = extraction_method("image/jpeg");
        assert_eq!(picked, ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_webp() {
        let picked = extraction_method("image/webp");
        assert_eq!(picked, ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_tiff() {
        let picked = extraction_method("image/tiff");
        assert_eq!(picked, ExtractionMethod::ImageOcr);
    }

    #[test]
    fn method_svg_unsupported() {
        // SVG is an image type but is not in the list of OCR-able formats.
        let picked = extraction_method("image/svg+xml");
        assert_eq!(picked, ExtractionMethod::Unsupported);
    }

    #[test]
    fn method_word_unsupported() {
        let docx = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
        assert_eq!(extraction_method(docx), ExtractionMethod::Unsupported);
    }

    #[test]
    fn method_text_unsupported() {
        let picked = extraction_method("text/plain");
        assert_eq!(picked, ExtractionMethod::Unsupported);
    }

    #[test]
    fn method_empty_unsupported() {
        let picked = extraction_method("");
        assert_eq!(picked, ExtractionMethod::Unsupported);
    }

    // ---- extract_content / extract_pdf_text ----

    #[test]
    fn extract_unsupported_returns_empty() {
        let outcome = extract_content(b"hello", "text/plain", "eng").unwrap();
        assert!(outcome.text.is_empty());
        assert_eq!(outcome.confidence, 0.0);
    }

    #[test]
    fn extract_pdf_text_invalid_data() {
        assert!(extract_pdf_text(b"not a pdf").is_err());
    }

    #[test]
    fn extract_pdf_text_minimal() {
        // Smoke test: the result is deliberately ignored; this only checks that
        // feeding a tiny hand-built PDF does not panic.
        let pdf_bytes = create_minimal_pdf("Hello World");
        let _ = extract_pdf_text(&pdf_bytes);
    }

    // ---- OCR (behaviour depends on whether tesseract is installed) ----

    #[test]
    fn ocr_image_no_tesseract_graceful() {
        // Only meaningful on machines without tesseract; otherwise a no-op.
        if tesseract_available() {
            return;
        }
        let outcome = extract_content(b"\x89PNG", "image/png", "eng");
        assert!(outcome.is_err());
    }

    #[test]
    fn ocr_image_with_tesseract() {
        if !tesseract_available() {
            return;
        }
        // White 200x50 canvas with a black band in the middle columns.
        let img = image::RgbImage::from_fn(200, 50, |x, _y| {
            let inside_band = x > 50 && x < 150;
            if inside_band {
                image::Rgb([0u8, 0, 0])
            } else {
                image::Rgb([255u8, 255, 255])
            }
        });
        let mut buf = Vec::new();
        let mut cursor = std::io::Cursor::new(&mut buf);
        img.write_to(&mut cursor, image::ImageFormat::Png).unwrap();

        let outcome = ocr_image(&buf, "eng");
        assert!(outcome.is_ok());
    }

    // ---- ExtractionResult / confidence helper ----

    #[test]
    fn empty_result() {
        let blank = ExtractionResult::empty();
        assert!(blank.text.is_empty());
        assert!(blank.language.is_none());
        assert_eq!(blank.confidence, 0.0);
        assert!(blank.page_count.is_none());
    }

    #[test]
    fn confidence_empty_page() {
        let score = parse_tesseract_confidence(b"Empty page");
        assert_eq!(score, 0.0);
    }

    #[test]
    fn confidence_default() {
        let score = parse_tesseract_confidence(b"some output");
        assert_eq!(score, 0.85);
    }

    /// Builds the bytes of a small single-page PDF whose content stream draws `text`.
    ///
    /// The cross-reference offsets and the stream length are fixed approximations,
    /// not computed from the generated bytes.
    fn create_minimal_pdf(text: &str) -> Vec<u8> {
        let stream_len = text.len() + 45;
        let document = format!(
            "%PDF-1.0\n\
             1 0 obj<</Type/Catalog/Pages 2 0 R>>endobj\n\
             2 0 obj<</Type/Pages/Kids[3 0 R]/Count 1>>endobj\n\
             3 0 obj<</Type/Page/MediaBox[0 0 612 792]/Parent 2 0 R/Contents 4 0 R/Resources<</Font<</F1 5 0 R>>>>>>endobj\n\
             4 0 obj<</Length {}>>stream\nBT /F1 12 Tf 100 700 Td ({}) Tj ET\nendstream\nendobj\n\
             5 0 obj<</Type/Font/Subtype/Type1/BaseFont/Helvetica>>endobj\n\
             xref\n0 6\n\
             0000000000 65535 f \n\
             0000000009 00000 n \n\
             0000000058 00000 n \n\
             0000000115 00000 n \n\
             0000000266 00000 n \n\
             0000000400 00000 n \n\
             trailer<</Size 6/Root 1 0 R>>\nstartxref\n474\n%%EOF",
            stream_len, text
        );
        document.into_bytes()
    }
}