1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
//! PDF text extraction implementation.
//!
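//! This module provides utilities for extracting text from PDF files using the
//! `pdf-extract` library. Metadata is currently limited to the detected page
//! count; title, author, and creation date are not populated.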
use crate::core::platform::container::document::{Document, DocumentError, DocumentMetadata, Page};
use std::path::Path;
/// Stateless extractor that turns PDF input into a `Document`.
///
/// Input may be supplied either as a filesystem path (`extract`) or as an
/// in-memory byte slice (`extract_bytes`). The type has no fields, so
/// constructing or cloning it carries no data.
#[doc(hidden)]
#[derive(Clone, Debug, Default)]
pub struct PdfExtractor;
impl PdfExtractor {
/// Creates a new PDF extractor.
pub fn new() -> Self {
Self
}
/// Reads a PDF from disk and extracts its text and metadata.
///
/// The whole file is loaded into memory and then handed to
/// `extract_bytes`.
///
/// # Arguments
///
/// * `path` - Location of the PDF file to read
///
/// # Returns
///
/// The `Document` produced by `extract_bytes` for the file's content.
///
/// # Errors
///
/// Returns `DocumentError::IoError` if the file cannot be read; the error keeps
/// the original `std::io::ErrorKind` and carries a descriptive message.
/// Any error produced by `extract_bytes` is returned unchanged.
pub fn extract(&self, path: &Path) -> Result<Document, DocumentError> {
    match std::fs::read(path) {
        Ok(contents) => self.extract_bytes(&contents),
        Err(io_err) => {
            // Re-wrap with the same kind so callers can still match on it,
            // while making the message say which operation failed.
            let described = std::io::Error::new(
                io_err.kind(),
                format!("Failed to read PDF file: {}", io_err),
            );
            Err(DocumentError::IoError(described))
        }
    }
}
/// Extracts text and metadata from an in-memory PDF.
///
/// # Arguments
///
/// * `bytes` - Raw content of the PDF file
///
/// # Returns
///
/// A `Document` built from the detected pages and their metadata.
///
/// # Errors
///
/// The message reported by `pdf-extract` is inspected case-insensitively:
///
/// * `DocumentError::EncryptedPdf` if it mentions "encrypted" or "password".
/// * `DocumentError::CorruptedFile` if it mentions "invalid" or "corrupt".
/// * `DocumentError::ExtractionFailed` for every other extraction failure.
///
/// `CorruptedFile` and `ExtractionFailed` carry the library's original message.
/// Errors from metadata extraction are propagated unchanged.
pub fn extract_bytes(&self, bytes: &[u8]) -> Result<Document, DocumentError> {
    let extracted_text = pdf_extract::extract_text_from_mem(bytes).map_err(|e| {
        let error_msg = e.to_string();
        // Classification is keyword-based. Compare against a lowercased copy so
        // capitalised messages (e.g. "Invalid ...") are still recognised, but
        // hand the untouched message back to the caller.
        let normalized = error_msg.to_lowercase();

        // Encryption is checked first so that a message such as
        // "invalid password" is reported as an encrypted PDF.
        if normalized.contains("encrypted") || normalized.contains("password") {
            DocumentError::EncryptedPdf
        } else if normalized.contains("invalid") || normalized.contains("corrupt") {
            DocumentError::CorruptedFile(error_msg)
        } else {
            DocumentError::ExtractionFailed(error_msg)
        }
    })?;

    // pdf-extract returns all text as a single string, so page boundaries
    // have to be reconstructed from the text itself.
    let pages = self.parse_text_into_pages(&extracted_text);

    // The page count recorded in the metadata is the number of pages detected
    // above, not a value read from the PDF structure.
    let metadata = self.extract_metadata_from_bytes(bytes, pages.len())?;

    Ok(Document::new(pages, metadata))
}
/// Splits extracted text into individual pages.
///
/// `pdf-extract` returns all text as one string, so page boundaries are
/// detected heuristically, in this order:
///
/// 1. If the text contains form feed characters (`\x0C`), they are used as
///    page breaks. Blank pages are dropped, and each remaining page keeps the
///    number of its position in the text, so numbers may skip.
/// 2. Otherwise, if the text contains three consecutive newlines (`"\n\n\n"`),
///    that sequence is used as the break. These chunks are only a guess at
///    the real pages, so blank chunks are dropped first and the remaining
///    ones are numbered consecutively from 1.
/// 3. Otherwise the whole text becomes a single page numbered 1.
///
/// Text that is empty or whitespace-only yields an empty vector. Every page's
/// content is passed through `preserve_structure`.
fn parse_text_into_pages(&self, text: &str) -> Vec<Page> {
    if text.trim().is_empty() {
        return Vec::new();
    }

    if text.contains('\x0C') {
        // Number before filtering: the index reflects the position of the
        // page in the text even when blank pages are skipped.
        return text
            .split('\x0C')
            .enumerate()
            .filter(|(_, chunk)| !chunk.trim().is_empty())
            .map(|(idx, chunk)| Page::new(idx + 1, self.preserve_structure(chunk)))
            .collect();
    }

    if text.contains("\n\n\n") {
        // Filter before numbering: a run of six or more newlines produces
        // empty chunks that are artifacts of the delimiter, not blank pages,
        // and must not leave gaps in the page numbers.
        return text
            .split("\n\n\n")
            .filter(|chunk| !chunk.trim().is_empty())
            .enumerate()
            .map(|(idx, chunk)| Page::new(idx + 1, self.preserve_structure(chunk)))
            .collect();
    }

    // No recognisable break: treat everything as one page.
    vec![Page::new(1, self.preserve_structure(text))]
}
/// Normalises whitespace while keeping paragraph structure.
///
/// Each line is trimmed. Consecutive non-empty lines are joined with a single
/// space into one paragraph; any run of blank lines between two paragraphs
/// becomes exactly one blank line (`"\n\n"`). Blank lines at the start or end
/// of the input are dropped, so the result never begins or ends with a
/// newline. Empty or whitespace-only input yields an empty string.
fn preserve_structure(&self, text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    // Set when at least one blank line has been seen since the last content
    // line. The break is only written once more content follows, which keeps
    // trailing blank lines from leaving a dangling newline in the output.
    let mut pending_break = false;

    for line in text.lines().map(str::trim) {
        if line.is_empty() {
            pending_break = true;
            continue;
        }

        if !result.is_empty() {
            // Paragraph break after blank lines, otherwise join within the paragraph.
            result.push_str(if pending_break { "\n\n" } else { " " });
        }
        result.push_str(line);
        pending_break = false;
    }

    result
}
/// Builds the metadata record for an extracted document.
///
/// Only `page_count` is filled in, taken directly from the argument; `title`,
/// `author`, and `creation_date` are always `None`. The raw bytes are accepted
/// but not inspected, and the function currently always returns `Ok`.
fn extract_metadata_from_bytes(
    &self,
    _bytes: &[u8],
    page_count: usize,
) -> Result<DocumentMetadata, DocumentError> {
    // pdf-extract offers no metadata extraction, so descriptive fields stay
    // unset. A fuller implementation would read them with a library such as
    // `lopdf`.
    let metadata = DocumentMetadata {
        page_count,
        title: None,
        author: None,
        creation_date: None,
    };
    Ok(metadata)
}
}
#[cfg(test)]
mod tests {
    //! Unit tests for `PdfExtractor`: construction, error mapping, page
    //! splitting, whitespace normalisation, and the metadata stub.
    use super::*;

    #[test]
    fn test_pdf_extractor_creation() {
        let extractor = PdfExtractor::new();
        assert!(matches!(extractor, PdfExtractor));
    }

    #[test]
    fn test_pdf_extractor_default() {
        let extractor = PdfExtractor;
        assert!(matches!(extractor, PdfExtractor));
    }

    #[test]
    fn test_extract_bytes_invalid_pdf() {
        let extractor = PdfExtractor::new();
        let invalid_bytes = b"This is not a valid PDF";
        let result = extractor.extract_bytes(invalid_bytes);
        assert!(result.is_err());
        // Which of the two variants is returned depends on the wording of the
        // library's error message, so both are accepted.
        match result {
            Err(DocumentError::ExtractionFailed(_)) => (),
            Err(DocumentError::CorruptedFile(_)) => (),
            _ => panic!("Expected ExtractionFailed or CorruptedFile error"),
        }
    }

    #[test]
    fn test_extract_nonexistent_file() {
        let extractor = PdfExtractor::new();
        let path = Path::new("/nonexistent/file.pdf");
        let result = extractor.extract(path);
        assert!(result.is_err());
        // Should return IoError for nonexistent file
        match result {
            Err(DocumentError::IoError(_)) => (),
            _ => panic!("Expected IoError for nonexistent file"),
        }
    }

    #[test]
    fn test_parse_text_into_pages_single_page() {
        let extractor = PdfExtractor::new();
        let text = "This is page one content.";
        let pages = extractor.parse_text_into_pages(text);
        assert_eq!(pages.len(), 1);
        assert_eq!(pages[0].number, 1);
        assert!(pages[0].content.contains("page one"));
    }

    #[test]
    fn test_parse_text_into_pages_empty_text() {
        let extractor = PdfExtractor::new();
        let text = "";
        let pages = extractor.parse_text_into_pages(text);
        // Empty text should return an empty vec
        assert_eq!(pages.len(), 0);
    }

    #[test]
    fn test_parse_text_into_pages_with_form_feed() {
        let extractor = PdfExtractor::new();
        let text = "Page one content.\x0CPage two content.\x0CPage three content.";
        let pages = extractor.parse_text_into_pages(text);
        assert_eq!(pages.len(), 3);
        assert_eq!(pages[0].number, 1);
        assert!(pages[0].content.contains("Page one"));
        assert_eq!(pages[1].number, 2);
        assert!(pages[1].content.contains("Page two"));
        assert_eq!(pages[2].number, 3);
        assert!(pages[2].content.contains("Page three"));
    }

    #[test]
    fn test_parse_text_into_pages_with_triple_newlines() {
        let extractor = PdfExtractor::new();
        let text = "Section one content.\n\n\nSection two content.\n\n\nSection three content.";
        let pages = extractor.parse_text_into_pages(text);
        // Should split on triple newlines into three consecutively numbered pages
        assert_eq!(pages.len(), 3);
        assert_eq!(pages[0].number, 1);
        assert!(pages[0].content.contains("Section one"));
        assert_eq!(pages[1].number, 2);
        assert!(pages[1].content.contains("Section two"));
        assert_eq!(pages[2].number, 3);
        assert!(pages[2].content.contains("Section three"));
    }

    #[test]
    fn test_preserve_structure_basic() {
        let extractor = PdfExtractor::new();
        let text = "Line one\nLine two\n\nNew paragraph\nContinuation";
        let result = extractor.preserve_structure(text);
        assert!(result.contains("Line one"));
        assert!(result.contains("paragraph"));
        assert!(result.contains("Continuation"));
        // The paragraph break must survive as a blank line
        assert!(result.contains("Line two\n\nNew paragraph"));
    }

    #[test]
    fn test_preserve_structure_excessive_whitespace() {
        let extractor = PdfExtractor::new();
        let text = "Line one \n\n\n\n Line two";
        let result = extractor.preserve_structure(text);
        // Should clean up excessive whitespace
        assert!(result.contains("Line one"));
        assert!(result.contains("Line two"));
        // Should not have multiple consecutive spaces. A single space is
        // legitimate (it separates words), so the check is for a double space.
        assert!(!result.contains("  "));
    }

    #[test]
    fn test_preserve_structure_empty() {
        let extractor = PdfExtractor::new();
        let text = "";
        let result = extractor.preserve_structure(text);
        assert_eq!(result, "");
    }

    #[test]
    fn test_preserve_structure_only_whitespace() {
        let extractor = PdfExtractor::new();
        let text = " \n\n \n ";
        let result = extractor.preserve_structure(text);
        // Should return empty string for whitespace-only input
        assert_eq!(result, "");
    }

    #[test]
    fn test_extract_metadata_basic() {
        let extractor = PdfExtractor::new();
        let metadata = extractor.extract_metadata_from_bytes(&[], 5).unwrap();
        assert_eq!(metadata.page_count, 5);
        assert!(metadata.title.is_none());
        assert!(metadata.author.is_none());
        assert!(metadata.creation_date.is_none());
    }

    #[test]
    fn test_extract_metadata_zero_pages() {
        let extractor = PdfExtractor::new();
        let metadata = extractor.extract_metadata_from_bytes(&[], 0).unwrap();
        assert_eq!(metadata.page_count, 0);
    }

    #[test]
    fn test_extract_metadata_large_document() {
        let extractor = PdfExtractor::new();
        let metadata = extractor.extract_metadata_from_bytes(&[], 1000).unwrap();
        assert_eq!(metadata.page_count, 1000);
    }
}