use oxidize_pdf::parser::PdfReader;
use oxidize_pdf::text::{ExtractionOptions, TextExtractor};
use oxidize_pdf::{Document, Font, Page};
use tempfile::TempDir;
/// Round-trips a generated one-page document through save, reopen and
/// extraction, then checks that each written line appears in the page text.
#[test]
fn test_extract_text_from_generated_pdf() {
    // Three lines in `Font::Helvetica` at size 12.0, written top to bottom.
    let mut first_page = Page::a4();
    first_page
        .text()
        .set_font(Font::Helvetica, 12.0)
        .at(100.0, 700.0)
        .write("Hello, World!")
        .unwrap()
        .at(100.0, 680.0)
        .write("This is a test PDF.")
        .unwrap()
        .at(100.0, 660.0)
        .write("Testing text extraction.")
        .unwrap();

    let mut document = Document::new();
    document.add_page(first_page);

    // The file lives in a temporary directory for the duration of the test.
    let scratch = TempDir::new().unwrap();
    let output_path = scratch.path().join("test.pdf");
    document.save(&output_path).unwrap();

    // Reopen the saved file and extract page index 0.
    let reopened = PdfReader::open_document(&output_path).unwrap();
    let mut extractor = TextExtractor::new();
    let page_result = extractor.extract_from_page(&reopened, 0).unwrap();

    let text = &page_result.text;
    assert!(text.contains("Hello, World!"));
    assert!(text.contains("This is a test PDF."));
    assert!(text.contains("Testing text extraction."));
}
/// Extracts a page with `preserve_layout: true` and checks that at least
/// three text fragments are reported for the three strings written.
///
/// NOTE(review): only the fragment count is asserted; the fragment
/// coordinates themselves are not compared against the written positions.
#[test]
fn test_extract_with_layout_preservation() {
    let mut doc = Document::new();
    let mut page = Page::a4();

    // Two strings share y = 700.0 at different x positions; a third sits lower.
    page.text()
        .set_font(Font::Helvetica, 14.0)
        .at(50.0, 700.0)
        .write("Left text")
        .unwrap()
        .at(300.0, 700.0)
        .write("Right text")
        .unwrap()
        .at(50.0, 600.0)
        .write("Lower text")
        .unwrap();
    doc.add_page(page);

    let temp_dir = TempDir::new().unwrap();
    let pdf_path = temp_dir.path().join("layout_test.pdf");
    doc.save(&pdf_path).unwrap();

    let pdf_doc = PdfReader::open_document(&pdf_path).unwrap();
    let options = ExtractionOptions {
        preserve_layout: true,
        ..Default::default()
    };
    let mut extractor = TextExtractor::with_options(options);
    let extracted = extractor.extract_from_page(&pdf_doc, 0).unwrap();

    assert!(!extracted.fragments.is_empty());
    // Count the fragments directly instead of collecting their coordinates
    // into a throwaway Vec just to read its length.
    assert!(
        extracted.fragments.len() >= 3,
        "Expected at least 3 text fragments"
    );
}
/// Saves a three-page document, extracts the whole document and checks that
/// each returned page contains its own "This is page N" line, in order.
#[test]
fn test_extract_multiple_pages() {
    let mut document = Document::new();
    // Pages are labelled 1, 2 and 3.
    for page_number in 1..=3 {
        let mut page = Page::a4();
        page.text()
            .set_font(Font::Helvetica, 12.0)
            .at(100.0, 700.0)
            .write(&format!("This is page {}", page_number))
            .unwrap();
        document.add_page(page);
    }

    let scratch = TempDir::new().unwrap();
    let output_path = scratch.path().join("multipage_test.pdf");
    document.save(&output_path).unwrap();

    let reopened = PdfReader::open_document(&output_path).unwrap();
    let mut extractor = TextExtractor::new();
    let pages = extractor.extract_from_document(&reopened).unwrap();

    assert_eq!(pages.len(), 3);
    // Pair every extracted page with the 1-based label it was written with.
    for (page_number, page_result) in (1..).zip(pages.iter()) {
        let expected_line = format!("This is page {}", page_number);
        assert!(page_result.text.contains(&expected_line));
    }
}
/// Extracts a page that had no text written to it and checks that both the
/// extracted text and the fragment list are empty.
#[test]
fn test_extract_empty_page() {
    let mut document = Document::new();
    document.add_page(Page::a4());

    let scratch = TempDir::new().unwrap();
    let output_path = scratch.path().join("empty_test.pdf");
    document.save(&output_path).unwrap();

    let reopened = PdfReader::open_document(&output_path).unwrap();
    let mut extractor = TextExtractor::new();
    let page_result = extractor.extract_from_page(&reopened, 0).unwrap();

    assert!(page_result.text.is_empty());
    assert!(page_result.fragments.is_empty());
}
/// Writes a hand-assembled PDF 1.4 file to disk, opens it with `PdfReader`
/// and checks that the single page's trimmed text is exactly "Hello World".
#[test]
fn test_extract_from_manual_pdf() {
    // The object offsets in the xref table and the `startxref` value are byte
    // positions inside this literal, so its continuation lines must stay at
    // column 0 and its contents must not be reformatted.
    let raw_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>
endobj
4 0 obj
<< /Length 44 >>
stream
BT
/F1 12 Tf
100 700 Td
(Hello World) Tj
ET
endstream
endobj
5 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000241 00000 n
0000000334 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
404
%%EOF";

    let scratch = TempDir::new().unwrap();
    let input_path = scratch.path().join("manual_test.pdf");
    std::fs::write(&input_path, raw_pdf).unwrap();

    let reopened = PdfReader::open_document(&input_path).unwrap();
    let mut extractor = TextExtractor::new();
    let pages = extractor.extract_from_document(&reopened).unwrap();

    assert_eq!(pages.len(), 1);
    assert_eq!(pages[0].text.trim(), "Hello World");
}
/// Extracts a hand-assembled PDF whose content stream shows three strings
/// separated by `Td` moves, with `preserve_layout: true`, and checks that all
/// three strings and at least three fragments come back for the single page.
#[test]
fn test_extract_with_multiple_text_operations() {
    // The object offsets in the xref table and the `startxref` value are byte
    // positions inside this literal, so its continuation lines must stay at
    // column 0 and its contents must not be reformatted.
    let raw_pdf = b"%PDF-1.4
1 0 obj
<< /Type /Catalog /Pages 2 0 R >>
endobj
2 0 obj
<< /Type /Pages /Kids [3 0 R] /Count 1 >>
endobj
3 0 obj
<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R /Resources << /Font << /F1 5 0 R >> >> >>
endobj
4 0 obj
<< /Length 93 >>
stream
BT
/F1 12 Tf
100 700 Td
(First line) Tj
0 -20 Td
(Second line) Tj
200 0 Td
(Third line) Tj
ET
endstream
endobj
5 0 obj
<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>
endobj
xref
0 6
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
0000000241 00000 n
0000000384 00000 n
trailer
<< /Size 6 /Root 1 0 R >>
startxref
454
%%EOF";

    let scratch = TempDir::new().unwrap();
    let input_path = scratch.path().join("multi_text.pdf");
    std::fs::write(&input_path, raw_pdf).unwrap();

    let reopened = PdfReader::open_document(&input_path).unwrap();
    let mut extractor = TextExtractor::with_options(ExtractionOptions {
        preserve_layout: true,
        ..Default::default()
    });
    let pages = extractor.extract_from_document(&reopened).unwrap();

    assert_eq!(pages.len(), 1);
    let only_page = &pages[0];
    assert!(only_page.text.contains("First line"));
    assert!(only_page.text.contains("Second line"));
    assert!(only_page.text.contains("Third line"));
    assert!(only_page.fragments.len() >= 3);
}
/// Calls `extract_text_from_page_with_options` directly on the opened
/// document with custom `ExtractionOptions` and checks that the written line
/// is present and at least one fragment is returned.
#[test]
fn test_extract_text_from_page_with_options() {
    let mut first_page = Page::a4();
    first_page
        .text()
        .set_font(Font::Helvetica, 12.0)
        .at(100.0, 700.0)
        .write("Test content for extraction")
        .unwrap();

    let mut document = Document::new();
    document.add_page(first_page);

    let scratch = TempDir::new().unwrap();
    let output_path = scratch.path().join("options_test.pdf");
    document.save(&output_path).unwrap();

    let reopened = PdfReader::open_document(&output_path).unwrap();
    // Only `space_threshold` and `preserve_layout` are overridden; every
    // other option keeps its default value.
    let page_result = reopened
        .extract_text_from_page_with_options(
            0,
            ExtractionOptions {
                space_threshold: 0.4,
                preserve_layout: true,
                ..Default::default()
            },
        )
        .unwrap();

    assert!(page_result.text.contains("Test content for extraction"));
    assert!(!page_result.fragments.is_empty());
}