1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Example: extract text from the middle page of each PDF in a fixed sample list
// and report how much of the extracted text looks readable.
use oxidize_pdf::text::{ExtractionOptions, TextExtractor};
use oxidize_pdf::{PdfDocument, PdfReader};
use std::path::Path;
/// Maximum number of characters shown in the per-PDF text preview.
const PREVIEW_CHARS: usize = 200;

/// Share of readable characters above which extracted text is reported as readable.
const READABLE_THRESHOLD: f64 = 0.8;

/// Returns the first `max_chars` characters of `content` together with a flag
/// telling whether anything was cut off.
///
/// The cut is located with `char_indices`, so it always falls on a UTF-8
/// character boundary and the limit is counted in characters, not bytes.
fn preview_of(content: &str, max_chars: usize) -> (&str, bool) {
    match content.char_indices().nth(max_chars) {
        Some((cut, _)) => (&content[..cut], true),
        None => (content, false),
    }
}

/// Returns the fraction (0.0..=1.0) of characters in `content` that are
/// letters, digits, whitespace or ASCII punctuation; 0.0 for empty input.
fn readable_ratio(content: &str) -> f64 {
    let mut total = 0usize;
    let mut readable = 0usize;
    for c in content.chars() {
        total += 1;
        // Digits count as readable: a page full of numbers is not garbled text.
        if c.is_alphanumeric() || c.is_whitespace() || c.is_ascii_punctuation() {
            readable += 1;
        }
    }
    if total == 0 {
        0.0
    } else {
        readable as f64 / total as f64
    }
}

/// Extracts the text of the middle page of every PDF in a fixed sample list,
/// prints a preview and a readability estimate per file, then a summary.
///
/// Missing files, unreadable PDFs and failed extractions are reported on
/// stdout and skipped; this function itself always returns `Ok(())`.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    let test_pdfs = [
        "./test-pdfs/unicode_glyph_mapping_test.pdf",
        "./test-pdfs/Cold_Email_Hacks.pdf",
        "./oxidize-pdf-core/output.pdf",
        "./test-pdfs/unicode_showcase.pdf",
        "./oxidize-pdf-core/examples/results/empty_dashboard_test.pdf",
        "./test-pdfs/unicode_professional_demo.pdf",
        "./oxidize-pdf-core/examples/results/large_dashboard_test.pdf",
        "./test-pdfs/jpeg_extraction_test.pdf",
        "./test-pdfs/png_transparency.pdf",
        "./test-pdfs/page_tree_inheritance.pdf",
    ];

    println!("🎲 Testing text extraction on {} random PDFs", test_pdfs.len());
    println!("{}", "=".repeat(60));

    let options = ExtractionOptions::default();
    let mut extractor = TextExtractor::with_options(options);

    let mut successful_extractions = 0usize;
    let mut total_chars_extracted = 0usize;
    let mut readable_pdfs = 0usize;

    for (i, pdf_path) in test_pdfs.iter().enumerate() {
        println!("\n📄 {} | {}", i + 1, pdf_path);
        println!("{}", "-".repeat(50));

        if !Path::new(pdf_path).exists() {
            println!(" ❌ File does not exist - skipping");
            continue;
        }

        let document = match PdfReader::open(pdf_path) {
            Ok(reader) => PdfDocument::new(reader),
            Err(e) => {
                println!(" ❌ Could not open PDF: {}", e);
                continue;
            }
        };

        let page_count = match document.page_count() {
            Ok(count) => count,
            Err(e) => {
                println!(" ❌ Could not get page count: {}", e);
                continue;
            }
        };
        println!(" 📊 Total pages: {}", page_count);

        if page_count == 0 {
            println!(" ⚠️ Document has no pages - skipping");
            continue;
        }

        // NOTE(review): `extract_from_page` is treated here as taking a zero-based
        // page index (the oxidize-pdf `get_page` convention) - confirm against the
        // crate version in use. The human-readable page number is index + 1.
        let middle_index = page_count / 2;
        println!(" 🎯 Testing middle page: {}", middle_index + 1);

        let extracted = match extractor.extract_from_page(&document, middle_index) {
            Ok(extracted) => extracted,
            Err(e) => {
                println!(" ❌ Extraction failed: {}", e);
                continue;
            }
        };

        let content = extracted.text.trim();
        // Count Unicode scalar values, not bytes, so the "characters" figures
        // are correct for non-ASCII text.
        let char_count = content.chars().count();
        successful_extractions += 1;
        total_chars_extracted += char_count;

        if content.is_empty() {
            println!(" ⚠️ Empty or no extractable text");
            continue;
        }
        println!(" ✅ Extracted {} characters", char_count);

        let (preview, truncated) = preview_of(content, PREVIEW_CHARS);
        println!(
            " 📝 Preview: \"{}{}\"",
            preview,
            if truncated { "..." } else { "" }
        );

        let readability = readable_ratio(content);
        // Truncating cast is intentional: the value is within 0..=100.
        let percent = (readability * 100.0) as u32;
        if readability > READABLE_THRESHOLD {
            readable_pdfs += 1;
            println!(" ✅ Text is readable ({}% standard chars)", percent);
        } else {
            println!(" ⚠️ Text may be garbled ({}% standard chars)", percent);
        }
    }

    println!("\n{}", "=".repeat(60));
    println!("📈 SUMMARY RESULTS");
    println!("{}", "=".repeat(60));
    println!("📊 Total PDFs tested: {}", test_pdfs.len());
    println!("✅ Successful extractions: {}", successful_extractions);
    println!("📝 Total characters extracted: {}", total_chars_extracted);

    let readable_share = if successful_extractions > 0 {
        readable_pdfs as f64 / successful_extractions as f64 * 100.0
    } else {
        0.0
    };
    println!("🔤 Readable PDFs: {} ({:.1}%)", readable_pdfs, readable_share);

    // Guarded to avoid dividing by zero when nothing could be extracted.
    if successful_extractions > 0 {
        println!(
            "📏 Average chars per extraction: {}",
            total_chars_extracted / successful_extractions
        );
    }

    Ok(())
}