#!/usr/bin/env rust
#![allow(dead_code)]
#![allow(unused_variables)]
use pdf_oxide::{Error, PdfDocument};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::Path;
/// Aggregate counters collected while scanning a directory of PDF files.
///
/// Unless noted otherwise every counter is "number of files", not "number of
/// occurrences": each analysis pass bumps a counter at most once per file.
#[derive(Debug, Default)]
struct FeatureStats {
    // --- Parsing outcome -------------------------------------------------
    /// Number of PDF files selected for analysis.
    total_files: usize,
    /// Files that the PDF parser opened successfully.
    successful_parses: usize,
    /// Files that the PDF parser failed to open.
    failed_parses: usize,

    // --- Fonts and text --------------------------------------------------
    /// Files whose first page produced non-empty extracted text.
    fonts_found: usize,
    /// Files whose raw content mentions `/FontDescriptor`.
    font_descriptors: usize,
    /// Reserved counter; none of the analysis passes in this file update it.
    missing_font_descriptors: usize,
    /// Files whose raw content mentions `/Encoding`.
    encoding_found: usize,
    /// Files whose raw content mentions `/ToUnicode`.
    to_unicode_cmaps: usize,

    // --- Content features --------------------------------------------------
    /// Files whose first page yielded at least one extracted image.
    images_found: usize,
    /// Files whose raw content mentions `/AcroForm` or `/XFA`.
    forms_found: usize,
    /// Files whose raw content mentions `/Annot`.
    annotations_found: usize,
    /// Files whose raw content mentions `/XObject`.
    xobjects_found: usize,

    // --- Pages -------------------------------------------------------------
    /// Files for which a page count greater than zero was reported.
    pages_found: usize,
    /// Sum of the page counts of all files counted in `pages_found`.
    total_pages: usize,

    // --- Document structure ------------------------------------------------
    /// Files whose raw content mentions `/Outlines`.
    has_outline: usize,
    /// Files whose raw content mentions `/Metadata`.
    has_metadata: usize,

    // --- Stream filters ----------------------------------------------------
    /// Files that mention the `/FlateDecode` filter.
    flate_streams: usize,
    /// Files that mention the `/LZWDecode` filter.
    lzw_streams: usize,
    /// Files that mention the `/DCTDecode` filter.
    dct_streams: usize,
    /// Names (without the leading `/`) of any other stream filters seen in
    /// at least one file.
    other_filters: HashSet<String>,

    // --- Diagnostics -------------------------------------------------------
    /// Per-file list of problems, keyed by file name. Only files with at
    /// least one recorded problem have an entry.
    files_missing_features: HashMap<String, Vec<String>>,
}
/// Opens `path` with the PDF parser and records what the parser can extract.
///
/// Updates `stats` as follows:
/// * `successful_parses` / `failed_parses` depending on whether the document opens;
/// * `pages_found` and `total_pages` when a non-zero page count is reported;
/// * `fonts_found` when text extracted from the first page is non-empty;
/// * `images_found` when at least one image is extracted from the first page;
/// * `files_missing_features` gets an entry (keyed by file name) listing any
///   problems encountered after the document was opened.
///
/// # Errors
///
/// Returns the parser's error when the document cannot be opened. Problems
/// encountered after a successful open are recorded in `stats` instead and the
/// function returns `Ok(())`.
fn analyze_pdf(path: &Path, stats: &mut FeatureStats) -> Result<(), Error> {
    // `file_name()` is `None` for paths without a final component (e.g. `..`);
    // fall back to the whole path instead of panicking.
    let filename = path
        .file_name()
        .map(|name| name.to_string_lossy().into_owned())
        .unwrap_or_else(|| path.display().to_string());

    let mut doc = match PdfDocument::open(path) {
        Ok(doc) => {
            stats.successful_parses += 1;
            doc
        }
        Err(e) => {
            stats.failed_parses += 1;
            println!("❌ {} - Failed to parse: {}", filename, e);
            return Err(e);
        }
    };

    let mut missing_features = Vec::new();

    // A failed page count is recorded and then treated like an empty document.
    let page_count = match doc.page_count() {
        Ok(count) => count,
        Err(e) => {
            missing_features.push(format!("Failed to get page count: {}", e));
            0
        }
    };

    if page_count > 0 {
        stats.pages_found += 1;
        stats.total_pages += page_count;

        // Only the first page is sampled for text and images.
        match doc.extract_text(0) {
            Ok(text) if !text.is_empty() => stats.fonts_found += 1,
            Ok(_) => {}
            Err(_) => missing_features.push("Text extraction failed".to_string()),
        }

        // Image extraction is best-effort: a failure is deliberately not
        // reported as a problem for the file.
        if let Ok(images) = doc.extract_images(0) {
            if !images.is_empty() {
                stats.images_found += 1;
            }
        }
    } else {
        missing_features.push("No pages detected".to_string());
    }

    if !missing_features.is_empty() {
        stats
            .files_missing_features
            .insert(filename, missing_features);
    }
    Ok(())
}
/// Returns `true` when `content` contains the PDF name `/name` as a complete
/// token, i.e. followed by PDF whitespace, a PDF delimiter, or end of input.
///
/// This keeps `Crypt` from matching `/CryptFilter` and ignores occurrences of
/// the bare word that are not PDF names at all.
fn contains_pdf_name(content: &str, name: &str) -> bool {
    let needle = format!("/{}", name);
    content.match_indices(needle.as_str()).any(|(start, matched)| {
        content[start + matched.len()..]
            .chars()
            .next()
            .map_or(true, |next| {
                next == '\0' || next.is_ascii_whitespace() || "()<>[]{}/%".contains(next)
            })
    })
}

/// Scans the raw bytes of the file at `path` for well-known PDF keys and
/// filter names and bumps the matching per-file counters in `stats`.
///
/// This is a textual heuristic over the lossily decoded file content: it does
/// not parse the document, so keys stored inside compressed streams are not
/// seen. Each counter is incremented at most once per file. If the file cannot
/// be read the function returns without touching `stats`.
fn analyze_raw_pdf_structure(path: &Path, stats: &mut FeatureStats) {
    // Filters that are only reported by name rather than counted individually.
    const OTHER_FILTERS: [&str; 7] = [
        "ASCII85Decode",
        "ASCIIHexDecode",
        "RunLengthDecode",
        "CCITTFaxDecode",
        "JBIG2Decode",
        "JPXDecode",
        "Crypt",
    ];

    let bytes = match fs::read(path) {
        Ok(bytes) => bytes,
        // Best-effort scan: an unreadable file simply contributes nothing.
        Err(_) => return,
    };
    let content = String::from_utf8_lossy(&bytes);

    // Structural keys use plain substring matching on purpose, so that e.g.
    // `/Annot` also matches a page's `/Annots` array.
    if content.contains("/FontDescriptor") {
        stats.font_descriptors += 1;
    }
    if content.contains("/Encoding") {
        stats.encoding_found += 1;
    }
    if content.contains("/ToUnicode") {
        stats.to_unicode_cmaps += 1;
    }
    if content.contains("/Annot") {
        stats.annotations_found += 1;
    }
    if content.contains("/AcroForm") || content.contains("/XFA") {
        stats.forms_found += 1;
    }
    if content.contains("/XObject") {
        stats.xobjects_found += 1;
    }
    if content.contains("/Outlines") {
        stats.has_outline += 1;
    }
    if content.contains("/Metadata") {
        stats.has_metadata += 1;
    }

    // Filter names are matched as complete PDF names (leading `/`, followed by
    // a delimiter) so that all filters are detected consistently.
    if contains_pdf_name(&content, "FlateDecode") {
        stats.flate_streams += 1;
    }
    if contains_pdf_name(&content, "LZWDecode") {
        stats.lzw_streams += 1;
    }
    if contains_pdf_name(&content, "DCTDecode") {
        stats.dct_streams += 1;
    }
    for name in OTHER_FILTERS.iter().copied() {
        if contains_pdf_name(&content, name) {
            stats.other_filters.insert(name.to_string());
        }
    }
}
/// Returns `count` as a percentage of `total`, or `0.0` when `total` is zero
/// (instead of the `NaN` a plain floating-point division would produce).
fn percent(count: usize, total: usize) -> f64 {
    if total == 0 {
        0.0
    } else {
        (count as f64 / total as f64) * 100.0
    }
}

/// Prints one report line of the form ` <label>: <count>/<total> (<pct>%)`.
fn print_ratio(label: &str, count: usize, total: usize) {
    println!(
        " {}: {}/{} ({:.1}%)",
        label,
        count,
        total,
        percent(count, total)
    );
}

/// Entry point: analyzes every `*.pdf` file under the `forms`, `mixed` and
/// `technical` sub-directories of `test_datasets/pdfs` and prints a feature
/// coverage report.
///
/// Exits with status 1 when the dataset directory does not exist. When the
/// directory exists but contains no PDF files, a note is printed and the
/// report is skipped, since every ratio would be meaningless.
fn main() {
    println!("=== PDF Feature Coverage Analysis ===\n");
    let test_dir = Path::new("test_datasets/pdfs");
    if !test_dir.exists() {
        eprintln!("Error: test_datasets/pdfs directory not found");
        std::process::exit(1);
    }

    let mut stats = FeatureStats::default();
    let mut pdf_files = Vec::new();
    for category in &["forms", "mixed", "technical"] {
        // Missing or unreadable category directories are skipped silently.
        if let Ok(entries) = fs::read_dir(test_dir.join(category)) {
            pdf_files.extend(
                entries
                    .flatten()
                    .map(|entry| entry.path())
                    .filter(|path| path.extension().map(|e| e == "pdf").unwrap_or(false)),
            );
        }
    }
    // `read_dir` order is platform-dependent; sort so runs are reproducible.
    pdf_files.sort();

    stats.total_files = pdf_files.len();
    println!("Found {} PDF files\n", stats.total_files);
    if pdf_files.is_empty() {
        println!("No PDF files found under test_datasets/pdfs; nothing to analyze.");
        return;
    }

    for (i, path) in pdf_files.iter().enumerate() {
        if (i + 1) % 10 == 0 {
            println!("Progress: {}/{}", i + 1, stats.total_files);
        }
        // Parse failures are already counted and printed by `analyze_pdf`.
        let _ = analyze_pdf(path, &mut stats);
        analyze_raw_pdf_structure(path, &mut stats);
    }

    let total = stats.total_files;

    println!("\n=== Results ===\n");
    println!("## Parsing Success");
    print_ratio("Successful", stats.successful_parses, total);
    print_ratio("Failed", stats.failed_parses, total);

    println!("\n## Page Detection");
    print_ratio("Files with pages", stats.pages_found, stats.successful_parses);
    println!(" Total pages: {}", stats.total_pages);
    println!(
        " Avg pages/file: {:.1}",
        stats.total_pages as f64 / stats.pages_found.max(1) as f64
    );

    println!("\n## Font Features (in raw PDF)");
    for &(label, count) in &[
        ("FontDescriptor", stats.font_descriptors),
        ("Encoding", stats.encoding_found),
        ("ToUnicode CMap", stats.to_unicode_cmaps),
    ] {
        print_ratio(label, count, total);
    }

    println!("\n## Content Features");
    // Text extraction is only attempted on files that reported pages.
    print_ratio("Text extraction", stats.fonts_found, stats.pages_found);
    for &(label, count) in &[
        ("Images", stats.images_found),
        ("Forms/AcroForm", stats.forms_found),
        ("Annotations", stats.annotations_found),
        ("XObjects", stats.xobjects_found),
    ] {
        print_ratio(label, count, total);
    }

    println!("\n## Structure Features");
    for &(label, count) in &[
        ("Outlines/Bookmarks", stats.has_outline),
        ("Metadata", stats.has_metadata),
    ] {
        print_ratio(label, count, total);
    }

    println!("\n## Compression Filters");
    for &(label, count) in &[
        ("FlateDecode", stats.flate_streams),
        ("LZWDecode", stats.lzw_streams),
        ("DCTDecode (JPEG)", stats.dct_streams),
    ] {
        print_ratio(label, count, total);
    }
    // A sorted view keeps the printed set stable between runs.
    let other_filters: std::collections::BTreeSet<&String> = stats.other_filters.iter().collect();
    if !other_filters.is_empty() {
        println!(" Other filters found: {:?}", other_filters);
    }

    if !stats.files_missing_features.is_empty() {
        println!(
            "\n## Files with Issues ({} files)",
            stats.files_missing_features.len()
        );
        // Sort by file name so the ten entries shown are deterministic.
        let mut problem_files: Vec<_> = stats.files_missing_features.iter().collect();
        problem_files.sort_unstable_by(|a, b| a.0.cmp(b.0));
        for (file, issues) in problem_files.iter().take(10) {
            println!(" {}", file);
            for issue in issues.iter() {
                println!(" - {}", issue);
            }
        }
        if problem_files.len() > 10 {
            println!(" ... and {} more", problem_files.len() - 10);
        }
    }

    println!("\n=== Feature Coverage Assessment ===\n");
    let coverage_pct = percent(stats.successful_parses, total);
    if coverage_pct >= 95.0 {
        println!("✅ Excellent coverage ({:.1}%)", coverage_pct);
    } else if coverage_pct >= 80.0 {
        println!("⚠️ Good coverage ({:.1}%), some improvements possible", coverage_pct);
    } else {
        println!("❌ Coverage needs improvement ({:.1}%)", coverage_pct);
    }

    println!("\n## Features We Support:");
    println!(" ✅ Basic PDF parsing (objects, streams, xref)");
    println!(" ✅ Text extraction");
    println!(" ✅ FlateDecode, LZWDecode, DCTDecode");
    println!(" ✅ Font encoding and ToUnicode CMaps");
    println!(" ✅ Image extraction");
    println!(" ✅ XObject forms");

    println!("\n## Features That May Need Attention:");
    if stats.font_descriptors > stats.to_unicode_cmaps {
        // Difference of two per-file counts: a lower bound on the number of
        // files that have a FontDescriptor but no ToUnicode CMap.
        let diff = stats.font_descriptors - stats.to_unicode_cmaps;
        println!(" ⚠️ {} files have FontDescriptor but no ToUnicode", diff);
        println!(" (may need fallback encoding handling)");
    }
    if stats.annotations_found > 0 {
        println!(" ℹ️ {} files have annotations", stats.annotations_found);
        println!(" (currently not extracted in text output)");
    }
    if stats.has_outline > 0 {
        println!(" ℹ️ {} files have outlines/bookmarks", stats.has_outline);
        println!(" (currently not exposed in API)");
    }
    if !other_filters.is_empty() {
        println!(" ℹ️ Other compression filters found: {:?}", other_filters);
        println!(" (may or may not be supported)");
    }

    println!("\n=== Recommendation ===");
    println!();
    if coverage_pct >= 95.0 {
        println!("The library has excellent coverage of the test dataset.");
        println!("Current feature set is production-ready for v0.1.0.");
        println!();
        println!("Future enhancements (v1.x):");
        println!(" - Annotation extraction (if users request it)");
        println!(" - Outline/bookmark API (if users request it)");
        println!(" - Additional compression filters (as needed)");
    } else {
        println!(
            "Consider investigating the {} failed parses before release.",
            stats.failed_parses
        );
        println!("These may represent edge cases or unsupported PDF features.");
    }
}