// spectre_parse 1.0.0
//
// Lazy PDF parser — xref-only at open(), objects materialize on demand. Read-only. Powers the spectre_pdf extraction crate.
// Documentation
//! For every PDF that fails to open, group the failures by error message and
//! print each group with its count and a few example file names.
//! Used to prioritize what to fix next.

use spectre_parse::Document;
use std::collections::HashMap;
use std::fs;
use std::path::{Path, PathBuf};

/// Summarizes why PDFs fail to open.
///
/// The first command-line argument is a path. If it is a directory, every
/// direct entry whose extension is `pdf` (ASCII case-insensitive) is checked
/// in sorted path order; otherwise the path itself is treated as the single
/// file to check.
///
/// Each file is read into memory and passed to `Document::open`. Failures are
/// grouped by the (truncated) `Display` text of the error, and the groups are
/// printed to stdout, largest first, each with up to three example file names.
///
/// Problems with the tool's own inputs (missing argument, unreadable
/// directory, unreadable file) are reported on stderr so that they are not
/// mistaken for "no failures".
fn main() {
    // `args_os` avoids the panic `args` raises on non-UTF-8 arguments.
    let arg = match std::env::args_os().nth(1) {
        Some(a) => a,
        None => {
            eprintln!("usage: diagnose_errors <dir>");
            std::process::exit(2);
        }
    };
    let dir = Path::new(&arg);

    let pdfs: Vec<PathBuf> = if dir.is_dir() {
        let entries = match fs::read_dir(dir) {
            Ok(it) => it,
            Err(err) => {
                eprintln!("cannot read directory {}: {err}", dir.display());
                std::process::exit(1);
            }
        };
        let mut found: Vec<PathBuf> = entries
            .filter_map(Result::ok)
            .map(|entry| entry.path())
            .filter(|path| has_pdf_extension(path))
            .collect();
        found.sort();
        found
    } else {
        vec![dir.to_path_buf()]
    };

    let mut buckets: HashMap<String, Vec<String>> = HashMap::new();
    for p in &pdfs {
        let bytes = match fs::read(p) {
            Ok(b) => b,
            Err(err) => {
                // Keep going, but make the skip visible.
                eprintln!("skipping {}: {err}", p.display());
                continue;
            }
        };
        if let Err(e) = Document::open(&bytes) {
            // Fall back to the whole path when there is no final component
            // (e.g. the path ends in `..`).
            let name = p
                .file_name()
                .map_or_else(|| p.to_string_lossy(), |n| n.to_string_lossy())
                .into_owned();
            buckets
                .entry(bucket_key(e.to_string()))
                .or_default()
                .push(name);
        }
    }

    let mut summary: Vec<_> = buckets.into_iter().collect();
    // Largest bucket first; ties broken by message so the output is stable
    // across runs despite `HashMap`'s randomized iteration order.
    summary.sort_unstable_by(|a, b| b.1.len().cmp(&a.1.len()).then_with(|| a.0.cmp(&b.0)));
    for (msg, files) in summary {
        println!("[{}]  {}", files.len(), msg);
        for f in files.iter().take(3) {
            println!("    {f}");
        }
        if files.len() > 3 {
            println!("    ...");
        }
    }
}

/// Maximum length, in bytes, of the error-message prefix used as a bucket key.
const BUCKET_KEY_MAX_BYTES: usize = 70;

/// Returns `true` when `path` has a `pdf` extension, ignoring ASCII case.
fn has_pdf_extension(path: &Path) -> bool {
    path.extension()
        .map_or(false, |ext| ext.eq_ignore_ascii_case("pdf"))
}

/// Shortens `msg` to at most [`BUCKET_KEY_MAX_BYTES`] bytes.
///
/// The cut is moved back to the nearest character boundary, because slicing
/// a `str` in the middle of a multi-byte UTF-8 sequence panics.
fn bucket_key(mut msg: String) -> String {
    if msg.len() > BUCKET_KEY_MAX_BYTES {
        let mut end = BUCKET_KEY_MAX_BYTES;
        // Index 0 is always a boundary, so this loop terminates.
        while !msg.is_char_boundary(end) {
            end -= 1;
        }
        msg.truncate(end);
    }
    msg
}