pdf_oxide 0.3.22

The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
Documentation
#!/usr/bin/env python3
"""Convert PDF specification to markdown."""

import pymupdf4llm


# Input PDF and the markdown file generated from it. Both paths are relative
# to the current working directory.
SPEC_PDF = "docs/spec/PDF32000_2008.pdf"
OUTPUT_MD = "docs/spec/pdf.md"

print("Converting PDF specification to markdown...")
print("This may take a few minutes for a 750+ page document...")

try:
    # Only the conversion and the write are guarded; success reporting lives
    # in the `else` branch so it runs only when both steps completed.
    md_text = pymupdf4llm.to_markdown(
        SPEC_PDF,
        page_chunks=False,  # Single document
    )

    with open(OUTPUT_MD, "w", encoding="utf-8") as f:
        f.write(md_text)
except Exception as e:
    # Top-level boundary of the script: report the failure with a full
    # traceback for debugging.
    print(f"❌ Error: {e}")
    import traceback

    traceback.print_exc()
    # Exit with a non-zero status so shells, Makefiles and CI jobs can detect
    # that the conversion failed (previously the script exited with 0).
    raise SystemExit(1)
else:
    print("✅ Conversion complete!")
    print(f"   Output: {OUTPUT_MD}")
    print(f"   Size: {len(md_text):,} characters")