# The fastest Rust PDF library with text extraction: 0.8ms mean, 100% pass rate on 3,830 PDFs. 5× faster than pdf_extract, 17× faster than oxidize_pdf. Extract, create, and edit PDFs.
# Search for a term across all pages of a PDF and print matches.
# Run: python main.py document.pdf "query"
import sys

from pdf_oxide import PdfDocument


def main() -> None:
    """Search every page of a PDF for a query string and print the matches.

    Reads the PDF path from ``sys.argv[1]`` and the query from
    ``sys.argv[2]``. If fewer than two arguments are supplied, prints a
    usage message to stderr and exits with status 1.

    For each page that has at least one result, prints the 1-based page
    number, the number of results, and the ``context`` of each result,
    followed by a blank line. Finishes with a summary of the total match
    count and the number of pages that had matches.
    """
    if len(sys.argv) < 3:
        print("Usage: python main.py <file.pdf> <query>", file=sys.stderr)
        sys.exit(1)

    path = sys.argv[1]
    query = sys.argv[2]

    doc = PdfDocument(path)
    # NOTE(review): page_count is read as an attribute here; confirm against
    # the pdf_oxide Python API that it is a property and not a method.
    pages = doc.page_count
    print(f'Searching for "{query}" in {path} ({pages} pages)...\n')

    total = 0
    pages_with_hits = 0

    for i in range(pages):
        results = doc.search(query, page_index=i)
        if not results:
            # Pages without matches are skipped silently.
            continue

        pages_with_hits += 1
        # page_index is 0-based; show 1-based page numbers to the user.
        print(f"Page {i+1}: {len(results)} match(es)")
        for r in results:
            print(f' - "...{r.context}..."')
            total += 1
        # Blank line separates one page's matches from the next.
        print()

    print(f"Found {total} total matches across {pages_with_hits} pages.")


if __name__ == "__main__":
    main()