# PEP 517 build configuration: the package is built by maturin, restricted to
# the 1.x release series.
[build-system]
build-backend = "maturin"
requires = [
    "maturin>=1.0,<2.0",
]
# Core distribution metadata (PEP 621).
[project]
name = "pdf_oxide"
# NOTE(review): the version is declared statically here; presumably it has to be
# kept in sync with the Rust crate version by hand — confirm.
version = "0.3.9"
description = "The fastest Python PDF library: 0.8ms mean, 5× faster than PyMuPDF. Text extraction, markdown conversion, PDF creation. 100% pass rate on 3,830 PDFs."
readme = "README.md"
# Dual-licensed; the two "License ::" classifiers below mirror this expression.
license = { text = "MIT OR Apache-2.0" }
requires-python = ">=3.8"
authors = [
    { name = "PDF Oxide Contributors", email = "yfedoseev@gmail.com" },
]
keywords = [
    "pdf",
    "text-extraction",
    "pdf-parser",
    "pymupdf-alternative",
    "rag",
    "llm",
    "ocr",
    "markdown",
    "document-parser",
    "pdf-to-text",
    "pdf-to-markdown",
    "data-extraction",
]
# Trove classifiers. The per-version Python entries start at the
# `requires-python` floor (3.8) and run through 3.14.
classifiers = [
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    "License :: OSI Approved :: MIT License",
    "License :: OSI Approved :: Apache Software License",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Programming Language :: Rust",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing",
    "Topic :: Text Processing :: General",
    "Topic :: Text Processing :: Markup :: HTML",
    "Topic :: Text Processing :: Markup :: Markdown",
    "Topic :: Scientific/Engineering :: Information Analysis",
]
# Project links published with the package metadata. Labels containing spaces
# or parentheses must be quoted keys; single-word labels stay bare.
[project.urls]
# Homepage and Repository intentionally (it seems) point at the same GitHub URL.
Homepage = "https://github.com/yfedoseev/pdf_oxide"
Documentation = "https://pdf.oxide.fyi/docs/getting-started/python"
"API Reference (Rust)" = "https://docs.rs/pdf_oxide"
"API Reference (Python)" = "https://pdf.oxide.fyi/docs/reference/python-api"
"Performance Benchmarks" = "https://pdf.oxide.fyi/docs/performance"
Repository = "https://github.com/yfedoseev/pdf_oxide"
"Bug Tracker" = "https://github.com/yfedoseev/pdf_oxide/issues"
# Development-only requirement sets (PEP 735 dependency groups). These are
# not part of the published package metadata.
[dependency-groups]
# Everyday development: the test stack plus linting, type checking and the
# build tool. The test requirements are pulled in via `include-group` so the
# two groups cannot drift apart.
dev = [
    { include-group = "test" },
    "ruff>=0.1.0",
    "mypy>=1.0",
    # Same range as the build backend requirement in [build-system].
    "maturin>=1.0,<2.0",
]
test = [
    "pytest>=7.0.0",
    "pytest-cov>=4.0.0",
]
# Other PDF libraries, presumably installed for comparison runs.
benchmark = [
    "pikepdf>=9.2.1",
    "borb>=3.0.4",
    "pymupdf>=1.24.11",
    "pymupdf4llm>=0.0.17",
    "pdfplumber>=0.11.5",
    "pypdf>=5.9.0",
    # Use pdfminer.six rather than the legacy `pdfminer` distribution: both
    # install the same top-level `pdfminer` package, and pdfplumber already
    # depends on pdfminer.six, so listing the legacy one makes the two
    # distributions overwrite each other in the same environment.
    "pdfminer.six>=20191125",
    "pypdfium2>=5.3.0",
]
# NOTE(review): the group name looks like a typo for "sec-edgar"; it is kept
# as-is because renaming would break existing `--group seg-edgar` invocations.
seg-edgar = [
    "sec-edgar-downloader>=5.0.3",
]
convert-models = [
    "torch>=2.4.1",
    "onnxruntime>=1.20.1",
    "transformers>=4.46.3",
]
# The remaining groups are presumably one per helper script — confirm.
inspect-tj-array = [
    "pymupdf>=1.24.11",
]
check-pdf-structure = [
    "pypdf",
]
analyze-pdf-spacing = [
    "pymupdf>=1.24.11",
]
# maturin settings for building the Rust extension.
[tool.maturin]
# The compiled module is named `pdf_oxide.pdf_oxide`, i.e. it is placed inside
# the `pdf_oxide` Python package, whose sources live under `python/`.
module-name = "pdf_oxide.pdf_oxide"
python-source = "python"
# Cargo feature enabled for the Python build.
features = ["python"]
# pytest collection rules: look only under `tests/`, and collect `test_*.py`
# files, `Test*` classes and `test_*` functions.
[tool.pytest.ini_options]
python_classes = ["Test*"]
python_files = ["test_*.py"]
python_functions = ["test_*"]
testpaths = ["tests"]
# Ruff settings shared by the linter and the formatter.
[tool.ruff]
# Oldest Python syntax Ruff may assume; matches the package's `>=3.8` floor.
target-version = "py38"
line-length = 100
# Paths skipped in addition to Ruff's built-in exclusions.
extend-exclude = [
    ".git",
    "__pycache__",
    "build",
    "dist",
    ".eggs",
    "*.egg-info",
]
# Lint rule selection.
[tool.ruff.lint]
select = [
    "E",    # pycodestyle errors
    "W",    # pycodestyle warnings
    "F",    # Pyflakes
    "I",    # isort
    "N",    # pep8-naming
    "B",    # flake8-bugbear
    "C4",   # flake8-comprehensions
    "SIM",  # flake8-simplify
    "TCH",  # flake8-type-checking
]
ignore = [
    "E501",  # line too long
    "B008",  # function call in argument defaults
    # NOTE(review): "UP" (pyupgrade) is not in `select` above, so this entry
    # only matters if the rules are enabled some other way — confirm intent.
    "UP",
]
# Names made only of underscores, or starting with an underscore, are treated
# as intentionally unused.
dummy-variable-rgx = '^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$'

[tool.ruff.lint.per-file-ignores]
# Package __init__ files re-export names: allow unused and star imports.
"__init__.py" = ["F401", "F403"]
# Scripts may print. NOTE(review): T201 is not selected in this file, so this
# is currently a no-op unless the rule is enabled elsewhere — confirm.
"scripts/*.py" = ["T201"]

[tool.ruff.lint.isort]
known-first-party = ["pdf_oxide"]
force-single-line = false
# Two blank lines between the import block and the code that follows.
lines-after-imports = 2
# Formatter behaviour: double quotes, space indentation, magic trailing commas
# respected, and LF line endings forced for every file.
[tool.ruff.format]
line-ending = "lf"
indent-style = "space"
quote-style = "double"
skip-magic-trailing-comma = false
# ty type-checker configuration.
[tool.ty.environment]
# Python environment ty inspects; points at the project-local virtualenv.
python = "./.venv"

[tool.ty.src]
# Only the Python package sources are checked.
include = ["python/pdf_oxide"]

[tool.ty.rules]
# Severity overrides for individual diagnostics.
possibly-missing-attribute = "error"
possibly-missing-import = "error"
possibly-unresolved-reference = "warn"
# PDM composite scripts: each entry runs the listed commands in order.
[tool.pdm.scripts]
# Format Rust sources, sort Python imports, then format Python sources.
fmt.composite = [
    "cargo fmt",
    "ruff check . --select I --fix",
    "ruff format .",
]
# Rust checks (clippy warnings are errors), then Python lint and type check.
# NOTE(review): the ruff step runs with `--fix`, so this script can modify
# files; and `ty` is not listed in any dependency group in this file, so it is
# presumably installed separately — confirm both are intended.
lint.composite = [
    "cargo check",
    "cargo clippy -- -D warnings",
    "ruff check . --fix",
    "ty check .",
]