# pdf_oxide 0.3.7
#
# The Complete PDF Toolkit: extract, create, and edit PDFs. Rust core with bindings for Python, Node, WASM, Go, and more.
# Documentation
# PEP 517 build configuration: maturin is both the sole build requirement
# and the backend that is invoked to build the package.
[build-system]
build-backend = "maturin"
requires = [
    "maturin>=1.0,<2.0",
]

# Core package metadata (PEP 621).
[project]
name = "pdf_oxide"
version = "0.3.7"
description = "Fast Python PDF library for text extraction, markdown conversion, and document processing. Rust-powered, 2.1ms mean latency."
readme = "README.md"
# Lowest supported interpreter; the per-version classifiers below start here.
requires-python = ">=3.8"
# Dual license; the two "License ::" classifiers below mirror this expression.
license = { text = "MIT OR Apache-2.0" }
authors = [{ name = "PDF Oxide Contributors", email = "yfedoseev@gmail.com" }]
keywords = [
    "pdf",
    "text-extraction",
    "pdf-parser",
    "pdf-library",
    "rag",
    "llm",
    "markdown",
    "document-parser",
    "pdf-to-text",
    "pdf-extraction",
    "fast-pdf",
    "data-extraction",
]
classifiers = [
    # Maturity and audience
    "Development Status :: 4 - Beta",
    "Intended Audience :: Developers",
    "Intended Audience :: Science/Research",
    # Licensing
    "License :: OSI Approved :: MIT License",
    "License :: OSI Approved :: Apache Software License",
    # Platforms and languages
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Programming Language :: Python :: 3.13",
    "Programming Language :: Python :: 3.14",
    "Programming Language :: Rust",
    # Topics
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Text Processing",
    "Topic :: Text Processing :: General",
    "Topic :: Text Processing :: Markup :: HTML",
    "Topic :: Text Processing :: Markup :: Markdown",
    "Topic :: Scientific/Engineering :: Information Analysis",
]

# Project links published with the package metadata.
# Homepage and Repository both point at the GitHub repository; Documentation
# links to the Python getting-started guide inside that repository, while the
# Rust API reference is hosted on docs.rs.
[project.urls]
Homepage = "https://github.com/yfedoseev/pdf_oxide"
Documentation = "https://github.com/yfedoseev/pdf_oxide/blob/main/docs/getting-started-python.md"
"API Reference (Rust)" = "https://docs.rs/pdf_oxide"
Repository = "https://github.com/yfedoseev/pdf_oxide"
"Bug Tracker" = "https://github.com/yfedoseev/pdf_oxide/issues"

# PEP 735 dependency groups: development-only requirement sets, selected by
# group name and not published as part of the package's install requirements.
[dependency-groups]
dev = [
    # Testing: pull in the `test` group so the pytest pins are declared once
    # instead of being repeated here with a slightly different spelling.
    { include-group = "test" },
    # Linter and formatter (fast, all-in-one tool written in Rust)
    "ruff>=0.1.0",
    # Type checking (optional)
    "mypy>=1.0",
    # Build tool for Python bindings; same range as [build-system].requires
    "maturin>=1.0,<2.0",
]
test = [
    "pytest>=7.0.0",
    "pytest-cov>=4.0.0",
]
# Other PDF libraries installed for comparison runs.
benchmark = [
    "pikepdf>=9.2.1",
    "borb>=3.0.4",
    "pymupdf>=1.24.11",
    "pymupdf4llm>=0.0.17",
    "pdfplumber>=0.11.5",
    "pypdf>=5.9.0",
    # NOTE(review): "pdfminer" is a different PyPI distribution from
    # "pdfminer.six" (the one pdfplumber depends on); confirm which of the
    # two the benchmarks are meant to install.
    "pdfminer>=20191125",
    "pypdfium2>=5.3.0",
]
# NOTE(review): the group name looks like a typo for "sec-edgar" (compare the
# package it installs). Left unchanged because groups are selected by name.
seg-edgar = [
    "sec-edgar-downloader>=5.0.3",
]
convert-models = [
    "torch>=2.4.1",
    "onnxruntime>=1.20.1",
    "transformers>=4.46.3",
]
inspect-tj-array = [
    "pymupdf>=1.24.11",
]
check-pdf-structure = [
    "pypdf",
]
analyze-pdf-spacing = [
    "pymupdf>=1.24.11",
]


# maturin build settings for the Python bindings.
[tool.maturin]
# Cargo feature(s) enabled when building the extension.
features = [
    "python",
]
# Dotted import path of the compiled module: a `pdf_oxide` submodule inside
# the `pdf_oxide` package.
module-name = "pdf_oxide.pdf_oxide"
# Directory that holds the Python-side sources of the package.
python-source = "python"

# pytest collection rules: look only under tests/, and treat test_*.py files,
# Test* classes and test_* functions as tests.
[tool.pytest.ini_options]
testpaths = [
    "tests",
]
python_files = [
    "test_*.py",
]
python_classes = [
    "Test*",
]
python_functions = [
    "test_*",
]

# Ruff: a fast Python linter and formatter implemented in Rust.
# Used here in place of Black, isort, Flake8, pyupgrade, and similar tools.
# Reference: https://docs.astral.sh/ruff/
[tool.ruff]
# Oldest Python syntax ruff should assume; matches requires-python (>=3.8).
target-version = "py38"
line-length = 100
# Paths skipped in addition to ruff's built-in exclusions.
extend-exclude = [
    ".git",
    "__pycache__",
    "build",
    "dist",
    ".eggs",
    "*.egg-info",
]

[tool.ruff.lint]
# Rule families that are switched on.
select = [
    "E",   # pycodestyle errors
    "W",   # pycodestyle warnings
    "F",   # pyflakes
    "I",   # isort (import sorting)
    "N",   # pep8-naming
    "B",   # flake8-bugbear (find likely bugs)
    "C4",  # flake8-comprehensions
    "SIM", # flake8-simplify
    "TCH", # flake8-type-checking
]
# Rules that stay off even when their family is selected.
ignore = [
    # Line too long (handled by formatter)
    "E501",
    # Do not perform function calls in argument defaults
    "B008",
    # pyupgrade (modernize Python code), kept off for py38 compatibility.
    # "UP" is not in `select` above, so this entry only matters if the
    # family is selected elsewhere (e.g. on the command line).
    "UP",
]
# Underscore-prefixed names are treated as intentionally unused.
dummy-variable-rgx = '^(_+|(_+[a-zA-Z0-9_]*[a-zA-Z0-9]+?))$'

# Per-path rule exemptions, keyed by file pattern.
[tool.ruff.lint.per-file-ignores]
# __init__.py files: do not report import violations F401 and F403.
"__init__.py" = [
    "F401",
    "F403",
]
# Scripts may use print statements (T201). Note that no "T20" family is listed
# in [tool.ruff.lint].select, so this only applies if that rule gets enabled.
"scripts/*.py" = [
    "T201",
]

# Import-sorting behaviour for the "I" rules.
[tool.ruff.lint.isort]
# Several names may share one `from ... import` line.
force-single-line = false
# Imports of `pdf_oxide` are grouped as first-party.
known-first-party = [
    "pdf_oxide",
]
# Two blank lines separate the import block from the code that follows.
lines-after-imports = 2

# Formatter output: space indentation, double-quoted strings, LF line endings,
# and existing trailing commas are respected rather than skipped.
[tool.ruff.format]
indent-style = "space"
line-ending = "lf"
quote-style = "double"
skip-magic-trailing-comma = false

# Severity overrides for the ty type checker.
[tool.ty.rules]
# Reported as errors:
possibly-missing-attribute = "error"
possibly-missing-import = "error"
# Reported as a warning only:
possibly-unresolved-reference = "warn"

[tool.ty.src]
# Checking the code in `scripts` requires many extra dependencies, so `scripts`
# is deliberately left out here. Instead, .py files in `scripts` could be
# checked by pre-commit whenever they are changed.
include = [
    "python/pdf_oxide",
]

# Python environment that ty uses: the `.venv` directory, given relative to
# this file.
[tool.ty.environment]
python = "./.venv"

# PDM script shortcuts. Each one is a composite of the commands listed, in
# the order given.
[tool.pdm.scripts]
# Format Rust sources, fix import ordering, then format Python sources.
fmt.composite = [
    "cargo fmt",
    "ruff check . --select I --fix",
    "ruff format .",
]
# Rust checks with warnings denied, then ruff (which applies fixes in place
# because of --fix) and the ty type checker.
lint.composite = [
    "cargo check",
    "cargo clippy -- -D warnings",
    "ruff check . --fix",
    "ty check .",
]