import logging
from pathlib import Path
import pytest
from pdf_oxide import PdfDocument
def test_open_pdf():
    """Open the simple fixture and check that version() is a pair of ints.

    Skips when opening or querying the fixture raises OSError/RuntimeError.
    """
    fixture = "tests/fixtures/simple.pdf"
    try:
        document = PdfDocument(fixture)
        assert document is not None
        pdf_version = document.version()
        assert isinstance(pdf_version, tuple)
        assert len(pdf_version) == 2
        # Both components of the pair must be plain integers.
        for component in pdf_version[:2]:
            assert isinstance(component, int)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_version():
    """Check that version() unpacks to major in 1..2 and minor in 0..7.

    Skips when the fixture cannot be opened or queried.
    """
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        major, minor = pdf.version()
        # Lower bounds first, then upper bounds.
        assert 1 <= major
        assert 0 <= minor
        assert 2 >= major
        assert 7 >= minor
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_page_count():
    """page_count() must return an int of at least 1 for the simple fixture."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        n_pages = pdf.page_count()
        assert isinstance(n_pages, int)
        assert 1 <= n_pages
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_extract_text():
    """extract_text(0) on the simple fixture must return a str."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        page_text = pdf.extract_text(0)
        assert isinstance(page_text, str)
        assert page_text is not None
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_extract_text_with_content():
    """Text extracted from hello_world.pdf must be non-empty and contain "hello".

    The match is case-insensitive. Skips when the fixture is unusable.
    """
    try:
        pdf = PdfDocument("tests/fixtures/hello_world.pdf")
        page_text = pdf.extract_text(0)
        assert isinstance(page_text, str)
        assert len(page_text) > 0
        lowered = page_text.lower()
        assert "hello" in lowered
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'hello_world.pdf' not available or invalid")
def test_to_markdown():
    """to_markdown(0) with default options must return a str."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        md = pdf.to_markdown(0)
        assert isinstance(md, str)
        assert md is not None
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_to_markdown_with_options():
    """to_markdown() must return a str for each individual keyword option tried."""
    option_sets = (
        {"detect_headings": True},
        {"detect_headings": False},
        {"preserve_layout": True},
    )
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        # Same calls, in the same order, one per option set.
        for options in option_sets:
            rendered = pdf.to_markdown(0, **options)
            assert isinstance(rendered, str)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_to_html():
    """to_html(0) with default options must return a str."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        page_html = pdf.to_html(0)
        assert isinstance(page_html, str)
        assert page_html is not None
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_to_html_semantic_mode():
    """to_html() with preserve_layout=False must return a str."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        page_html = pdf.to_html(0, preserve_layout=False)
        assert isinstance(page_html, str)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_to_html_layout_mode():
    """to_html() with preserve_layout=True must return a str.

    When the output is longer than 100 characters it must also mention
    "position" or "style" (case-insensitive).
    """
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        page_html = pdf.to_html(0, preserve_layout=True)
        assert isinstance(page_html, str)
        if len(page_html) > 100:
            lowered = page_html.lower()
            assert any(token in lowered for token in ("position", "style"))
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_to_markdown_all():
    """to_markdown_all() on the simple fixture must return a str."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        full_md = pdf.to_markdown_all()
        assert isinstance(full_md, str)
        assert full_md is not None
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_to_markdown_all_multipage():
    """to_markdown_all() on the multipage fixture must be a non-empty str.

    If the document has more than one page, the output must contain "---".
    Skips only on OSError.
    """
    try:
        pdf = PdfDocument("tests/fixtures/multipage.pdf")
        full_md = pdf.to_markdown_all()
        assert isinstance(full_md, str)
        assert len(full_md) > 0
        if pdf.page_count() > 1:
            assert "---" in full_md
    except OSError:
        pytest.skip("Test fixture 'multipage.pdf' not available")
def test_to_html_all():
    """to_html_all() on the simple fixture must return a str."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        full_html = pdf.to_html_all()
        assert isinstance(full_html, str)
        assert full_html is not None
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_to_html_all_multipage():
    """to_html_all() on the multipage fixture must be a non-empty str.

    If the document has more than one page, the output must contain either
    'class="page"' or "data-page". Skips only on OSError.
    """
    try:
        pdf = PdfDocument("tests/fixtures/multipage.pdf")
        full_html = pdf.to_html_all()
        assert isinstance(full_html, str)
        assert len(full_html) > 0
        if pdf.page_count() > 1:
            assert any(marker in full_html for marker in ('class="page"', "data-page"))
    except OSError:
        pytest.skip("Test fixture 'multipage.pdf' not available")
def test_open_pdf_pathlib():
    """Opening '1.pdf' via pathlib.Path must behave like opening it via str.

    Compares version, page count and page-0 text between the two documents.
    """
    try:
        from_path = PdfDocument(Path("tests/fixtures/1.pdf"))
        assert from_path is not None
        assert from_path.page_count() == 7, "1.pdf has 7 pages"
        pdf_version = from_path.version()
        assert isinstance(pdf_version, tuple)
        assert len(pdf_version) == 2
        assert pdf_version[0] >= 1
        # Second handle opened from a plain string path for comparison.
        from_str = PdfDocument("tests/fixtures/1.pdf")
        assert from_path.version() == from_str.version()
        assert from_path.page_count() == from_str.page_count()
        assert from_path.extract_text(0) == from_str.extract_text(0)
        assert len(from_path.extract_text(0).strip()) > 0, "1.pdf has text on page 0"
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_context_manager():
    """PdfDocument must be usable in a ``with`` statement (string path)."""
    try:
        with PdfDocument("tests/fixtures/1.pdf") as pdf:
            assert pdf is not None
            assert pdf.page_count() == 7, "1.pdf has 7 pages"
            pdf_version = pdf.version()
            assert isinstance(pdf_version, tuple)
            assert len(pdf_version) == 2
            assert pdf_version[0] >= 1
            page_text = pdf.extract_text(0)
            assert isinstance(page_text, str) and len(page_text.strip()) > 0, "1.pdf has text"
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_context_manager_with_pathlib():
    """Use PdfDocument as a context manager when built from a pathlib.Path.

    Skips when the '1.pdf' fixture cannot be opened or processed.
    """
    try:
        with PdfDocument(Path("tests/fixtures/1.pdf")) as doc:
            assert doc.page_count() == 7, "1.pdf has 7 pages"
            version = doc.version()
            assert isinstance(version, tuple) and len(version) == 2 and version[0] >= 1
            # Only checks that conversion runs without raising; output is discarded.
            _ = doc.to_markdown(0)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_context_manager_exception_propagates():
    """A ValueError raised inside the ``with PdfDocument`` body must propagate."""
    try:
        with pytest.raises(ValueError):
            with PdfDocument("tests/fixtures/1.pdf") as pdf:
                _ = pdf.page_count()
                raise ValueError("intentional")
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_from_bytes_matches_file():
    """from_bytes() on the fixture's raw bytes must match opening it by path.

    Compares version, page count and page-0 text. Like the other
    fixture-based tests in this module, it skips (rather than fails) when
    'simple.pdf' cannot be read or opened.
    """
    fixture = "tests/fixtures/simple.pdf"
    try:
        with open(fixture, "rb") as f:
            data = f.read()
        doc_path = PdfDocument(fixture)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
    # from_bytes() is the behaviour under test, so it stays outside the skip guard.
    doc_bytes = PdfDocument.from_bytes(data)
    assert doc_path.version() == doc_bytes.version()
    assert doc_path.page_count() == doc_bytes.page_count()
    assert doc_path.extract_text(0) == doc_bytes.extract_text(0)
def test_from_bytes_roundtrip():
    """A PDF built from text must be readable back via PdfDocument.from_bytes()."""
    from pdf_oxide import Pdf

    serialized = Pdf.from_text("Hello from bytes!").to_bytes()
    reopened = PdfDocument.from_bytes(serialized)
    assert reopened.page_count() >= 1
    page_text = reopened.extract_text(0)
    assert "Hello from bytes!" in page_text
def test_from_bytes_invalid():
    """from_bytes() on non-PDF data must raise IOError (an alias of OSError)."""
    garbage = b"not a pdf"
    with pytest.raises(OSError):
        PdfDocument.from_bytes(garbage)
def test_error_handling_nonexistent_file():
    """Opening a missing file must raise IOError with a recognisable message."""
    with pytest.raises(OSError) as caught:
        PdfDocument("nonexistent_file_that_does_not_exist.pdf")
    message = str(caught.value)
    expected_fragments = ("Failed to open PDF", "No such file")
    assert any(fragment in message for fragment in expected_fragments)
def test_error_handling_invalid_page():
    """extract_text() on an out-of-range page must raise RuntimeError.

    Only fixture setup (open + page_count) is covered by the skip guard. The
    call under test runs outside it, so an unexpected exception type fails the
    test instead of being reported as a skip.
    """
    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
        page_count = doc.page_count()
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
    with pytest.raises(RuntimeError) as exc_info:
        doc.extract_text(page_count + 100)
    error_msg = str(exc_info.value)
    assert "Failed to extract text" in error_msg or "page" in error_msg.lower()
def test_error_handling_invalid_page_conversion():
    """to_markdown() on an out-of-range page must raise RuntimeError.

    Only fixture setup (open + page_count) is covered by the skip guard. The
    call under test runs outside it, so an unexpected exception type fails the
    test instead of being reported as a skip.
    """
    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
        page_count = doc.page_count()
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
    with pytest.raises(RuntimeError):
        doc.to_markdown(page_count + 100)
def test_repr():
    """repr() of a document must be a str naming the class and a version field."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        text = repr(pdf)
        assert isinstance(text, str)
        for fragment in ("PdfDocument", "version="):
            assert fragment in text
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_multiple_operations():
    """Repeated calls on one document must give equal results.

    version() and extract_text(0) are each called twice and compared; then
    to_markdown(0) and to_html(0) must both return str.
    """
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        first_version = pdf.version()
        second_version = pdf.version()
        assert first_version == second_version
        first_text = pdf.extract_text(0)
        second_text = pdf.extract_text(0)
        assert first_text == second_text
        md = pdf.to_markdown(0)
        page_html = pdf.to_html(0)
        for rendered in (md, page_html):
            assert isinstance(rendered, str)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_image_output_dir():
    """to_markdown() must accept image_output_dir and include_images options.

    The image directory is a throwaway temporary directory rather than a
    path relative to the current working directory, so running the test
    cannot leave files behind in the checkout.
    """
    import tempfile

    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
        with tempfile.TemporaryDirectory() as image_dir:
            markdown = doc.to_markdown(0, image_output_dir=image_dir)
            assert isinstance(markdown, str)
        markdown = doc.to_markdown(0, include_images=False)
        assert isinstance(markdown, str)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_all_options_combined():
    """to_markdown() and to_html() must accept all keyword options together.

    The markdown call writes images (if any) into a temporary directory
    instead of "./output", so the working directory is left untouched.
    """
    import tempfile

    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
        with tempfile.TemporaryDirectory() as image_dir:
            markdown = doc.to_markdown(
                0,
                preserve_layout=True,
                detect_headings=False,
                include_images=True,
                image_output_dir=image_dir,
            )
            assert isinstance(markdown, str)
        html = doc.to_html(
            0,
            preserve_layout=True,
            detect_headings=True,
            include_images=False,
            image_output_dir=None,
        )
        assert isinstance(html, str)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_pdf_from_markdown():
    """Pdf.from_markdown() must produce non-empty bytes starting with b"%PDF"."""
    from pdf_oxide import Pdf

    md_content = """# Test Document
This is a **test** paragraph.
## Section 1
Some text content.
"""
    built = Pdf.from_markdown(md_content)
    assert built is not None
    raw = built.to_bytes()
    assert isinstance(raw, bytes)
    assert len(raw) > 0
    assert raw[:4] == b"%PDF"
def test_pdf_from_markdown_with_options():
    """Pdf.from_markdown() must accept title/author and serialize to bytes."""
    from pdf_oxide import Pdf

    metadata = {"title": "Test Title", "author": "Test Author"}
    built = Pdf.from_markdown("# Hello World", **metadata)
    assert built is not None
    raw = built.to_bytes()
    assert len(raw) > 0
def test_pdf_from_html():
    """Pdf.from_html() must produce non-empty bytes starting with b"%PDF"."""
    from pdf_oxide import Pdf

    html_content = """
<h1>Test Document</h1>
<p>This is a <strong>test</strong> paragraph.</p>
"""
    built = Pdf.from_html(html_content)
    assert built is not None
    raw = built.to_bytes()
    assert isinstance(raw, bytes)
    assert len(raw) > 0
    assert raw[:4] == b"%PDF"
def test_pdf_from_text():
    """Pdf.from_text() must produce non-empty bytes starting with b"%PDF"."""
    from pdf_oxide import Pdf

    built = Pdf.from_text("Hello, World!\n\nThis is plain text.")
    assert built is not None
    raw = built.to_bytes()
    assert len(raw) > 0
    assert raw[:4] == b"%PDF"
def test_pdf_save_to_file(tmp_path):
    """Pdf.save() must create a non-empty file at the requested path."""
    from pdf_oxide import Pdf

    target = tmp_path / "output.pdf"
    Pdf.from_text("Test content").save(str(target))
    assert target.exists()
    assert target.stat().st_size > 0
def test_color_creation():
    """Color must be constructible from components and from hex strings."""
    from pdf_oxide import Color

    from_components = Color(1.0, 0.0, 0.0)
    assert from_components is not None
    # Hex strings are tried both with and without the leading '#'.
    for hex_code in ("#FF0000", "00FF00"):
        from_hex = Color.from_hex(hex_code)
        assert from_hex is not None
def test_color_predefined():
    """Each predefined Color factory must return an object."""
    from pdf_oxide import Color

    for factory_name in ("black", "white", "red", "green", "blue"):
        # Look up and call each factory in turn, as separate statements would.
        swatch = getattr(Color, factory_name)()
        assert swatch is not None
def test_blend_modes():
    """Every listed BlendMode factory must return an object."""
    from pdf_oxide import BlendMode

    mode_names = (
        "NORMAL",
        "MULTIPLY",
        "SCREEN",
        "OVERLAY",
        "DARKEN",
        "LIGHTEN",
        "COLOR_DODGE",
        "COLOR_BURN",
        "HARD_LIGHT",
        "SOFT_LIGHT",
        "DIFFERENCE",
        "EXCLUSION",
    )
    for mode_name in mode_names:
        assert getattr(BlendMode, mode_name)() is not None
def test_ext_gstate():
    """ExtGState builder calls must chain and yield an object."""
    from pdf_oxide import BlendMode, ExtGState

    alpha_only = ExtGState().fill_alpha(0.5)
    assert alpha_only is not None

    # Build the fully-configured state step by step instead of in one chain.
    state = ExtGState().fill_alpha(0.5)
    state = state.stroke_alpha(0.8)
    state = state.blend_mode(BlendMode.MULTIPLY())
    assert state is not None
def test_ext_gstate_presets():
    """The semi_transparent preset and blend-mode states must be creatable."""
    from pdf_oxide import BlendMode, ExtGState

    assert ExtGState.semi_transparent() is not None
    assert ExtGState().blend_mode(BlendMode.MULTIPLY()) is not None
    assert ExtGState().blend_mode(BlendMode.SCREEN()) is not None
def test_linear_gradient():
    """A LinearGradient built with start/end points and two stops must exist."""
    from pdf_oxide import Color, LinearGradient

    builder = LinearGradient()
    builder = builder.start(0.0, 0.0)
    builder = builder.end(100.0, 100.0)
    builder = builder.add_stop(0.0, Color.red())
    builder = builder.add_stop(1.0, Color.blue())
    assert builder is not None
def test_linear_gradient_presets():
    """horizontal(), vertical() and a stop-only gradient must all be creatable."""
    from pdf_oxide import Color, LinearGradient

    horizontal = LinearGradient.horizontal(100.0, Color.black(), Color.white())
    assert horizontal is not None

    vertical = LinearGradient.vertical(100.0, Color.black(), Color.white())
    assert vertical is not None

    stops_only = LinearGradient().add_stop(0.0, Color.black())
    stops_only = stops_only.add_stop(1.0, Color.white())
    assert stops_only is not None
def test_radial_gradient():
    """A RadialGradient built with inner/outer circles and two stops must exist."""
    from pdf_oxide import Color, RadialGradient

    builder = RadialGradient()
    builder = builder.inner_circle(50.0, 50.0, 0.0)
    builder = builder.outer_circle(50.0, 50.0, 50.0)
    builder = builder.add_stop(0.0, Color.white())
    builder = builder.add_stop(1.0, Color.black())
    assert builder is not None
def test_radial_gradient_centered():
    """RadialGradient.centered() must return an object."""
    from pdf_oxide import RadialGradient

    assert RadialGradient.centered(50.0, 50.0, 50.0) is not None
def test_line_cap():
    """Each LineCap factory (BUTT, ROUND, SQUARE) must return an object."""
    from pdf_oxide import LineCap

    for cap_name in ("BUTT", "ROUND", "SQUARE"):
        assert getattr(LineCap, cap_name)() is not None
def test_line_join():
    """Each LineJoin factory (MITER, ROUND, BEVEL) must return an object."""
    from pdf_oxide import LineJoin

    for join_name in ("MITER", "ROUND", "BEVEL"):
        assert getattr(LineJoin, join_name)() is not None
def test_pattern_presets():
    """Every PatternPresets factory must return non-empty bytes."""
    from pdf_oxide import Color, PatternPresets

    # Thunks keep each factory call lazy, so presets are still built and
    # checked one at a time, in the listed order.
    builders = (
        lambda: PatternPresets.horizontal_stripes(10.0, 10.0, 5.0, Color.red()),
        lambda: PatternPresets.vertical_stripes(10.0, 10.0, 5.0, Color.blue()),
        lambda: PatternPresets.checkerboard(10.0, Color.white(), Color.black()),
        lambda: PatternPresets.dots(10.0, 2.0, Color.red()),
        lambda: PatternPresets.diagonal_lines(10.0, 0.5, Color.black()),
        lambda: PatternPresets.crosshatch(10.0, 0.5, Color.black()),
    )
    for build in builders:
        stream = build()
        assert isinstance(stream, bytes)
        assert len(stream) > 0
def test_extract_images():
    """extract_images(0) must return a list of dicts with the expected keys."""
    required_keys = ("width", "height", "color_space")
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        found = pdf.extract_images(0)
        assert isinstance(found, list)
        for image in found:
            assert isinstance(image, dict)
            for key in required_keys:
                assert key in image
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_extract_spans():
    """extract_spans(0) must return a list whose items expose span attributes."""
    expected_attrs = ("text", "bbox", "font_name", "font_size", "is_bold", "is_italic")
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        found = pdf.extract_spans(0)
        assert isinstance(found, list)
        for item in found:
            for attr in expected_attrs:
                assert hasattr(item, attr)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_extract_spans_repr():
    """repr() of the first extracted span, if any, must mention "TextSpan"."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        found = pdf.extract_spans(0)
        # Nothing to check when the page yields no spans.
        if found:
            assert "TextSpan" in repr(found[0])
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_get_outline():
    """get_outline() must return None or a list of dicts with title/children."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        toc = pdf.get_outline()
        assert toc is None or isinstance(toc, list)
        # A None or empty outline simply yields no entries to inspect.
        for entry in toc or []:
            assert isinstance(entry, dict)
            assert "title" in entry
            assert "children" in entry
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_get_annotations():
    """get_annotations(0) must return a list of dicts that carry "subtype"."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        found = pdf.get_annotations(0)
        assert isinstance(found, list)
        for annotation in found:
            assert isinstance(annotation, dict)
            assert "subtype" in annotation
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_extract_paths():
    """extract_paths(0) must return a list of dicts that carry "bbox"."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        found = pdf.extract_paths(0)
        assert isinstance(found, list)
        for vector_path in found:
            assert isinstance(vector_path, dict)
            assert "bbox" in vector_path
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_extract_paths_operations():
    """Each path from '1.pdf' must list well-formed drawing operations.

    Every path dict needs an "operations" list whose length equals
    "operations_count"; every operation is a dict with a known "op" value
    and the coordinate keys that operation type requires.
    """
    # Keys each operation type must carry; close_path needs none.
    required_keys = {
        "move_to": ("x", "y"),
        "line_to": ("x", "y"),
        "curve_to": ("cx1", "cy1", "cx2", "cy2", "x", "y"),
        "rectangle": ("x", "y", "width", "height"),
        "close_path": (),
    }
    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        found = pdf.extract_paths(0)
        assert isinstance(found, list)
        assert len(found) > 0, "Expected at least one path"
        for vector_path in found:
            assert "operations" in vector_path, "Path dict should contain 'operations' field"
            operations = vector_path["operations"]
            assert isinstance(operations, list)
            assert len(operations) == vector_path["operations_count"]
            for step in operations:
                assert isinstance(step, dict)
                assert "op" in step, "Each operation should have an 'op' field"
                kind = step["op"]
                assert kind in ("move_to", "line_to", "curve_to", "rectangle", "close_path")
                assert all(key in step for key in required_keys[kind])
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_images_invalid_page():
    """extract_images() on page 999 must raise RuntimeError.

    Only opening the fixture is covered by the skip guard, so an unexpected
    exception type from the call under test fails instead of skipping.
    """
    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
    with pytest.raises(RuntimeError):
        doc.extract_images(999)
def test_extract_spans_invalid_page():
    """extract_spans() on page 999 must raise RuntimeError.

    Only opening the fixture is covered by the skip guard, so an unexpected
    exception type from the call under test fails instead of skipping.
    """
    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
    with pytest.raises(RuntimeError):
        doc.extract_spans(999)
def test_get_annotations_invalid_page():
    """get_annotations() on page 999 must raise RuntimeError.

    Only opening the fixture is covered by the skip guard, so an unexpected
    exception type from the call under test fails instead of skipping.
    """
    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
    with pytest.raises(RuntimeError):
        doc.get_annotations(999)
def test_extract_paths_invalid_page():
    """extract_paths() on page 999 must raise RuntimeError.

    Only opening the fixture is covered by the skip guard, so an unexpected
    exception type from the call under test fails instead of skipping.
    """
    try:
        doc = PdfDocument("tests/fixtures/simple.pdf")
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
    with pytest.raises(RuntimeError):
        doc.extract_paths(999)
def test_extract_image_bytes_empty():
    """extract_image_bytes(0) must return a list of well-formed image dicts.

    Each entry needs width/height/format/data keys, bytes data and the
    format string "png". An empty list passes trivially.
    """
    required_keys = ("width", "height", "format", "data")
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        extracted = pdf.extract_image_bytes(0)
        assert isinstance(extracted, list)
        for image in extracted:
            assert isinstance(image, dict)
            for key in required_keys:
                assert key in image
            assert isinstance(image["data"], bytes)
            assert image["format"] == "png"
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_pdf_from_image_bytes():
    """Pdf.from_image_bytes() must turn in-memory image data into a PDF."""
    from pdf_oxide import Pdf

    built = Pdf.from_image_bytes(_create_minimal_png())
    assert built is not None
    raw = built.to_bytes()
    assert len(raw) > 0
    assert raw[:4] == b"%PDF"
def test_pdf_from_image(tmp_path):
    """Pdf.from_image() must build a PDF from an image file on disk."""
    from pdf_oxide import Pdf

    image_file = tmp_path / "test.jpg"
    image_file.write_bytes(_create_minimal_png())
    built = Pdf.from_image(str(image_file))
    assert built is not None
    assert len(built.to_bytes()) > 0
def test_pdf_from_images(tmp_path):
    """Pdf.from_images() must build a PDF from a list of image file paths."""
    from pdf_oxide import Pdf

    image_files = [tmp_path / "img1.jpg", tmp_path / "img2.jpg"]
    for image_file in image_files:
        image_file.write_bytes(_create_minimal_png())
    built = Pdf.from_images([str(image_file) for image_file in image_files])
    assert built is not None
    assert len(built.to_bytes()) > 0
def _create_minimal_png():
    """Return the 336 bytes of a tiny hard-coded test image.

    NOTE: despite the name, the payload is a JPEG stream: it opens with the
    SOI marker and a "JFIF" APP0 header (FF D8 FF E0 ... 4A 46 49 46) and
    closes with the EOI marker (FF D9). The name is kept because other
    tests in this module call it.
    """
    hex_rows = (
        # SOI + APP0 ("JFIF") header
        "FF D8 FF E0 00 10 4A 46 49 46 00 01 01 00 00 01 00 01 00 00",
        # DQT segment (FF DB), one 64-entry table
        "FF DB 00 43 00",
        "08 06 06 07 06 05 08 07",
        "07 07 09 09 08 0A 0C 14",
        "0D 0C 0B 0B 0C 19 12 13",
        "0F 14 1D 1A 1F 1E 1D 1A",
        "1C 1C 20 24 2E 27 20 22",
        "2C 23 1C 1C 28 37 29 2C",
        "30 31 34 34 34 1F 27 39",
        "3D 38 32 3C 2E 33 34 32",
        # SOF0 segment (FF C0)
        "FF C0 00 0B 08 00 01 00 01 01 01 11 00",
        # First DHT segment (FF C4)
        "FF C4 00 1F 00",
        "00 01 05 01 01 01 01 01 01 00 00 00 00 00 00 00",
        "00 01 02 03 04 05 06 07 08 09 0A 0B",
        # Second DHT segment (FF C4)
        "FF C4 00 B5 10",
        "00 02 01 03 03 02 04 03 05 05 04 04 00 00 01 7D",
        "01 02 03 00 04 11 05 12",
        "21 31 41 06 13 51 61 07",
        "22 71 14 32 81 91 A1 08",
        "23 42 B1 C1 15 52 D1 F0",
        "24 33 62 72 82 09 0A 16",
        "17 18 19 1A 25 26 27 28",
        "29 2A 34 35 36 37 38 39",
        "3A 43 44 45 46 47 48 49",
        "4A 53 54 55 56 57 58 59",
        "5A 63 64 65 66 67 68 69",
        "6A 73 74 75 76 77 78 79",
        "7A 83 84 85 86 87 88 89",
        "8A 92 93 94 95 96 97 98",
        "99 9A A2 A3 A4 A5 A6 A7",
        "A8 A9 AA B2 B3 B4 B5 B6",
        "B7 B8 B9 BA C2 C3 C4 C5",
        "C6 C7 C8 C9 CA D2 D3 D4",
        "D5 D6 D7 D8 D9 DA E1 E2",
        "E3 E4 E5 E6 E7 E8 E9 EA",
        "F1 F2 F3 F4 F5 F6 F7 F8",
        "F9 FA",
        # SOS header (FF DA) followed by the entropy-coded data
        "FF DA 00 08 01 01 00 00 3F 00",
        "FB D5 DB 20 A8 F9",
        # EOI marker
        "FF D9",
    )
    return bytes.fromhex(" ".join(hex_rows))
def test_flatten_forms():
    """flatten_forms() must run on the simple fixture without other errors."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        # No return value is inspected; the call just has to complete.
        pdf.flatten_forms()
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_merge_from_bytes():
    """merge_from() given another PDF's bytes must report one merged page."""
    import os
    import tempfile

    from pdf_oxide import Pdf

    base_pdf = Pdf.from_text("Page 1")
    extra_pdf = Pdf.from_text("Page 2")
    # Reserve a path on disk; the handle is closed before the path is used
    # and the file is removed in the ``finally`` clause.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as handle:
        saved_path = handle.name
    try:
        base_pdf.save(saved_path)
        doc = PdfDocument(saved_path)
        merged_pages = doc.merge_from(extra_pdf.to_bytes())
        assert merged_pages == 1, "Should merge 1 page"
    finally:
        os.unlink(saved_path)
def test_embed_file():
    """embed_file() must accept a file name and a bytes payload."""
    attachment_name = "readme.txt"
    attachment_data = b"Hello embedded file"
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        pdf.embed_file(attachment_name, attachment_data)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_page_labels():
    """page_labels() must return a list of dicts with start_page and style."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        label_ranges = pdf.page_labels()
        assert isinstance(label_ranges, list)
        for entry in label_ranges:
            assert isinstance(entry, dict)
            for key in ("start_page", "style"):
                assert key in entry
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
def test_xmp_metadata():
    """xmp_metadata() must return either None or a dict."""
    try:
        pdf = PdfDocument("tests/fixtures/simple.pdf")
        xmp = pdf.xmp_metadata()
        if xmp is not None:
            assert isinstance(xmp, dict)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture 'simple.pdf' not available or invalid")
class _PdfOxideLogCapture(logging.Handler):
    """Logging handler that keeps every record it receives in ``self.records``."""

    def __init__(self):
        # The handler itself is set to DEBUG, the lowest standard level.
        logging.Handler.__init__(self, level=logging.DEBUG)
        # Records are stored in arrival order.
        self.records = []

    def emit(self, record: logging.LogRecord) -> None:
        """Store *record* at the end of the in-memory list."""
        captured = self.records
        captured.append(record)
def _capture_pdf_oxide_logs():
    """Attach a capturing handler to the "pdf_oxide" logger at DEBUG level.

    Propagation is switched off while capturing. Returns a 4-tuple of
    (logger, handler, previous level, previous propagate flag) so the caller
    can restore the logger afterwards.
    """
    capture = _PdfOxideLogCapture()
    oxide_logger = logging.getLogger("pdf_oxide")
    oxide_logger.addHandler(capture)
    # Remember the current settings before overriding them.
    saved_level, saved_propagate = oxide_logger.level, oxide_logger.propagate
    oxide_logger.setLevel(logging.DEBUG)
    oxide_logger.propagate = False
    return oxide_logger, capture, saved_level, saved_propagate
def test_log_level_issue_283_regression():
    """Regression test for issue #283: records below ERROR must be suppressed.

    Extracts text from every page of '1.pdf' twice while capturing the
    "pdf_oxide" Python logger: first with the library log level set to
    "debug" (at least one DEBUG record must arrive), then with it set to
    "error" (no record below ERROR may arrive). Skips when the fixture
    cannot be opened.
    """
    import pdf_oxide

    # Only opening the fixture is allowed to turn into a skip.
    try:
        doc = PdfDocument("tests/fixtures/1.pdf")
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
    # Remember the library log level and logger settings so `finally` can restore them.
    prev_rust_level = pdf_oxide.get_log_level()
    logger, handler, prev_level, prev_propagate = _capture_pdf_oxide_logs()
    try:
        # Phase 1: at "debug", extraction has to produce DEBUG records.
        pdf_oxide.set_log_level("debug")
        for page in range(doc.page_count()):
            doc.extract_text(page)
        debug_records = [r for r in handler.records if r.levelno == logging.DEBUG]
        assert debug_records, (
            "expected at least one DEBUG record from pdf_oxide at DEBUG level — "
            "if this fails, the pyo3_log bridge is broken and the suppression "
            "assertion below would pass vacuously"
        )
        # Start phase 2 from an empty capture list.
        handler.records.clear()
        # Phase 2: at "error", nothing below ERROR may reach the handler.
        pdf_oxide.set_log_level("error")
        for page in range(doc.page_count()):
            doc.extract_text(page)
        leaked = [r for r in handler.records if r.levelno < logging.ERROR]
        assert not leaked, (
            f"DEBUG/TRACE/INFO/WARN records leaked at ERROR level (regression "
            f"of #283): {[(r.name, r.levelname, r.getMessage()) for r in leaked[:5]]}"
        )
    finally:
        # Undo every global change made above, whether or not the assertions passed.
        pdf_oxide.set_log_level(prev_rust_level)
        logger.removeHandler(handler)
        logger.setLevel(prev_level)
        logger.propagate = prev_propagate
def test_extract_words_basic():
    """extract_words(0) on '1.pdf' must return non-empty words with text and bbox."""
    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        found = pdf.extract_words(0)
        assert isinstance(found, list)
        assert len(found) > 0
        for word in found:
            for attr in ("text", "bbox"):
                assert hasattr(word, attr)
            assert isinstance(word.text, str)
            assert len(word.text) > 0
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_words_with_threshold():
    """A word_gap_threshold of 0.5 must yield at least as many words as the default."""
    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        baseline = pdf.extract_words(0)
        tight = pdf.extract_words(0, word_gap_threshold=0.5)
        assert isinstance(tight, list)
        assert len(tight) > 0
        assert len(baseline) <= len(tight)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_words_with_region_and_threshold():
    """Restricting extract_words() to a region must not yield more words."""
    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        in_region = pdf.extract_words(0, region=(0, 0, 300, 400), word_gap_threshold=2.0)
        assert isinstance(in_region, list)
        whole_page = pdf.extract_words(0, word_gap_threshold=2.0)
        assert len(whole_page) >= len(in_region)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_text_lines_basic():
    """extract_text_lines(0) on '1.pdf' must return non-empty lines with text and bbox."""
    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        found = pdf.extract_text_lines(0)
        assert isinstance(found, list)
        assert len(found) > 0
        for text_line in found:
            for attr in ("text", "bbox"):
                assert hasattr(text_line, attr)
            assert isinstance(text_line.text, str)
            assert len(text_line.text) > 0
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_text_lines_with_thresholds():
    """extract_text_lines() must accept explicit word and line gap thresholds."""
    thresholds = {"word_gap_threshold": 2.0, "line_gap_threshold": 5.0}
    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        found = pdf.extract_text_lines(0, **thresholds)
        assert isinstance(found, list)
        assert len(found) > 0
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_page_layout_params():
    """page_layout_params(0) must expose the layout fields with positive metrics.

    All six attributes must exist; the two thresholds, the median character
    width and the median font size must be > 0; repr() must mention
    "LayoutParams".
    """
    expected_attrs = (
        "word_gap_threshold",
        "line_gap_threshold",
        "median_char_width",
        "median_font_size",
        "median_line_spacing",
        "column_count",
    )
    # The first four of those must additionally be strictly positive.
    positive_attrs = expected_attrs[:4]
    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        layout = pdf.page_layout_params(0)
        for attr in expected_attrs:
            assert hasattr(layout, attr)
        for attr in positive_attrs:
            assert getattr(layout, attr) > 0
        assert "LayoutParams" in repr(layout)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_page_layout_params_invalid_page():
    """page_layout_params() on page 9999 must raise RuntimeError.

    Only opening the fixture is covered by the skip guard, so an unexpected
    exception type from the call under test fails instead of skipping.
    """
    try:
        doc = PdfDocument("tests/fixtures/1.pdf")
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
    with pytest.raises(RuntimeError):
        doc.page_layout_params(9999)
def test_extraction_profile_inspect():
    """The "Form" profile must expose typed tuning fields and a useful repr."""
    from pdf_oxide import ExtractionProfile

    float_fields = (
        "tj_offset_threshold",
        "word_margin_ratio",
        "space_threshold_em_ratio",
        "space_char_multiplier",
    )
    form_profile = ExtractionProfile.form()
    assert form_profile.name == "Form"
    for field in float_fields:
        assert isinstance(getattr(form_profile, field), float)
    assert isinstance(form_profile.use_adaptive_threshold, bool)
    described = repr(form_profile)
    for fragment in ("ExtractionProfile", "Form"):
        assert fragment in described
def test_extraction_profile_available():
    """available() must list at least nine profile names, including two known ones."""
    from pdf_oxide import ExtractionProfile

    profile_names = ExtractionProfile.available()
    assert isinstance(profile_names, list)
    assert len(profile_names) >= 9
    for expected in ("Form", "Academic"):
        assert expected in profile_names
def test_extraction_profile_all_constructors():
    """Every listed ExtractionProfile constructor must yield a named profile."""
    from pdf_oxide import ExtractionProfile

    factories = (
        ExtractionProfile.conservative,
        ExtractionProfile.aggressive,
        ExtractionProfile.balanced,
        ExtractionProfile.academic,
        ExtractionProfile.policy,
        ExtractionProfile.form,
        ExtractionProfile.government,
        ExtractionProfile.scanned_ocr,
        ExtractionProfile.adaptive,
    )
    for make_profile in factories:
        built = make_profile()
        assert isinstance(built.name, str)
        assert len(built.name) > 0
def test_extract_words_with_profile():
    """extract_words() must accept the form profile and return str-text words."""
    from pdf_oxide import ExtractionProfile

    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        found = pdf.extract_words(0, profile=ExtractionProfile.form())
        assert isinstance(found, list)
        assert len(found) > 0
        for word in found:
            assert isinstance(word.text, str)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_text_lines_with_profile():
    """extract_text_lines() must accept the academic profile and return str-text lines."""
    from pdf_oxide import ExtractionProfile

    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        found = pdf.extract_text_lines(0, profile=ExtractionProfile.academic())
        assert isinstance(found, list)
        assert len(found) > 0
        for text_line in found:
            assert isinstance(text_line.text, str)
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_words_profile_and_threshold():
    """extract_words() must accept a profile together with a word gap threshold."""
    from pdf_oxide import ExtractionProfile

    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        aggressive = ExtractionProfile.aggressive()
        found = pdf.extract_words(0, word_gap_threshold=1.5, profile=aggressive)
        assert isinstance(found, list)
        assert len(found) > 0
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")
def test_extract_text_lines_profile_and_thresholds():
    """extract_text_lines() must accept a profile together with both gap thresholds."""
    from pdf_oxide import ExtractionProfile

    try:
        pdf = PdfDocument("tests/fixtures/1.pdf")
        policy_profile = ExtractionProfile.policy()
        found = pdf.extract_text_lines(
            0, word_gap_threshold=2.0, line_gap_threshold=5.0, profile=policy_profile
        )
        assert isinstance(found, list)
        assert len(found) > 0
    except (OSError, RuntimeError):
        pytest.skip("Test fixture '1.pdf' not available or invalid")