import json
import os
import subprocess
import sys
import tempfile
import time
import difflib
import pdfplumber
# Directory containing this benchmark script.  The path is made absolute first
# because ``os.path.dirname`` of a bare relative ``__file__`` (e.g. "bench.py")
# is "", which would make RUST_BIN resolve inside the benchmark directory
# instead of its parent.
BENCH_DIR = os.path.dirname(os.path.abspath(__file__))
# Sample PDFs live in a "pdfs" folder next to this script.
PDF_DIR = os.path.join(BENCH_DIR, "pdfs")
# Release build of the Rust CLI, looked up one level above BENCH_DIR.
RUST_BIN = os.path.join(os.path.dirname(BENCH_DIR), "target", "release", "pdfsink-rs")
# File names (relative to PDF_DIR) that the benchmark tries to exercise.
TEST_PDFS = ["simple_text.pdf", "table_lines.pdf", "irs_w9.pdf", "un_charter.pdf", "census_table.pdf"]
# Status labels stored in every result record.
PASS = "PASS"
FAIL = "FAIL"
SKIP = "SKIP"
def timed_run(cmd, timeout=60):
    """Run *cmd* as a subprocess and measure how long it takes.

    Returns a 4-tuple ``(seconds, stdout, stderr, returncode)`` with the
    output captured as text.  If the process does not finish within
    *timeout* seconds the tuple is ``(timeout, "", "TIMEOUT", -1)``.
    """
    started = time.perf_counter()
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        # Report the full timeout as the elapsed time for a timed-out run.
        return timeout, "", "TIMEOUT", -1
    duration = time.perf_counter() - started
    return duration, proc.stdout, proc.stderr, proc.returncode
def rust_cmd(command, pdf, *extra):
    """Build the argv list for one invocation of the Rust CLI.

    The result is ``[RUST_BIN, command, <PDF_DIR>/<pdf>]`` followed by any
    *extra* positional arguments, in order.
    """
    pdf_path = os.path.join(PDF_DIR, pdf)
    argv = [RUST_BIN, command, pdf_path]
    argv.extend(extra)
    return argv
def similarity(a, b):
    """Return a similarity ratio between sequences *a* and *b*.

    Two empty inputs count as identical (1.0); exactly one empty input counts
    as completely different (0.0).  Otherwise the value is
    ``difflib.SequenceMatcher(None, a, b).ratio()``.
    """
    if a and b:
        matcher = difflib.SequenceMatcher(None, a, b)
        return matcher.ratio()
    # At least one side is empty: they match only if both are.
    return 0.0 if (a or b) else 1.0
def strip_ws(s):
    """Return *s* with all whitespace removed."""
    # ``split()`` with no argument breaks on any whitespace run and drops it.
    tokens = s.split()
    return "".join(tokens)
def fmt_time(s):
    """Format a duration given in seconds as a short human-readable string.

    Values below 1 ms are shown in whole microseconds, values below 1 s in
    milliseconds with one decimal, and everything else in seconds with two
    decimals.
    """
    # (exclusive upper bound in seconds, multiplier, decimals, unit suffix)
    scales = (
        (0.001, 1e6, 0, "us"),
        (1, 1000, 1, "ms"),
    )
    for limit, factor, decimals, unit in scales:
        if s < limit:
            return f"{s * factor:.{decimals}f}{unit}"
    return f"{s:.2f}s"
def speedup_str(rust_s, py_s):
    """Return the Python/Rust time ratio formatted like ``"3.2x"``.

    Returns ``"N/A"`` when either duration is zero, since no meaningful
    ratio exists in that case.
    """
    if rust_s != 0 and py_s != 0:
        ratio = py_s / rust_s
        return f"{ratio:.1f}x"
    return "N/A"
# One dict per recorded check, in the order the checks were recorded.
results = []
def record(pdf, command, status, rust_time, py_time, detail=""):
    """Append one benchmark outcome to the module-level ``results`` list.

    Each entry is a dict with the keys ``pdf``, ``command``, ``status``,
    ``rust_time``, ``py_time`` and ``detail``, holding the arguments as given.
    """
    entry = dict(
        pdf=pdf,
        command=command,
        status=status,
        rust_time=rust_time,
        py_time=py_time,
        detail=detail,
    )
    results.append(entry)
def test_info(pdf):
    """Compare the Rust ``info`` command against pdfplumber page metadata.

    Records one ``info/page_count`` result, then one ``info/pageN_dims``
    result for each of the first three pages that both tools report.  A
    non-zero Rust exit code records a single ``info`` failure instead.
    """
    rt, r_out, _, rc = timed_run(rust_cmd("info", pdf))
    if rc != 0:
        record(pdf, "info", FAIL, rt, 0, f"rust exit code {rc}")
        return
    # Parse before opening the PDF so malformed output never leaves it open.
    r_data = json.loads(r_out)
    t0 = time.perf_counter()
    # The context manager closes the PDF even if a comparison below raises.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        py_time = time.perf_counter() - t0
        py_pages = p.pages
        if r_data["page_count"] != len(py_pages):
            record(pdf, "info/page_count", FAIL, rt, py_time,
                   f"rust={r_data['page_count']} py={len(py_pages)}")
        else:
            record(pdf, "info/page_count", PASS, rt, py_time,
                   f"{r_data['page_count']} pages")
        # Only the first three pages are checked.  ``zip`` stops at the
        # shorter side, so a page-count mismatch (already recorded above)
        # cannot turn into an IndexError here.
        for i, (rp, pp) in enumerate(zip(r_data["pages"][:3], py_pages)):
            w_ok = abs(rp["width"] - float(pp.width)) < 0.1
            h_ok = abs(rp["height"] - float(pp.height)) < 0.1
            rot_ok = rp["rotation"] == (pp.rotation or 0)
            if w_ok and h_ok and rot_ok:
                record(pdf, f"info/page{i+1}_dims", PASS, 0, 0,
                       f"{rp['width']}x{rp['height']} rot={rp['rotation']}")
            else:
                record(pdf, f"info/page{i+1}_dims", FAIL, 0, 0,
                       f"rust={rp['width']}x{rp['height']}r{rp['rotation']} py={pp.width}x{pp.height}r{pp.rotation}")
def test_text(pdf):
    """Compare first-page text from the Rust ``text`` command and pdfplumber.

    The check passes when the raw similarity ratio exceeds 0.95.  The
    whitespace-insensitive similarity and both text lengths are included in
    the detail string only.
    """
    rt, r_out, _, rc = timed_run(rust_cmd("text", pdf, "1"))
    if rc != 0:
        record(pdf, "text", FAIL, rt, 0, f"rust exit code {rc}")
        return
    t0 = time.perf_counter()
    # ``with`` guarantees the PDF is closed even if extraction raises.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        py_text = p.pages[0].extract_text() or ""
        py_time = time.perf_counter() - t0
    r_text = r_out.rstrip("\n")
    sim = similarity(r_text, py_text)
    content_sim = similarity(strip_ws(r_text), strip_ws(py_text))
    status = PASS if sim > 0.95 else FAIL
    record(pdf, "text", status, rt, py_time,
           f"sim={sim:.1%} content_sim={content_sim:.1%} rust_len={len(r_text)} py_len={len(py_text)}")
def test_words(pdf):
    """Compare first-page word extraction between the Rust CLI and pdfplumber.

    Pass/fail depends only on the word counts matching.  Whether the first
    ten word texts agree, and whether the first five same-text words sit
    within one unit in ``x0`` and ``top``, is reported in the detail string.
    """
    rt, r_out, _, rc = timed_run(rust_cmd("words", pdf, "1"))
    if rc != 0:
        record(pdf, "words", FAIL, rt, 0, f"rust exit code {rc}")
        return
    t0 = time.perf_counter()
    # ``with`` guarantees the PDF is closed even if extraction raises.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        py_words = p.pages[0].extract_words()
        py_time = time.perf_counter() - t0
    r_words = json.loads(r_out)
    count_match = len(r_words) == len(py_words)
    r_texts = [w["text"] for w in r_words[:10]]
    p_texts = [w["text"] for w in py_words[:10]]
    pos_ok = True
    pos_detail = []
    for rw, pw in zip(r_words[:5], py_words[:5]):
        # Positions are only comparable when both tools produced the same word.
        if rw["text"] != pw["text"]:
            continue
        x0_ok = abs(rw["x0"] - pw["x0"]) < 1.0
        top_ok = abs(rw["top"] - pw["top"]) < 1.0
        if not (x0_ok and top_ok):
            pos_ok = False
            pos_detail.append(f"word '{rw['text']}': rust=({rw['x0']:.1f},{rw['top']:.1f}) py=({pw['x0']:.1f},{pw['top']:.1f})")
    status = PASS if count_match else FAIL
    detail = f"rust={len(r_words)} py={len(py_words)} texts_match={r_texts==p_texts} pos_ok={pos_ok}"
    if pos_detail:
        detail += " " + "; ".join(pos_detail[:3])
    record(pdf, "words", status, rt, py_time, detail)
def test_search(pdf):
    """Compare first-page search results between the Rust CLI and pdfplumber.

    The search term is the first purely alphabetic token of at least four
    characters in pdfplumber's first-page text; the check is recorded as
    SKIP when no such token exists.  Pass/fail depends on the match counts
    being equal; agreement of the first match position (within two units)
    is reported in the detail string.
    """
    # ``with`` closes the PDF on every exit path, including exceptions.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        page = p.pages[0]
        text = page.extract_text() or ""
        search_word = next(
            (w for w in text.split() if len(w) >= 4 and w.isalpha()), None
        )
        if not search_word:
            record(pdf, "search", SKIP, 0, 0, "no suitable search term")
            return
        rt, r_out, _, rc = timed_run(rust_cmd("search", pdf, "1", search_word))
        if rc != 0:
            record(pdf, "search", FAIL, rt, 0, f"rust exit code {rc}")
            return
        t0 = time.perf_counter()
        py_matches = page.search(search_word)
        py_time = time.perf_counter() - t0
        r_matches = json.loads(r_out)
        count_match = len(r_matches) == len(py_matches)
        status = PASS if count_match else FAIL
        detail = f"pattern='{search_word}' rust={len(r_matches)} py={len(py_matches)}"
        if r_matches and py_matches:
            r0, p0 = r_matches[0], py_matches[0]
            x0_ok = abs(r0["x0"] - p0["x0"]) < 2.0
            top_ok = abs(r0["top"] - p0["top"]) < 2.0
            detail += f" first_pos_ok={x0_ok and top_ok}"
        record(pdf, "search", status, rt, py_time, detail)
def test_objects(pdf):
    """Compare first-page object counts between the Rust CLI and pdfplumber.

    Pass/fail depends on the counts of chars, lines, rects, curves and images
    all matching.  The detail string also reports whether the first five
    characters agree in text and ``x0`` (within 0.5), plus annotation and
    hyperlink counts from both sides.
    """
    rt, r_out, _, rc = timed_run(rust_cmd("objects", pdf, "1"))
    if rc != 0:
        record(pdf, "objects", FAIL, rt, 0, f"rust exit code {rc}")
        return
    kinds = ("chars", "lines", "rects", "curves", "images")
    t0 = time.perf_counter()
    # ``with`` guarantees the PDF is closed even if a comparison raises.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        page = p.pages[0]
        py_counts = {
            "chars": len(page.chars),
            "lines": len(page.lines),
            "rects": len(page.rects),
            "curves": len(page.curves),
            "images": len(page.images),
        }
        py_time = time.perf_counter() - t0
        r_data = json.loads(r_out)
        r_counts = {kind: len(r_data.get(kind, [])) for kind in kinds}
        mismatches = [
            f"{kind}:{r_counts[kind]}vs{py_counts[kind]}"
            for kind in kinds
            if r_counts[kind] != py_counts[kind]
        ]
        char_pos_ok = True
        if r_data.get("chars") and page.chars:
            # Spot-check the first few characters; stop at the first mismatch.
            for rc_char, pc_char in zip(r_data["chars"][:5], page.chars[:5]):
                if rc_char["text"] != pc_char["text"]:
                    char_pos_ok = False
                    break
                if abs(rc_char["x0"] - pc_char["x0"]) > 0.5:
                    char_pos_ok = False
                    break
        r_annots = len(r_data.get("annots", []))
        r_links = len(r_data.get("hyperlinks", []))
        py_annots = len(page.annots) if hasattr(page, 'annots') else 0
        py_links = len(page.hyperlinks) if hasattr(page, 'hyperlinks') else 0
        status = PASS if not mismatches else FAIL
        detail = f"counts={'match' if not mismatches else ','.join(mismatches)} chars_pos={char_pos_ok} annots:{r_annots}vs{py_annots} links:{r_links}vs{py_links}"
        record(pdf, "objects", status, rt, py_time, detail)
def test_links(pdf):
    """Compare first-page hyperlinks between the Rust CLI and pdfplumber.

    Pass/fail depends on the link counts matching; whether the sorted URI
    lists are equal is reported in the detail string.
    """
    rt, r_out, _, rc = timed_run(rust_cmd("links", pdf, "1"))
    if rc != 0:
        record(pdf, "links", FAIL, rt, 0, f"rust exit code {rc}")
        return
    t0 = time.perf_counter()
    # ``with`` guarantees the PDF is closed even if link extraction raises.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        first_page = p.pages[0]
        py_links = first_page.hyperlinks if hasattr(first_page, 'hyperlinks') else []
        py_time = time.perf_counter() - t0
        r_links = json.loads(r_out)
        count_match = len(r_links) == len(py_links)
        r_uris = sorted(link.get("uri", "") for link in r_links)
        p_uris = sorted(link.get("uri", "") for link in py_links)
        uri_match = r_uris == p_uris
        status = PASS if count_match else FAIL
        detail = f"rust={len(r_links)} py={len(py_links)} uris_match={uri_match}"
        record(pdf, "links", status, rt, py_time, detail)
def test_table(pdf):
    """Compare first-page table extraction for the lines and text strategies.

    ``table/lines`` passes when both tools return no table, or when row and
    first-row column counts match; flattened cell-content similarity is
    reported in the detail string.  ``table/text`` passes when both return
    no table or the row counts match.  A non-zero Rust exit code is recorded
    as FAIL for the strategy being run and ends the test.
    """
    pdf_path = os.path.join(PDF_DIR, pdf)

    # --- strategy: lines -------------------------------------------------
    rt, r_out, _, rc = timed_run(rust_cmd("table", pdf, "1", "lines"))
    if rc != 0:
        record(pdf, "table/lines", FAIL, rt, 0, f"rust exit code {rc}")
        return
    t0 = time.perf_counter()
    # ``with`` guarantees the PDF is closed even if extraction raises.
    with pdfplumber.open(pdf_path) as p:
        py_table = p.pages[0].extract_table()
        py_time = time.perf_counter() - t0
    r_table = json.loads(r_out)
    if r_table is None and py_table is None:
        record(pdf, "table/lines", PASS, rt, py_time, "both null")
    elif r_table is None or py_table is None:
        r_desc = "null" if r_table is None else f"{len(r_table)}rows"
        p_desc = "null" if py_table is None else f"{len(py_table)}rows"
        record(pdf, "table/lines", FAIL, rt, py_time,
               f"rust={r_desc} py={p_desc}")
    else:
        # An empty table has no first row; treat its column count as 0.
        r_cols = len(r_table[0]) if r_table else 0
        p_cols = len(py_table[0]) if py_table else 0
        rows_match = len(r_table) == len(py_table)
        cols_match = r_cols == p_cols
        data_sim = 1.0
        if r_table and py_table:
            r_flat = " ".join(str(c) for row in r_table for c in row)
            p_flat = " ".join(str(c) for row in py_table for c in row)
            data_sim = similarity(strip_ws(r_flat), strip_ws(p_flat))
        status = PASS if rows_match and cols_match else FAIL
        r_dims = f"{len(r_table)}x{r_cols}"
        p_dims = f"{len(py_table)}x{p_cols}"
        record(pdf, "table/lines", status, rt, py_time,
               f"rust={r_dims} py={p_dims} content_sim={data_sim:.1%}")

    # --- strategy: text --------------------------------------------------
    rt2, r_out2, _, rc2 = timed_run(rust_cmd("table", pdf, "1", "text"))
    if rc2 != 0:
        # A crashed run must not be mistaken for "no table found", which
        # could otherwise be recorded as a PASS ("both null").
        record(pdf, "table/text", FAIL, rt2, 0, f"rust exit code {rc2}")
        return
    t0 = time.perf_counter()
    with pdfplumber.open(pdf_path) as p2:
        py_table2 = p2.pages[0].extract_table({"vertical_strategy": "text", "horizontal_strategy": "text"})
        py_time2 = time.perf_counter() - t0
    r_table2 = json.loads(r_out2)
    if r_table2 is None and py_table2 is None:
        record(pdf, "table/text", PASS, rt2, py_time2, "both null")
    elif r_table2 is None or py_table2 is None:
        record(pdf, "table/text", FAIL, rt2, py_time2,
               f"rust={'null' if r_table2 is None else 'found'} py={'null' if py_table2 is None else 'found'}")
    else:
        rows_match = len(r_table2) == len(py_table2)
        status = PASS if rows_match else FAIL
        record(pdf, "table/text", status, rt2, py_time2,
               f"rust={len(r_table2)}rows py={len(py_table2)}rows")
def test_svg(pdf):
    """Check that the Rust ``svg`` command writes a plausible SVG for page 1.

    The output passes when it contains both ``<svg`` and ``viewBox``.  There
    is no Python-side counterpart, so the Python time is recorded as 0.
    """
    # Reserve a path only; the file is closed at once so the CLI can write it.
    with tempfile.NamedTemporaryFile(suffix=".svg", delete=False) as tmp:
        svg_path = tmp.name
    # Everything after creation sits inside try/finally so the temp file is
    # removed on the failure path as well as on success.
    try:
        rt, _, r_err, rc = timed_run(rust_cmd("svg", pdf, "1", svg_path))
        if rc != 0:
            record(pdf, "svg", FAIL, rt, 0, f"rust exit code {rc}: {r_err}")
            return
        with open(svg_path, encoding="utf-8") as svg_file:
            svg_content = svg_file.read()
        has_svg_tag = "<svg" in svg_content
        has_viewbox = "viewBox" in svg_content
        size = len(svg_content)
        status = PASS if has_svg_tag and has_viewbox else FAIL
        record(pdf, "svg", status, rt, 0,
               f"size={size}bytes has_svg={has_svg_tag} has_viewbox={has_viewbox}")
    finally:
        os.unlink(svg_path)
def test_multipage(pdf):
    """Compare extracted text on pages 1 and 2 of a multi-page document.

    Nothing is recorded for documents with fewer than two pages.  Each page
    passes when the similarity ratio between the Rust and pdfplumber text
    exceeds 0.95.
    """
    # ``with`` closes the PDF on every exit path, including an exception
    # raised part-way through the page loop.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        if len(p.pages) < 2:
            return
        # At least two pages exist here, so pages 1 and 2 are always valid.
        for pg in (1, 2):
            rt, r_out, _, rc = timed_run(rust_cmd("text", pdf, str(pg)))
            if rc != 0:
                record(pdf, f"text/page{pg}", FAIL, rt, 0, f"exit code {rc}")
                continue
            t0 = time.perf_counter()
            py_text = p.pages[pg - 1].extract_text() or ""
            py_time = time.perf_counter() - t0
            sim = similarity(r_out.rstrip("\n"), py_text)
            status = PASS if sim > 0.95 else FAIL
            record(pdf, f"text/page{pg}", status, rt, py_time, f"sim={sim:.1%}")
def test_crop(pdf):
    """Time pdfplumber text extraction on the top-left quarter of page 1.

    This is a Python-only measurement: no Rust command is run and the Rust
    time is recorded as 0.  The status is PASS when the cropped region
    yields any text and SKIP otherwise.
    """
    # ``with`` guarantees the PDF is closed even if cropping raises.
    with pdfplumber.open(os.path.join(PDF_DIR, pdf)) as p:
        page = p.pages[0]
        bbox = (0, 0, float(page.width) / 2, float(page.height) / 2)
        t0 = time.perf_counter()
        cropped = page.crop(bbox)
        py_text = cropped.extract_text() or ""
        py_time = time.perf_counter() - t0
    record(pdf, "crop/top-left", PASS if py_text else SKIP, 0, py_time,
           f"py_text_len={len(py_text)} (library-only, no CLI equivalent)")
def main():
    """Run every check against every available test PDF and report results.

    Exits with status 1 if the Rust binary is missing.  Otherwise prints a
    per-check table followed by pass/fail/skip totals and aggregate timings,
    then writes all records to ``results_cli.json`` in BENCH_DIR.
    """
    if not os.path.exists(RUST_BIN):
        print(f"ERROR: {RUST_BIN} not found. Run: cargo build --release")
        sys.exit(1)
    # test_crop is part of the suite; it was previously defined but never
    # run, leaving the announced category count one higher than the list.
    test_fns = [test_info, test_text, test_words, test_search,
                test_objects, test_links, test_table, test_svg,
                test_multipage, test_crop]
    available_pdfs = [f for f in TEST_PDFS if os.path.exists(os.path.join(PDF_DIR, f))]
    print(f"CLI Benchmark: pdfsink-rs vs pdfplumber {pdfplumber.__version__}")
    # The count is derived from the list so the banner cannot drift from it.
    print(f"Testing {len(available_pdfs)} PDFs x {len(test_fns)} command categories")
    print("=" * 90)
    for pdf in available_pdfs:
        print(f"\n  {pdf}:")
        for test_fn in test_fns:
            # Top-level boundary: one broken check must not abort the run, so
            # any exception is converted into a FAIL record.
            try:
                test_fn(pdf)
            except Exception as e:
                # Include the exception type; str() of e.g. a KeyError alone
                # is just the missing key and is hard to interpret.
                record(pdf, test_fn.__name__.replace("test_", ""), FAIL, 0, 0,
                       f"{type(e).__name__}: {e}")
    print("\n")
    print("=" * 90)
    print(f"  {'PDF':<22} {'Command':<20} {'Status':>6} {'Rust':>10} {'Python':>10} {'Speedup':>8}  Detail")
    print("-" * 90)
    pass_count = fail_count = skip_count = 0
    total_rust = total_py = 0
    for r in results:
        s = r["status"]
        if s == PASS:
            pass_count += 1
        elif s == FAIL:
            fail_count += 1
        else:
            skip_count += 1
        # A time of 0 means "not measured"; it is shown as "-" and left out
        # of the aggregate totals.
        has_rust = r["rust_time"] > 0
        has_py = r["py_time"] > 0
        if has_rust:
            total_rust += r["rust_time"]
        if has_py:
            total_py += r["py_time"]
        rt_str = fmt_time(r["rust_time"]) if has_rust else "-"
        pt_str = fmt_time(r["py_time"]) if has_py else "-"
        sp_str = speedup_str(r["rust_time"], r["py_time"]) if has_rust and has_py else "-"
        # Keep the table readable: only the first 50 characters are printed.
        detail = r["detail"][:50] if r["detail"] else ""
        print(f"  {r['pdf']:<22} {r['command']:<20} {s:>6} {rt_str:>10} {pt_str:>10} {sp_str:>8}  {detail}")
    print("-" * 90)
    print(f"  TOTAL: {pass_count} PASS, {fail_count} FAIL, {skip_count} SKIP")
    print(f"  Aggregate time: Rust={fmt_time(total_rust)}, Python={fmt_time(total_py)}, Speedup={speedup_str(total_rust, total_py)}")
    print()
    out_path = os.path.join(BENCH_DIR, "results_cli.json")
    # Explicit encoding keeps the output independent of the platform locale.
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    print(f"  Results written to {out_path}")
if __name__ == "__main__":
    main()