from __future__ import annotations
import argparse
import gc
import json
import subprocess
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable
# Directory under the user's home where the UD_English-EWT repository is
# cloned; ensure_ud_english_ewt() uses it as the `git clone` target.
UD_ENGLISH_EWT_DIR = Path.home() / ".rustling" / "ud-english-ewt"
@dataclass
class BenchmarkResult:
    """Timing outcome of one benchmark task for one implementation.

    ``ops_per_second`` is not a constructor argument; it is derived from
    ``iterations`` and ``time_seconds`` right after initialisation and stays
    at ``0.0`` when ``time_seconds`` is not positive.
    """

    # Benchmark task identifier.
    name: str
    # Label of the implementation that was timed.
    implementation: str
    # Elapsed time covering all iterations, in seconds.
    time_seconds: float
    # Number of timed iterations.
    iterations: int
    # Optional free-form note shown alongside the timing.
    detail: str = ""
    # Derived throughput: iterations / time_seconds.
    ops_per_second: float = field(default=0.0, init=False)

    def __post_init__(self) -> None:
        """Fill in ``ops_per_second`` when the elapsed time is positive."""
        if not self.time_seconds > 0:
            # Keep the 0.0 default rather than dividing by zero.
            return
        self.ops_per_second = self.iterations / self.time_seconds
def ensure_ud_english_ewt() -> Path:
    """Make sure the UD_English-EWT data is available locally.

    Clones the UD_English-EWT repository (shallow, depth 1) into
    ``UD_ENGLISH_EWT_DIR`` when that path does not exist yet, or when it is
    an empty directory.

    Returns:
        ``UD_ENGLISH_EWT_DIR``.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status (``check=True``).
    """
    target = UD_ENGLISH_EWT_DIR

    # An empty directory is not a usable checkout: it is what a failed or
    # interrupted earlier download can leave behind. Treating it as "already
    # downloaded" would make every later run fail with no .conllu files.
    is_empty_dir = target.is_dir() and not any(target.iterdir())

    if not target.exists() or is_empty_dir:
        print("Downloading UD_English-EWT data...")
        # Only create the parent; `git clone` creates the target itself (and
        # also accepts an existing empty directory), so a failed clone does
        # not leave behind a directory that this function created.
        target.parent.mkdir(parents=True, exist_ok=True)
        subprocess.run(
            [
                "git",
                "clone",
                "--depth",
                "1",
                "https://github.com/UniversalDependencies/UD_English-EWT.git",
                str(target),
            ],
            check=True,
        )
    return target
def collect_conllu_files(ud_dir: Path) -> list[Path]:
    """Return the ``*.conllu`` files directly inside ``ud_dir``, sorted.

    Args:
        ud_dir: Directory to search (non-recursive).

    Returns:
        Sorted list of matching paths.

    Raises:
        SystemExit: With exit code 1, after printing an error message, when
            no matching file is found.
    """
    found = list(ud_dir.glob("*.conllu"))
    found.sort()
    if found:
        return found

    print("Error: No .conllu files found in UD_English-EWT data.")
    sys.exit(1)
def time_function(
    func: Callable[[], Any],
    iterations: int = 5,
    warmup: int = 1,
) -> float:
    """Measure the total wall-clock time of repeated calls to ``func``.

    Args:
        func: Zero-argument callable to time; its return value is ignored.
        iterations: Number of timed calls.
        warmup: Number of untimed calls made before timing starts.

    Returns:
        Sum of the per-call durations (``time.perf_counter`` seconds) over
        all timed iterations.
    """
    remaining_warmup = warmup
    while remaining_warmup > 0:
        func()
        remaining_warmup -= 1

    # Collect garbage produced by the warmup calls before timing begins.
    gc.collect()

    def timed_call() -> float:
        # Only the call itself sits between the two clock reads.
        began = time.perf_counter()
        func()
        return time.perf_counter() - began

    return sum(timed_call() for _ in range(iterations))
def print_result(result: BenchmarkResult) -> None:
    """Print a short report for one benchmark result.

    Emits the implementation name, the total time with the iteration count,
    and the detail line when ``result.detail`` is non-empty.
    """
    report_lines = [
        f" {result.implementation}:",
        f" Total time: {result.time_seconds:.4f}s ({result.iterations} iterations)",
    ]
    if result.detail:
        report_lines.append(f" Detail: {result.detail}")

    for report_line in report_lines:
        print(report_line)
def print_comparison(
    rustling_result: BenchmarkResult, python_result: BenchmarkResult
) -> None:
    """Print the speedup of the rustling result relative to the Python one.

    The speedup is ``python_result.time_seconds`` divided by
    ``rustling_result.time_seconds``; it is reported as infinity when the
    rustling time is not positive.
    """
    rustling_time = rustling_result.time_seconds
    ratio = (
        python_result.time_seconds / rustling_time
        if rustling_time > 0
        else float("inf")
    )
    print(f"\n Speedup: {ratio:.1f}x faster")
def _time_implementations(
    task: str,
    runners: list[tuple[str, Callable[[], Any] | None]],
    iterations: int,
    verbose: bool,
    detail: str = "",
) -> dict[str, Any]:
    """Time one benchmark task for each implementation and report it.

    Args:
        task: Task name stored in each ``BenchmarkResult``.
        runners: ``(implementation name, callable)`` pairs in reporting
            order; the callable is ``None`` when that implementation is not
            available. Expected names are ``"rustling"`` and ``"conllu"``.
        iterations: Number of timed iterations per implementation.
        verbose: Whether to print per-implementation results and, when both
            implementations ran, the comparison.
        detail: Optional note stored in each ``BenchmarkResult``.

    Returns:
        Mapping from implementation name to the result's attribute dict, or
        to ``None`` for an unavailable implementation.
    """
    measured: dict[str, Any] = {}
    for impl_name, runner in runners:
        if runner is None:
            measured[impl_name] = None
            continue
        result = BenchmarkResult(
            name=task,
            implementation=impl_name,
            time_seconds=time_function(runner, iterations=iterations),
            iterations=iterations,
            detail=detail,
        )
        measured[impl_name] = result
        if verbose:
            print_result(result)

    rustling_result = measured.get("rustling")
    conllu_result = measured.get("conllu")
    if verbose and rustling_result is not None and conllu_result is not None:
        print_comparison(rustling_result, conllu_result)

    return {
        impl_name: vars(result) if result is not None else None
        for impl_name, result in measured.items()
    }


def run_benchmarks(
    quick: bool = False,
    verbose: bool = True,
) -> dict[str, Any]:
    """Benchmark CoNLL-U parsing with rustling.conllu and the conllu package.

    Two tasks are timed on the UD_English-EWT ``.conllu`` files: parsing
    in-memory strings (``from_strs``) and parsing files from disk
    (``from_files``). Each implementation is skipped if it cannot be
    imported.

    Args:
        quick: Use 3 timed iterations instead of 10.
        verbose: Print progress, the banner, per-implementation results and
            comparisons. When ``False`` only the fatal "neither
            implementation" error is printed by this function.

    Returns:
        A dict with two keys: ``"benchmarks"`` maps each task name to
        ``{"rustling": <result dict or None>, "conllu": <result dict or
        None>}``; ``"speedups"`` maps each task name to conllu time divided
        by rustling time, for tasks where both ran and the rustling time is
        positive.

    Raises:
        SystemExit: With exit code 1 when neither implementation can be
            imported (or when a helper finds no data files).
    """
    ud_dir = ensure_ud_english_ewt()
    conllu_files = collect_conllu_files(ud_dir)
    conllu_paths_str = [str(p) for p in conllu_files]

    # Both implementations are optional; import lazily and keep None for
    # whichever one is missing.
    rustling_conllu = None
    conllu_pkg = None
    try:
        from rustling.conllu import CoNLLU
    except ImportError as e:
        if verbose:
            print(f"rustling.conllu not available: {e}")
    else:
        rustling_conllu = CoNLLU
        if verbose:
            print("rustling.conllu loaded successfully")
    try:
        import conllu as _conllu_pkg
    except ImportError as e:
        if verbose:
            print(f"conllu not available: {e}")
    else:
        conllu_pkg = _conllu_pkg
        if verbose:
            print("conllu loaded successfully")

    if rustling_conllu is None and conllu_pkg is None:
        print("\nError: Neither implementation is available.")
        sys.exit(1)

    iterations = 3 if quick else 10
    all_results: dict[str, Any] = {"benchmarks": {}}

    # Read everything up front so the from_strs task times parsing only.
    conllu_strs = [f.read_text(encoding="utf-8") for f in conllu_files]

    # The banner is informational output, so it honours `verbose` like the
    # rest of the report (previously it was printed even in quiet mode).
    if verbose:
        print("\n" + "=" * 60)
        print("CONLLU BENCHMARK: Rustling (Rust) vs conllu (Python)")
        print(f"Dataset: UD_English-EWT ({len(conllu_files)} .conllu files)")
        print("=" * 60)

    def rustling_from_strs() -> None:
        rustling_conllu.from_strs(conllu_strs)

    def conllu_from_strs() -> None:
        for text in conllu_strs:
            conllu_pkg.parse(text)

    def rustling_from_files() -> None:
        rustling_conllu.from_files(conllu_paths_str)

    def conllu_from_files() -> None:
        for path_str in conllu_paths_str:
            with open(path_str, encoding="utf-8") as handle:
                conllu_pkg.parse(handle.read())

    have_rustling = rustling_conllu is not None
    have_conllu = conllu_pkg is not None

    if verbose:
        print(f"\nfrom_strs (parsing {len(conllu_strs)} in-memory strings):")
    all_results["benchmarks"]["from_strs"] = _time_implementations(
        "from_strs",
        [
            ("rustling", rustling_from_strs if have_rustling else None),
            ("conllu", conllu_from_strs if have_conllu else None),
        ],
        iterations,
        verbose,
    )

    if verbose:
        print(f"\nfrom_files (parsing {len(conllu_files)} files from disk):")
    all_results["benchmarks"]["from_files"] = _time_implementations(
        "from_files",
        [
            ("rustling", rustling_from_files if have_rustling else None),
            ("conllu", conllu_from_files if have_conllu else None),
        ],
        iterations,
        verbose,
        detail=f"{len(conllu_files)} files",
    )

    # Speedup = conllu time / rustling time, only where both ran and the
    # rustling time is positive.
    speedups: dict[str, float] = {}
    for task in ["from_strs", "from_files"]:
        bench = all_results["benchmarks"].get(task, {})
        r = bench.get("rustling")
        p = bench.get("conllu")
        if r and p and r["time_seconds"] > 0:
            speedups[task] = p["time_seconds"] / r["time_seconds"]
    all_results["speedups"] = speedups
    return all_results
def main() -> None:
    """Command-line entry point: parse flags, run benchmarks, export JSON.

    Flags:
        --quick: run with fewer iterations.
        --export FILE: write the results dict to FILE as indented JSON.
        --quiet: pass ``verbose=False`` to ``run_benchmarks``.
    """
    cli = argparse.ArgumentParser(
        description="Benchmark rustling.conllu vs conllu for CoNLL-U parsing"
    )
    cli.add_argument(
        "--quick",
        action="store_true",
        help="Run quick benchmark with fewer iterations",
    )
    cli.add_argument(
        "--export",
        type=str,
        metavar="FILE",
        help="Export results to JSON file",
    )
    cli.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress output (useful with --export)",
    )
    options = cli.parse_args()

    report = run_benchmarks(quick=options.quick, verbose=not options.quiet)

    # Nothing more to do unless an export target was requested.
    if not options.export:
        return

    destination = Path(options.export)
    with open(destination, "w") as sink:
        json.dump(report, sink, indent=2)
    print(f"\nResults exported to: {destination}")
# Run the benchmark CLI only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()