from __future__ import annotations
import argparse
import gc
import json
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Callable
sys.path.insert(0, str(Path(__file__).parent))
from common.data import load_hkcancor, wordseg_data
@dataclass
class BenchmarkResult:
    """Timing result for one implementation of one segmentation algorithm.

    ``sentences_per_second`` is derived from the other fields in
    ``__post_init__``, so callers never pass it explicitly.
    """

    name: str                 # algorithm name, e.g. "LongestStringMatching"
    implementation: str       # which implementation ran: "rustling" or "wordseg"
    num_sentences: int        # sentences segmented per iteration
    time_seconds: float       # total wall-clock time across all iterations
    iterations: int           # number of timed repetitions
    sentences_per_second: float = field(init=False)

    def __post_init__(self) -> None:
        # Guard against a zero elapsed time (possible with coarse clocks and
        # tiny workloads) instead of raising ZeroDivisionError.
        total_sentences = self.num_sentences * self.iterations
        if self.time_seconds > 0:
            self.sentences_per_second = total_sentences / self.time_seconds
        else:
            self.sentences_per_second = float("inf")
def time_function(
    func: Callable[[], Any],
    iterations: int = 5,
    warmup: int = 1,
) -> float:
    """Return the total wall-clock seconds for *iterations* calls to *func*.

    Performs *warmup* untimed calls first, then a garbage collection, so the
    timed runs start from a settled heap.
    """
    for _ in range(warmup):
        func()
    gc.collect()
    elapsed = 0.0
    for _ in range(iterations):
        begin = time.perf_counter()
        func()
        elapsed += time.perf_counter() - begin
    return elapsed
def benchmark_longest_string_matching(
    rustling_cls: type | None,
    wordseg_cls: type | None,
    training_data: list[tuple[str, ...]],
    test_sentences: list[str],
    max_word_length: int,
    iterations: int = 5,
) -> tuple[BenchmarkResult | None, BenchmarkResult | None]:
    """Time LongestStringMatching for both implementations.

    Either class may be None (implementation unavailable); the matching slot
    in the returned (rustling, wordseg) pair is then None.
    """
    outcomes: list[BenchmarkResult | None] = []
    for impl_name, impl_cls in (("rustling", rustling_cls), ("wordseg", wordseg_cls)):
        if impl_cls is None:
            outcomes.append(None)
            continue
        segmenter = impl_cls(max_word_length=max_word_length)
        segmenter.fit(training_data)

        # Bind the model as a default arg and materialize the prediction so
        # lazy iterators are fully evaluated inside the timed region.
        def run_predict(model=segmenter) -> None:
            list(model.predict(test_sentences))

        elapsed = time_function(run_predict, iterations=iterations)
        outcomes.append(
            BenchmarkResult(
                name="LongestStringMatching",
                implementation=impl_name,
                num_sentences=len(test_sentences),
                time_seconds=elapsed,
                iterations=iterations,
            )
        )
    rustling_result, wordseg_result = outcomes
    return rustling_result, wordseg_result
def print_result(result: BenchmarkResult) -> None:
    """Print one benchmark result's timing and throughput, indented."""
    report = [
        f" {result.implementation}:",
        f" Total time: {result.time_seconds:.4f}s ({result.iterations} iterations)",
        f" Sentences/second: {result.sentences_per_second:,.0f}",
    ]
    for line in report:
        print(line)
def print_comparison(
    rustling_result: BenchmarkResult, wordseg_result: BenchmarkResult
) -> None:
    """Print how many times faster rustling is than the pure-Python wordseg.

    A zero rustling time (possible with coarse clocks and tiny workloads) is
    reported as an infinite speedup instead of raising ZeroDivisionError.
    """
    if rustling_result.time_seconds > 0:
        speedup = wordseg_result.time_seconds / rustling_result.time_seconds
    else:
        speedup = float("inf")
    # Bug fix: the emoji was mojibake ("âš¡" — U+26A1's UTF-8 bytes decoded as
    # Latin-1); restore the intended ⚡ to match the file's other emoji output.
    print(f"\n ⚡ Speedup: {speedup:.1f}x faster")
def run_benchmarks(
    quick: bool = False,
    verbose: bool = True,
) -> dict[str, Any]:
    """Load both implementations, run the benchmark, and collect results.

    Exits with status 1 when neither implementation can be imported.  Returns
    a dict with raw per-benchmark numbers under "benchmarks" and computed
    ratios under "speedups".
    """
    rustling_lsm: type | None = None
    wordseg_lsm: type | None = None

    # Import each implementation independently; a missing one is reported but
    # only aborts the run if both are missing.
    try:
        from rustling.wordseg import LongestStringMatching as RustlingLSM

        rustling_lsm = RustlingLSM
        if verbose:
            print("✓ rustling.wordseg loaded successfully")
    except ImportError as e:
        if verbose:
            print(f"✗ rustling.wordseg not available: {e}")
    try:
        from wordseg import LongestStringMatching as WordsegLSM

        wordseg_lsm = WordsegLSM
        if verbose:
            print("✓ wordseg (pure Python) loaded successfully")
    except ImportError as e:
        if verbose:
            print(f"✗ wordseg (pure Python) not available: {e}")

    if rustling_lsm is None and wordseg_lsm is None:
        print("\nError: Neither implementation is available.")
        sys.exit(1)

    if verbose:
        print("\nLoading HKCanCor corpus...")
    corpus = load_hkcancor()
    training_data, test_sentences = wordseg_data(corpus)

    # Quick mode trims the corpus and runs fewer timed repetitions.
    if quick:
        training_data = training_data[:500]
        test_sentences = test_sentences[:100]
    iterations = 3 if quick else 5

    # The longest training word bounds the matcher's search window.
    max_word_length = max(len(word) for sent in training_data for word in sent)

    if verbose:
        print(f"Training sentences: {len(training_data)}")
        print(f"Test sentences: {len(test_sentences)}")
        print(f"Max word length: {max_word_length}")

    all_results: dict[str, Any] = {"benchmarks": {}}
    print("\n" + "=" * 60)
    print("WORDSEG BENCHMARK: Rustling (Rust) vs wordseg (Python)")
    print("=" * 60)

    if verbose:
        print("\n📊 LongestStringMatching:")
    lsm_rustling, lsm_wordseg = benchmark_longest_string_matching(
        rustling_lsm,
        wordseg_lsm,
        training_data,
        test_sentences,
        max_word_length=max_word_length,
        iterations=iterations,
    )
    if lsm_rustling and verbose:
        print_result(lsm_rustling)
    if lsm_wordseg and verbose:
        print_result(lsm_wordseg)
    if lsm_rustling and lsm_wordseg and verbose:
        print_comparison(lsm_rustling, lsm_wordseg)
    all_results["benchmarks"]["LongestStringMatching"] = {
        "rustling": lsm_rustling.__dict__ if lsm_rustling else None,
        "wordseg": lsm_wordseg.__dict__ if lsm_wordseg else None,
    }

    # Derive speedups from the serialized numbers so the exported JSON and
    # the console report always agree.
    speedups: dict[str, float] = {}
    for algo in ["LongestStringMatching"]:
        bench = all_results["benchmarks"].get(algo, {})
        rust_stats = bench.get("rustling")
        py_stats = bench.get("wordseg")
        if rust_stats and py_stats and rust_stats["time_seconds"] > 0:
            speedups[algo] = py_stats["time_seconds"] / rust_stats["time_seconds"]
    all_results["speedups"] = speedups
    return all_results
def main() -> None:
    """Parse CLI arguments, run the benchmarks, and optionally export JSON."""
    parser = argparse.ArgumentParser(
        description="Benchmark rustling.wordseg vs pure Python wordseg"
    )
    parser.add_argument(
        "--quick",
        action="store_true",
        help="Run quick benchmark with smaller data",
    )
    parser.add_argument(
        "--export",
        type=str,
        metavar="FILE",
        help="Export results to JSON file",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Suppress output (useful with --export)",
    )
    args = parser.parse_args()
    results = run_benchmarks(
        quick=args.quick,
        verbose=not args.quiet,
    )
    if args.export:
        export_path = Path(args.export)
        # Explicit encoding keeps the exported file stable across platforms
        # (the default encoding is locale-dependent on some systems).
        with open(export_path, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2)
        # Bug fix: honor --quiet for the confirmation message as well, per
        # the flag's documented purpose ("Suppress output").
        if not args.quiet:
            print(f"\nResults exported to: {export_path}")


if __name__ == "__main__":
    main()