import os
import sys
import time
import subprocess
import json
import statistics
from pathlib import Path
from tabulate import tabulate
import matplotlib.pyplot as plt
sys.path.insert(0, str(Path(__file__).parent.parent))
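
# Benchmark harness: compares crabrl's XBRL parsing speed against Arelle on a
# set of generated test files, prints a summary table, and saves a comparison chart.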

def benchmark_arelle(file_path, runs=3):
    """Time Arelle loading `file_path` in a fresh subprocess, `runs` times."""
    times = []
    for _ in range(runs):
        start = time.perf_counter()
        # Parse the file with Arelle in a separate Python process
        result = subprocess.run([
            sys.executable, "-c",
            f"""
import sys
sys.path.insert(0, 'venv/lib/python{sys.version_info.major}.{sys.version_info.minor}/site-packages')
from arelle import Cntlr
from arelle import ModelManager
# Suppress Arelle output
import logging
logging.getLogger("arelle").setLevel(logging.ERROR)
controller = Cntlr.Cntlr(logFileName=None)
controller.webCache.workOffline = True
modelManager = ModelManager.initialize(controller)
# Load and parse the XBRL file
modelXbrl = modelManager.load('{file_path}')
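# Report fact/context/unit counts back to the parent process via stdout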
if modelXbrl:
    facts = len(modelXbrl.facts)
    contexts = len(modelXbrl.contexts)
    units = len(modelXbrl.units)
    print(f"{{facts}},{{contexts}},{{units}}")
    modelXbrl.close()
"""
        ], capture_output=True, text=True, cwd=Path(__file__).parent)
        end = time.perf_counter()

        if result.returncode == 0 and result.stdout:
            times.append(end - start)
            # Report the parsed object counts once, on the first successful run
            if len(times) == 1:
                parts = result.stdout.strip().split(',')
                if len(parts) == 3:
                    print(f" Arelle found: {parts[0]} facts, {parts[1]} contexts, {parts[2]} units")
        else:
            print(f" Arelle error: {result.stderr}")

    if times:
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'stdev': statistics.stdev(times) if len(times) > 1 else 0,
            'min': min(times),
            'max': max(times),
            'runs': len(times)
        }
    return None


def benchmark_crabrl(file_path, runs=3):
    """Time the crabrl `benchmark_single` example on `file_path`, `runs` times."""
    times = []
    # Build the release example once up front so compilation isn't timed
    subprocess.run(["cargo", "build", "--release", "--example", "benchmark_single"],
                   capture_output=True, cwd=Path(__file__).parent.parent)
    for _ in range(runs):
        start = time.perf_counter()
        result = subprocess.run([
            "../target/release/examples/benchmark_single",
            file_path
        ], capture_output=True, text=True, cwd=Path(__file__).parent)
        end = time.perf_counter()

        if result.returncode == 0:
            times.append(end - start)
            # Show the parser's own output once, on the first successful run
            if len(times) == 1 and result.stdout:
                print(f" crabrl output: {result.stdout.strip()}")
        else:
            print(f" crabrl error: {result.stderr}")

    if times:
        return {
            'mean': statistics.mean(times),
            'median': statistics.median(times),
            'stdev': statistics.stdev(times) if len(times) > 1 else 0,
            'min': min(times),
            'max': max(times),
            'runs': len(times)
        }
    return None


def main():
    print("=" * 80)
    print("XBRL Parser Performance Comparison: crabrl vs Arelle")
    print("=" * 80)

    test_files = [
        ("Tiny (10 facts)", "../test_data/test_tiny.xbrl"),
        ("Small (100 facts)", "../test_data/test_small.xbrl"),
        ("Medium (1K facts)", "../test_data/test_medium.xbrl"),
        ("Large (10K facts)", "../test_data/test_large.xbrl"),
        ("Huge (100K facts)", "../test_data/test_huge.xbrl"),
    ]

    results = []
    for name, file_path in test_files:
        if not Path(file_path).exists():
            print(f"Skipping {name}: file not found")
            continue

        file_size_mb = Path(file_path).stat().st_size / (1024 * 1024)
        print(f"\nBenchmarking {name} ({file_size_mb:.2f} MB)...")

        print(" Running Arelle...")
        arelle_stats = benchmark_arelle(file_path, runs=5)

        print(" Running crabrl...")
        crabrl_stats = benchmark_crabrl(file_path, runs=5)

        if arelle_stats and crabrl_stats:
            speedup = arelle_stats['median'] / crabrl_stats['median']
            results.append({
                'File': name,
                'Size (MB)': f"{file_size_mb:.2f}",
                'Arelle (ms)': f"{arelle_stats['median']*1000:.1f}",
                'crabrl (ms)': f"{crabrl_stats['median']*1000:.1f}",
                'Speedup': f"{speedup:.1f}x",
                'arelle_raw': arelle_stats['median'],
                'crabrl_raw': crabrl_stats['median'],
            })

    print("\n" + "=" * 80)
    print("RESULTS SUMMARY")
    print("=" * 80)

    if results:
        # Drop the raw timing keys before tabulating the display columns
        table_data = [{k: v for k, v in r.items() if not k.endswith('_raw')} for r in results]
        print(tabulate(table_data, headers="keys", tablefmt="grid"))

        speedups = [r['arelle_raw'] / r['crabrl_raw'] for r in results]
        avg_speedup = statistics.mean(speedups)
        print(f"\nAverage speedup: {avg_speedup:.1f}x faster than Arelle")

        create_performance_chart(results)
    else:
        print("No results to display")


def create_performance_chart(results):
    """Save a two-panel bar chart: absolute parse times and crabrl's speedup factor."""
    labels = [r['File'].split('(')[0].strip() for r in results]
    arelle_times = [r['arelle_raw'] * 1000 for r in results]
    crabrl_times = [r['crabrl_raw'] * 1000 for r in results]

    x = range(len(labels))
    width = 0.35

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: side-by-side parse times in milliseconds
    ax1.bar([i - width/2 for i in x], arelle_times, width, label='Arelle', color='#FF6B6B')
    ax1.bar([i + width/2 for i in x], crabrl_times, width, label='crabrl', color='#4ECDC4')
    ax1.set_xlabel('File Size')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Parsing Time Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=45)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Right panel: speedup factor per file size
    speedups = [a / c for a, c in zip(arelle_times, crabrl_times)]
    ax2.bar(x, speedups, color='#95E77E')
    ax2.set_xlabel('File Size')
    ax2.set_ylabel('Speedup Factor')
    ax2.set_title('crabrl Speedup over Arelle')
    ax2.set_xticks(x)
    ax2.set_xticklabels(labels, rotation=45)
    ax2.grid(True, alpha=0.3)
    for i, v in enumerate(speedups):
        ax2.text(i, v + 0.5, f'{v:.1f}x', ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig('benchmark_results.png', dpi=150)
    print("\nPerformance chart saved to: benchmarks/benchmark_results.png")


if __name__ == "__main__":
    main()