crabrl 0.1.0

High-performance XBRL parser and validator
Documentation
#!/usr/bin/env python3
"""Compare performance between crabrl and Arelle."""

import os
import sys
import time
import subprocess
import json
import statistics
from pathlib import Path
from tabulate import tabulate
import matplotlib.pyplot as plt

# Put the directory one level above this script's directory at the front of
# sys.path so that modules located there are found first on import.
# NOTE(review): no project-local module is imported in the visible part of
# this file - confirm this path tweak is still required.
sys.path.insert(0, str(Path(__file__).parent.parent))

def benchmark_arelle(file_path, runs=3):
    """Benchmark Arelle parsing performance.

    Every run starts a fresh Python interpreter that imports Arelle, loads
    the file and prints the fact/context/unit counts.  The measured time is
    the wall-clock duration of that whole child process, so it includes
    interpreter start-up and the Arelle import.

    Args:
        file_path: Path of the XBRL file to load.  The child process runs
            with this script's directory as its working directory, so a
            relative path is interpreted relative to that directory.
        runs: Number of timed runs to attempt.

    Returns:
        A dict with ``mean``, ``median``, ``stdev``, ``min`` and ``max``
        (all in seconds) plus ``runs`` (the number of successful runs), or
        ``None`` when no run succeeded.
    """
    script_dir = Path(__file__).parent
    site_packages = (
        f"venv/lib/python{sys.version_info.major}."
        f"{sys.version_info.minor}/site-packages"
    )

    # The file path and site-packages path are handed to the child through
    # argv instead of being spliced into the source text: a path containing
    # a quote or a backslash would otherwise corrupt (or inject code into)
    # the generated script.
    child_script = """
import sys
sys.path.insert(0, sys.argv[1])
from arelle import Cntlr
from arelle import ModelManager

# Suppress Arelle output
import logging
logging.getLogger("arelle").setLevel(logging.ERROR)

controller = Cntlr.Cntlr(logFileName=None)
controller.webCache.workOffline = True
modelManager = ModelManager.initialize(controller)

# Load and parse the XBRL file
modelXbrl = modelManager.load(sys.argv[2])
if modelXbrl:
    facts = len(modelXbrl.facts)
    contexts = len(modelXbrl.contexts)
    units = len(modelXbrl.units)
    print(f"{facts},{contexts},{units}")
    modelXbrl.close()
"""
    command = [sys.executable, "-c", child_script, site_packages, str(file_path)]

    times = []
    for _ in range(runs):
        start = time.perf_counter()
        # Run Arelle in a subprocess to isolate memory between runs.
        result = subprocess.run(
            command, capture_output=True, text=True, cwd=script_dir
        )
        end = time.perf_counter()

        # A run only counts when the child exited cleanly AND reported
        # counts; an empty stdout means Arelle returned no model.
        if result.returncode == 0 and result.stdout:
            times.append(end - start)
            if len(times) == 1:  # Print counts on first successful run
                parts = result.stdout.strip().split(',')
                if len(parts) == 3:
                    print(f"    Arelle found: {parts[0]} facts, {parts[1]} contexts, {parts[2]} units")
        else:
            detail = result.stderr.strip() or "no output produced (file could not be loaded)"
            print(f"    Arelle error: {detail}")

    if not times:
        return None

    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        # statistics.stdev needs at least two samples.
        'stdev': statistics.stdev(times) if len(times) > 1 else 0,
        'min': min(times),
        'max': max(times),
        'runs': len(times),
    }

def benchmark_crabrl(file_path, runs=3):
    """Benchmark crabrl parsing performance.

    Builds the ``benchmark_single`` cargo example in release mode, then
    times ``runs`` executions of the resulting binary.  The measured time is
    the wall-clock duration of the whole child process.

    Args:
        file_path: Path of the XBRL file handed to the benchmark binary.
            The binary runs with this script's directory as its working
            directory, so a relative path is interpreted relative to that
            directory.
        runs: Number of timed runs to attempt.

    Returns:
        A dict with ``mean``, ``median``, ``stdev``, ``min`` and ``max``
        (all in seconds) plus ``runs`` (the number of successful runs), or
        ``None`` when no run succeeded (including when the binary cannot be
        started at all).
    """
    script_dir = Path(__file__).resolve().parent
    project_root = script_dir.parent
    # Absolute path: how a relative executable is resolved against ``cwd``
    # is platform dependent.
    binary = project_root / "target" / "release" / "examples" / "benchmark_single"

    # Build the benchmark binary if needed.  A failed or impossible build is
    # reported but not fatal, because a previously built binary may exist.
    try:
        build = subprocess.run(
            ["cargo", "build", "--release", "--example", "benchmark_single"],
            capture_output=True, text=True, cwd=project_root,
        )
    except OSError as exc:
        print(f"    crabrl warning: could not run cargo: {exc}")
    else:
        if build.returncode != 0:
            print(f"    crabrl warning: cargo build failed: {build.stderr.strip()}")

    times = []
    for _ in range(runs):
        start = time.perf_counter()
        try:
            result = subprocess.run(
                [str(binary), str(file_path)],
                capture_output=True, text=True, cwd=script_dir,
            )
        except OSError as exc:
            # The binary is missing or not executable; retrying cannot help.
            print(f"    crabrl error: {exc}")
            break
        end = time.perf_counter()

        if result.returncode == 0:
            times.append(end - start)
            if len(times) == 1 and result.stdout:  # Print counts on first run
                print(f"    crabrl output: {result.stdout.strip()}")
        else:
            print(f"    crabrl error: {result.stderr}")

    if not times:
        return None

    return {
        'mean': statistics.mean(times),
        'median': statistics.median(times),
        # statistics.stdev needs at least two samples.
        'stdev': statistics.stdev(times) if len(times) > 1 else 0,
        'min': min(times),
        'max': max(times),
        'runs': len(times),
    }

def main():
    """Run comparative benchmarks.

    For every test file that exists, times Arelle and crabrl (5 runs each),
    prints a summary table of the median timings and the resulting speedup,
    reports the average speedup and finally renders the comparison chart.
    Files that are missing, or for which either parser produced no
    successful run, are left out of the summary.
    """
    print("=" * 80)
    print("XBRL Parser Performance Comparison: crabrl vs Arelle")
    print("=" * 80)

    # The benchmark subprocesses run with this script's directory as their
    # working directory, so the test files are anchored to the same
    # directory here.  Checking them against the caller's working directory
    # would disagree with what the parsers actually open.
    script_dir = Path(__file__).resolve().parent

    test_files = [
        ("Tiny (10 facts)", "../test_data/test_tiny.xbrl"),
        ("Small (100 facts)", "../test_data/test_small.xbrl"),
        ("Medium (1K facts)", "../test_data/test_medium.xbrl"),
        ("Large (10K facts)", "../test_data/test_large.xbrl"),
        ("Huge (100K facts)", "../test_data/test_huge.xbrl"),
    ]

    results = []

    for name, relative_path in test_files:
        data_file = (script_dir / relative_path).resolve()
        if not data_file.exists():
            print(f"Skipping {name}: file not found")
            continue

        # Absolute path, so both parsers open exactly the file checked above.
        file_path = str(data_file)
        file_size_mb = data_file.stat().st_size / (1024 * 1024)
        print(f"\nBenchmarking {name} ({file_size_mb:.2f} MB)...")

        # Benchmark Arelle
        print("  Running Arelle...")
        arelle_stats = benchmark_arelle(file_path, runs=5)

        # Benchmark crabrl
        print("  Running crabrl...")
        crabrl_stats = benchmark_crabrl(file_path, runs=5)

        # Only files both parsers handled can be compared.
        if arelle_stats and crabrl_stats:
            speedup = arelle_stats['median'] / crabrl_stats['median']
            results.append({
                'File': name,
                'Size (MB)': f"{file_size_mb:.2f}",
                'Arelle (ms)': f"{arelle_stats['median']*1000:.1f}",
                'crabrl (ms)': f"{crabrl_stats['median']*1000:.1f}",
                'Speedup': f"{speedup:.1f}x",
                # Unformatted medians (seconds); kept for the average and
                # the chart, hidden from the printed table.
                'arelle_raw': arelle_stats['median'],
                'crabrl_raw': crabrl_stats['median'],
            })

    # Print results table
    print("\n" + "=" * 80)
    print("RESULTS SUMMARY")
    print("=" * 80)

    if not results:
        print("No results to display")
        return

    table_data = [
        {k: v for k, v in r.items() if not k.endswith('_raw')}
        for r in results
    ]
    print(tabulate(table_data, headers="keys", tablefmt="grid"))

    # Calculate average speedup
    speedups = [r['arelle_raw'] / r['crabrl_raw'] for r in results]
    avg_speedup = statistics.mean(speedups)
    print(f"\nAverage speedup: {avg_speedup:.1f}x faster than Arelle")

    # Create performance chart
    create_performance_chart(results)

def create_performance_chart(results):
    """Create a performance comparison chart.

    Draws two side-by-side bar charts - absolute median parse times for
    both parsers, and the crabrl speedup factor per file - and writes them
    to ``benchmark_results.png`` in the directory containing this script.

    Args:
        results: List of dicts as built by ``main``; each must provide
            ``'File'`` (label, text before the first ``(`` is used) and
            ``'arelle_raw'`` / ``'crabrl_raw'`` (median times in seconds).
    """
    labels = [r['File'].split('(')[0].strip() for r in results]
    # Convert seconds to milliseconds for display.
    arelle_times = [r['arelle_raw'] * 1000 for r in results]
    crabrl_times = [r['crabrl_raw'] * 1000 for r in results]

    x = range(len(labels))
    width = 0.35

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    # Bar chart: the two series are shifted half a bar width to either side
    # of each tick so they sit next to each other.
    ax1.bar([i - width/2 for i in x], arelle_times, width, label='Arelle', color='#FF6B6B')
    ax1.bar([i + width/2 for i in x], crabrl_times, width, label='crabrl', color='#4ECDC4')
    ax1.set_xlabel('File Size')
    ax1.set_ylabel('Time (ms)')
    ax1.set_title('Parsing Time Comparison')
    ax1.set_xticks(x)
    ax1.set_xticklabels(labels, rotation=45)
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Speedup chart
    speedups = [a/c for a, c in zip(arelle_times, crabrl_times)]
    ax2.bar(x, speedups, color='#95E77E')
    ax2.set_xlabel('File Size')
    ax2.set_ylabel('Speedup Factor')
    ax2.set_title('crabrl Speedup over Arelle')
    ax2.set_xticks(x)
    ax2.set_xticklabels(labels, rotation=45)
    ax2.grid(True, alpha=0.3)

    # Add value labels on bars
    for i, v in enumerate(speedups):
        ax2.text(i, v + 0.5, f'{v:.1f}x', ha='center', va='bottom')

    # Save next to this script instead of the current working directory, and
    # report the path actually written rather than a hard-coded one.
    output_path = Path(__file__).resolve().parent / 'benchmark_results.png'
    plt.tight_layout()
    plt.savefig(output_path, dpi=150)
    # Release the figure so it does not linger in pyplot's global state.
    plt.close(fig)
    print(f"\nPerformance chart saved to: {output_path}")

# Run the benchmark comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()