import os
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tabulate import tabulate
# Root directory under which the results CSV and the reports folder live.
BENCHMARK_DIR = "benchmark_data"
# CSV of benchmark measurements; read by load_data().
RESULTS_FILE = os.path.join(BENCHMARK_DIR, "benchmark_results.csv")
# Directory that receives the generated PNG charts and CSV reports.
OUTPUT_DIR = os.path.join(BENCHMARK_DIR, "reports")
def ensure_dirs():
    """Create the report output directory, tolerating one that already exists."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
def load_data():
    """Read the benchmark results CSV and add the derived size columns.

    If ``RESULTS_FILE`` does not exist, an error message is printed and the
    process exits with status 1.

    Two columns are added to the loaded frame:

    * ``size_name``: the second underscore-separated field of the ``log``
      column, cut at its first dot (a value shaped like ``x_10k.log``
      yields ``'10k'``).
    * ``size_numeric``: the line count for ``size_name``; names other than
      ``10k``, ``100k`` and ``1m`` map to NaN.

    Returns:
        The results frame sorted by ``tool`` and then ``size_numeric``.
    """
    if not os.path.exists(RESULTS_FILE):
        print(f"Error: Results file {RESULTS_FILE} not found")
        sys.exit(1)

    def size_token(log_name):
        # Second '_'-separated field, truncated at the first '.'.
        second_field = log_name.split('_')[1]
        return second_field.split('.')[0]

    line_counts = {'10k': 10_000, '100k': 100_000, '1m': 1_000_000}

    results = pd.read_csv(RESULTS_FILE)
    results['size_name'] = results['log'].apply(size_token)
    results['size_numeric'] = results['size_name'].map(line_counts)
    return results.sort_values(['tool', 'size_numeric'])
def generate_time_comparison(df):
    """Save a grouped bar chart of processing time per tool and file size.

    Args:
        df: Benchmark results with ``size_name``, ``tool`` and
            ``time_seconds`` columns. ``DataFrame.pivot`` raises
            ``ValueError`` if a (size_name, tool) pair occurs more than once.

    The chart uses a logarithmic y axis and is written to
    ``time_comparison.png`` inside ``OUTPUT_DIR``.
    """
    pivot_df = df.pivot(index='size_name', columns='tool', values='time_seconds')
    # pivot() sorts the index lexicographically; restore ascending size order.
    size_order = ['10k', '100k', '1m']
    pivot_df = pivot_df.reindex(size_order)

    fig, ax = plt.subplots(figsize=(12, 8))
    pivot_df.plot(kind='bar', ax=ax)

    ax.set_title('Processing Time by Tool and File Size', fontsize=16)
    ax.set_xlabel('File Size', fontsize=14)
    ax.set_ylabel('Time (seconds)', fontsize=14)
    ax.set_yscale('log')
    ax.legend(title='Tool', fontsize=12)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Annotate every bar with its value.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.2f', fontsize=8)

    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'time_comparison.png'), dpi=300)
    # Close explicitly so figures do not accumulate across report functions.
    plt.close(fig)
def generate_memory_comparison(df):
    """Save a grouped bar chart of memory usage per tool and file size.

    Args:
        df: Benchmark results with ``size_name``, ``tool`` and ``memory_mb``
            columns. ``DataFrame.pivot`` raises ``ValueError`` if a
            (size_name, tool) pair occurs more than once.

    The chart uses a logarithmic y axis and is written to
    ``memory_comparison.png`` inside ``OUTPUT_DIR``.
    """
    pivot_df = df.pivot(index='size_name', columns='tool', values='memory_mb')
    # pivot() sorts the index lexicographically; restore ascending size order.
    size_order = ['10k', '100k', '1m']
    pivot_df = pivot_df.reindex(size_order)

    fig, ax = plt.subplots(figsize=(12, 8))
    pivot_df.plot(kind='bar', ax=ax)

    ax.set_title('Memory Usage by Tool and File Size', fontsize=16)
    ax.set_xlabel('File Size', fontsize=14)
    ax.set_ylabel('Memory (MB)', fontsize=14)
    ax.set_yscale('log')
    ax.legend(title='Tool', fontsize=12)
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)

    # Annotate every bar with its value.
    for container in ax.containers:
        ax.bar_label(container, fmt='%.1f', fontsize=8)

    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'memory_comparison.png'), dpi=300)
    # Close explicitly so figures do not accumulate across report functions.
    plt.close(fig)
def generate_scaling_analysis(df):
    """Plot time versus input size per tool and tabulate step-to-step scaling.

    Args:
        df: Benchmark results with ``tool``, ``size_name``, ``size_numeric``
            and ``time_seconds`` columns.

    Writes ``scaling_analysis.png`` (log-log line chart, one line per tool)
    and ``scaling_factors.csv`` to ``OUTPUT_DIR``, then prints the scaling
    table. For each pair of consecutive sizes of a tool, the scaling factor
    is the time ratio divided by the size ratio, so 1.0 means the time grew
    in proportion to the input size.
    """
    fig, ax = plt.subplots(figsize=(12, 8))
    tools = df['tool'].unique()
    marker_cycle = ['o', 's', '^', 'D', 'v', '<', '>', 'p', '*', 'h', 'H', '+', 'x', 'd', '|']

    for position, tool in enumerate(tools):
        tool_rows = df[df['tool'] == tool]
        ax.plot(
            tool_rows['size_numeric'],
            tool_rows['time_seconds'],
            # Wrap around so any number of tools still gets a marker.
            marker=marker_cycle[position % len(marker_cycle)],
            label=tool,
            linewidth=2,
            markersize=8,
        )

    ax.set_title('Performance Scaling by File Size', fontsize=16)
    ax.set_xlabel('Number of Lines', fontsize=14)
    ax.set_ylabel('Processing Time (seconds)', fontsize=14)
    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.grid(True, which='both', linestyle='--', linewidth=0.5)
    ax.legend(title='Tool', fontsize=12)

    # Fixed ticks at the benchmark sizes; set after the log scale is applied.
    tick_labels = {10_000: '10k', 100_000: '100k', 1_000_000: '1m'}
    ax.set_xticks(list(tick_labels))
    ax.set_xticklabels(list(tick_labels.values()))

    plt.tight_layout()
    plt.savefig(os.path.join(OUTPUT_DIR, 'scaling_analysis.png'), dpi=300)
    plt.close()

    scaling_data = []
    for tool in tools:
        ordered = df[df['tool'] == tool].sort_values('size_numeric')
        # Walk consecutive (smaller, larger) pairs; a single-row tool yields none.
        for upper in range(1, len(ordered)):
            smaller = ordered.iloc[upper - 1]
            larger = ordered.iloc[upper]
            size_ratio = larger['size_numeric'] / smaller['size_numeric']
            time_ratio = larger['time_seconds'] / smaller['time_seconds']
            scaling_factor = time_ratio / size_ratio
            step = f"{smaller['size_name']} → {larger['size_name']}"
            scaling_data.append({
                'Tool': tool,
                'Size Change': step,
                'Time Increase': f"{time_ratio:.2f}x",
                'Scaling Factor': f"{scaling_factor:.3f}",
            })

    scaling_df = pd.DataFrame(scaling_data)
    scaling_df.to_csv(os.path.join(OUTPUT_DIR, 'scaling_factors.csv'), index=False)

    print("\nScaling Analysis (how processing time increases with file size):")
    print(tabulate(scaling_df, headers='keys', tablefmt='grid'))
def generate_rankings(df):
    """Rank the tools by time and memory for every file size.

    Args:
        df: Benchmark results with ``size_name``, ``tool``, ``time_seconds``
            and ``memory_mb`` columns.

    For each distinct ``size_name`` the tools are ranked separately on time
    and on memory (lower is better, rank 1 is best). The combined score is
    the mean of the two ranks. Each table is written to
    ``ranking_<size>.csv`` in ``OUTPUT_DIR`` and printed.
    """
    column_labels = {
        'tool': 'Tool',
        'time_seconds': 'Time (s)',
        'memory_mb': 'Memory (MB)',
        'time_rank': 'Time Rank',
        'memory_rank': 'Memory Rank',
        'combined_score': 'Combined Score',
    }

    for size in df['size_name'].unique():
        size_df = df[df['size_name'] == size].copy()
        size_df['time_rank'] = size_df['time_seconds'].rank()
        size_df['memory_rank'] = size_df['memory_mb'].rank()
        size_df['combined_score'] = (size_df['time_rank'] + size_df['memory_rank']) / 2
        size_df = size_df.sort_values('combined_score')

        # Take an explicit copy: assigning into a bare column subset triggers
        # pandas' SettingWithCopyWarning.
        result_df = size_df[list(column_labels)].copy()
        result_df.columns = list(column_labels.values())

        # Fixed-precision strings for display and CSV output.
        result_df['Time (s)'] = result_df['Time (s)'].map('{:.3f}'.format)
        result_df['Memory (MB)'] = result_df['Memory (MB)'].map('{:.2f}'.format)

        result_df.to_csv(os.path.join(OUTPUT_DIR, f'ranking_{size}.csv'), index=False)

        print(f"\nTool Rankings for {size} lines:")
        print(tabulate(result_df, headers='keys', tablefmt='grid'))
def generate_timber_specific_analysis(df):
    """Report throughput and memory efficiency for the ``timber`` tool.

    Args:
        df: Benchmark results with ``tool``, ``size_name``, ``size_numeric``,
            ``time_seconds`` and ``memory_mb`` columns.

    Prints a notice and returns early when ``df`` has no ``timber`` rows.
    Otherwise derives lines-per-second and lines-per-MB for each row, writes
    the table to ``timber_analysis.csv`` in ``OUTPUT_DIR`` and prints it.
    """
    timber_data = df[df['tool'] == 'timber'].copy()
    if timber_data.empty:
        print("No data for Timber found in the results")
        return

    timber_data['lines_per_second'] = timber_data['size_numeric'] / timber_data['time_seconds']
    timber_data['lines_per_mb'] = timber_data['size_numeric'] / timber_data['memory_mb']

    column_labels = {
        'size_name': 'File Size',
        'time_seconds': 'Time (s)',
        'memory_mb': 'Memory (MB)',
        'lines_per_second': 'Lines/Second',
        'lines_per_mb': 'Lines/MB',
    }
    # Take an explicit copy: assigning into a bare column subset triggers
    # pandas' SettingWithCopyWarning.
    timber_analysis = timber_data[list(column_labels)].copy()
    timber_analysis.columns = list(column_labels.values())

    # Fixed-precision strings for display and CSV output.
    number_formats = {
        'Time (s)': '{:.3f}',
        'Memory (MB)': '{:.2f}',
        'Lines/Second': '{:.0f}',
        'Lines/MB': '{:.0f}',
    }
    for column, template in number_formats.items():
        timber_analysis[column] = timber_analysis[column].map(template.format)

    timber_analysis.to_csv(os.path.join(OUTPUT_DIR, 'timber_analysis.csv'), index=False)

    print("\nTimber Performance Analysis:")
    print(tabulate(timber_analysis, headers='keys', tablefmt='grid'))
def _describe_speed(ratio):
    """Phrase a tool-time / timber-time ratio as a faster/slower statement."""
    if ratio < 1:
        return f"Timber is {1/ratio:.2f}x slower"
    return f"Timber is {ratio:.2f}x faster"


def _describe_memory(ratio):
    """Phrase a tool-memory / timber-memory ratio as a more/less statement."""
    if ratio < 1:
        return f"Timber uses {1/ratio:.2f}x more memory"
    return f"Timber uses {ratio:.2f}x less memory"


def generate_comparative_analysis(df):
    """Compare ``timber`` against every other tool at each file size.

    Args:
        df: Benchmark results with ``size_name``, ``tool``, ``time_seconds``
            and ``memory_mb`` columns.

    Ratios are computed as ``other tool / timber``, so a value above 1 means
    Timber is faster (or uses less memory). This matches the wording of the
    text comparisons and of the chart axis labels. Sizes with no ``timber``
    row are skipped; if nothing remains, a notice is printed and the function
    returns without writing anything.

    Writes ``comparative_analysis.csv`` and ``comparative_analysis.png`` to
    ``OUTPUT_DIR`` and prints a summary table.
    """
    size_names = df['size_name'].unique()
    comparative_data = []

    for size in size_names:
        size_df = df[df['size_name'] == size]
        is_timber = size_df['tool'] == 'timber'
        timber_rows = size_df[is_timber]
        if timber_rows.empty:
            continue

        # Use the first timber measurement for this size as the baseline.
        timber_time = timber_rows['time_seconds'].values[0]
        timber_memory = timber_rows['memory_mb'].values[0]

        for _, row in size_df[~is_timber].iterrows():
            comparative_data.append({
                'File Size': size,
                'Tool': row['tool'],
                # tool / timber: > 1 means Timber needed less time or memory.
                'Time Ratio': row['time_seconds'] / timber_time,
                'Memory Ratio': row['memory_mb'] / timber_memory,
                'Timber Time (s)': timber_time,
                'Tool Time (s)': row['time_seconds'],
                'Timber Memory (MB)': timber_memory,
                'Tool Memory (MB)': row['memory_mb'],
            })

    if not comparative_data:
        print("No comparative data available")
        return

    comparative_df = pd.DataFrame(comparative_data)
    comparative_df['Time Comparison'] = comparative_df['Time Ratio'].map(_describe_speed)
    comparative_df['Memory Comparison'] = comparative_df['Memory Ratio'].map(_describe_memory)

    comparative_df.to_csv(os.path.join(OUTPUT_DIR, 'comparative_analysis.csv'), index=False)

    print("\nComparative Analysis (Timber vs. Other Tools):")
    display_df = comparative_df[['File Size', 'Tool', 'Time Comparison', 'Memory Comparison']]
    print(tabulate(display_df, headers='keys', tablefmt='grid'))

    # pivot() sorts the size columns lexicographically ('100k' < '10k' < '1m');
    # restore the order in which the sizes appear in the data.
    size_order = list(comparative_df['File Size'].unique())

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

    pivot_time = comparative_df.pivot(index='Tool', columns='File Size', values='Time Ratio')
    pivot_time = pivot_time.reindex(columns=size_order)
    pivot_time.plot(kind='barh', ax=ax1)
    ax1.set_title('Speed Comparison (Time Ratio)', fontsize=16)
    ax1.set_xlabel('Ratio (>1 means Timber is faster)', fontsize=14)
    # Parity line: bars crossing it mark tools that Timber beats.
    ax1.axvline(x=1, color='r', linestyle='--')
    ax1.grid(True, which='both', linestyle='--', linewidth=0.5)

    pivot_memory = comparative_df.pivot(index='Tool', columns='File Size', values='Memory Ratio')
    pivot_memory = pivot_memory.reindex(columns=size_order)
    pivot_memory.plot(kind='barh', ax=ax2)
    ax2.set_title('Memory Efficiency Comparison (Memory Ratio)', fontsize=16)
    ax2.set_xlabel('Ratio (>1 means Timber uses less memory)', fontsize=14)
    ax2.axvline(x=1, color='r', linestyle='--')
    ax2.grid(True, which='both', linestyle='--', linewidth=0.5)

    fig.tight_layout()
    fig.savefig(os.path.join(OUTPUT_DIR, 'comparative_analysis.png'), dpi=300)
    plt.close(fig)
def main():
    """Load the benchmark results and run every report generator in order."""
    print("Analyzing benchmark results...")
    ensure_dirs()
    results = load_data()

    report_steps = (
        generate_time_comparison,
        generate_memory_comparison,
        generate_scaling_analysis,
        generate_rankings,
        generate_timber_specific_analysis,
        generate_comparative_analysis,
    )
    for build_report in report_steps:
        build_report(results)

    print(f"\nAnalysis complete. Reports saved to {OUTPUT_DIR}")
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()