import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
data_dir = Path("data/outputs")
base_depth_df = pd.read_csv(data_dir / "bench_results_base_depth.csv")
mate_fix_df = pd.read_csv(data_dir / "bench_results_base_depth_mate_fix.csv")
mate_fix_new_df = pd.read_csv(data_dir / "bench_results_base_depth_mate_fix_new_v_old.csv")
output_dir = Path("outputs")
output_dir.mkdir(exist_ok=True)
def process_data(df):
df = df.copy()
df['tool'] = df['command'].apply(lambda x: x.split()[0])
df['mean_minutes'] = df['mean'] / 60 return df
base_depth_df = process_data(base_depth_df)
mate_fix_df = process_data(mate_fix_df)
mate_fix_new_df = process_data(mate_fix_new_df)
perbase_std = base_depth_df[base_depth_df['tool'] == 'perbase']['mean_minutes'].values[0]
sambamba_std = base_depth_df[base_depth_df['tool'] == 'sambamba']['mean_minutes'].values[0]
sambamba_mf = mate_fix_new_df[mate_fix_new_df['command'].str.contains('sambamba')]['mean_minutes'].values[0]
mate_fix_methods = []
mate_fix_runtimes = []
for _, row in mate_fix_new_df.iterrows():
if 'perbase' in row['command'] and '30X OLD' not in row['command']:
parts = row['command'].split()
if len(parts) >= 4:
method = ' '.join(parts[3:]) method = method.replace('30X ', '') mate_fix_methods.append(method)
mate_fix_runtimes.append(row['mean_minutes'])
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
best_mate_fix_runtime = min(mate_fix_runtimes)
best_mate_fix_method = mate_fix_methods[np.argmin(mate_fix_runtimes)]
modes = ['Standard', f'Mate-fix\n({best_mate_fix_method})']
x = np.arange(len(modes))
width = 0.35
bars1 = ax1.bar(x - width/2, [perbase_std, best_mate_fix_runtime], width,
label='perbase', color='#1f77b4')
bars2 = ax1.bar(x + width/2, [sambamba_std, sambamba_mf], width,
label='sambamba', color='#ff7f0e')
ax1.set_ylabel('Runtime (minutes)')
ax1.set_xlabel('Mode')
ax1.set_title('perbase vs sambamba: base-depth Performance')
ax1.set_xticks(x)
ax1.set_xticklabels(modes)
ax1.legend()
sorted_indices = np.argsort(mate_fix_runtimes)
sorted_methods = [mate_fix_methods[i] for i in sorted_indices]
sorted_runtimes = [mate_fix_runtimes[i] for i in sorted_indices]
display_methods = []
for method in sorted_methods:
if len(method) > 20:
if 'BaseQualMapQual' in method:
method = method.replace('BaseQualMapQual', 'BQ→MQ→')
elif 'MapQualBaseQual' in method:
method = method.replace('MapQualBaseQual', 'MQ→BQ→')
display_methods.append(method)
else:
display_methods.append(method)
y_pos = np.arange(len(sorted_methods))
bars = ax2.barh(y_pos, sorted_runtimes, color='#2ca02c', alpha=0.7)
ax2.set_yticks(y_pos)
ax2.set_yticklabels(display_methods, fontsize=9)
ax2.set_xlabel('Runtime (minutes)')
ax2.set_title('perbase Mate-fix Method Performance Comparison')
ax2.grid(axis='x', alpha=0.3)
for i, (bar, runtime) in enumerate(zip(bars, sorted_runtimes)):
ax2.text(runtime + 0.5, bar.get_y() + bar.get_height()/2,
f'{runtime:.1f}m', ha='left', va='center', fontsize=8)
ax2.axvline(x=perbase_std, color='#1f77b4', linestyle='--', alpha=0.7,
label=f'Standard mode ({perbase_std:.1f}m)')
ax2.legend()
plt.tight_layout()
plt.savefig(output_dir / 'benchmark_comprehensive.png', dpi=300, bbox_inches='tight')
plt.savefig(output_dir / 'benchmark_comprehensive.pdf', bbox_inches='tight')
fig2, ax = plt.subplots(1, 1, figsize=(8, 6))
x = np.arange(2) width = 0.35
bars1 = ax.bar(x - width/2, [perbase_std, best_mate_fix_runtime], width,
label='perbase', color='#1f77b4')
bars2 = ax.bar(x + width/2, [sambamba_std, sambamba_mf], width,
label='sambamba', color='#ff7f0e')
ax.set_ylabel('Runtime (minutes)')
ax.set_xlabel('Mode')
ax.set_title('perbase vs sambamba: base-depth Performance')
ax.set_xticks(x)
ax.set_xticklabels(['Standard', 'Mate-fix'])
ax.legend()
plt.tight_layout()
plt.savefig(output_dir / 'benchmark_comparison.png', dpi=300, bbox_inches='tight')
plt.savefig(output_dir / 'benchmark_comparison.pdf', bbox_inches='tight')
plt.close()
speedup_std = sambamba_std / perbase_std
speedup_mf = sambamba_mf / best_mate_fix_runtime
print("Plots generated successfully!")
print("\nPerformance Summary:")
print(f"Standard mode:")
print(f" perbase: {perbase_std:.1f} minutes")
print(f" sambamba: {sambamba_std:.1f} minutes")
print(f" Speedup: {speedup_std:.1f}x faster")
print(f"\nMate-fix mode (best method: {best_mate_fix_method}):")
print(f" perbase: {best_mate_fix_runtime:.1f} minutes")
print(f" sambamba: {sambamba_mf:.1f} minutes")
print(f" Speedup: {speedup_mf:.1f}x faster")
print(f"\nAll mate-fix methods performance:")
for method, runtime in zip(sorted_methods, sorted_runtimes):
print(f" {method}: {runtime:.1f} minutes")