import csv
from pathlib import Path
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter, LogLocator
import numpy as np
def format_throughput(value, pos):
    """Render a tick value with a G / M / K magnitude suffix.

    The ``(value, pos)`` signature is what ``FuncFormatter`` passes to its
    callback; ``pos`` is accepted but not used.

    Values of at least 1e9 are shown in units of 1e9 with one decimal and a
    " G" suffix. Values of at least 1e6 or 1e3 are shown with no decimals and
    a " M" or " K" suffix. Anything smaller is shown as a whole number with
    no suffix.
    """
    # (threshold which is also the divisor, decimals, suffix), largest first.
    scales = ((1e9, 1, "G"), (1e6, 0, "M"), (1e3, 0, "K"))
    for divisor, decimals, suffix in scales:
        if value >= divisor:
            return f"{value / divisor:.{decimals}f} {suffix}"
    return f"{value:.0f}"
def parse_csv_results(csv_file: Path) -> dict:
    """Load benchmark timings from a CSV file into a nested mapping.

    The file must start with a header row that names the columns
    ``operation``, ``method``, ``seq_length``, ``time_low_ns``,
    ``time_median_ns`` and ``time_high_ns``.

    Args:
        csv_file: Location of the CSV file to read.

    Returns:
        A nested mapping indexed as ``results[operation][method][seq_length]``
        (``seq_length`` converted to ``int``). Each leaf is a dict with the
        float entries ``"low"``, ``"median"`` and ``"high"``. When the same
        (operation, method, seq_length) triple occurs more than once, the
        last row wins.

    Raises:
        KeyError: If one of the required columns is missing from the header.
        ValueError: If a numeric column holds text that cannot be converted.
    """
    results = defaultdict(lambda: defaultdict(dict))
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are silently rewritten by the text layer.
    with open(csv_file, "r", newline="", encoding="utf-8") as f:
        for row in csv.DictReader(f):
            operation = row["operation"]
            method = row["method"]
            seq_length = int(row["seq_length"])
            results[operation][method][seq_length] = {
                "low": float(row["time_low_ns"]),
                "median": float(row["time_median_ns"]),
                "high": float(row["time_high_ns"]),
            }
    return results
def plot_benchmarks(results: dict, output_file: str = "benchmark_plot.png"):
    """Plot median timings with low/high error bars, one panel per operation.

    A 4x1 figure is created with one panel each for "encode", "decode",
    "roundtrip" and "reverse_complement", in that order. Within a panel only
    the methods listed in the local style table are drawn; any other method
    present in ``results`` is ignored. Both axes use a log scale.

    Args:
        results: Nested mapping ``results[operation][method][seq_length]``
            whose leaves provide ``"low"``, ``"median"`` and ``"high"``.
        output_file: Path the figure is saved to.

    Side effects:
        Saves the figure, prints the save location and calls ``plt.show()``.
    """
    operations = ["encode", "decode", "roundtrip", "reverse_complement"]
    method_styles = {
        "simd_2bit": {"color": "#2ecc71", "marker": "o", "label": "SIMD 2-bit"},
        "simd_4bit": {"color": "#3498db", "marker": "s", "label": "SIMD 4-bit"},
        "scalar_2bit": {"color": "#e74c3c", "marker": "^", "label": "Scalar 2-bit"},
        "scalar_4bit": {"color": "#f39c12", "marker": "D", "label": "Scalar 4-bit"},
        "simd": {"color": "#3498db", "marker": "s", "label": "SIMD (ASCII)"},
        "simd_high_level": {"color": "#3498db", "marker": "s", "label": "SIMD (ASCII)"},
        "simd_encoded": {"color": "#9b59b6", "marker": "p", "label": "SIMD (Encoded)"},
        "scalar": {"color": "#e74c3c", "marker": "^", "label": "Scalar"},
    }
    # Appearance shared by every error-bar series.
    series_kwargs = {
        "capsize": 4,
        "capthick": 1.5,
        "markersize": 8,
        "linewidth": 2,
        "alpha": 0.85,
    }
    # One entry per grid layer: (which, axis, alpha, linestyle, linewidth).
    grid_layers = (
        ("major", "y", 0.5, "-", 0.8),
        ("minor", "y", 0.2, "--", 0.5),
        ("major", "x", 0.3, "--", 0.5),
    )

    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle(
        "DNA Encoding/Decoding Benchmark Results",
        fontsize=18,
        fontweight="bold",
        y=0.98,
    )

    for ax, operation in zip(axes, operations):
        op_data = results.get(operation, {})
        if not op_data:
            # Leave the panel empty apart from a title saying so.
            ax.set_title(f"{operation.capitalize()} (no data)", fontsize=14)
            continue

        for method, style in method_styles.items():
            if method not in op_data:
                continue
            timings = op_data[method]
            lengths = sorted(timings)
            medians = [timings[n]["median"] for n in lengths]
            # errorbar takes distances from the centre, not absolute bounds.
            below = [timings[n]["median"] - timings[n]["low"] for n in lengths]
            above = [timings[n]["high"] - timings[n]["median"] for n in lengths]
            ax.errorbar(
                lengths,
                medians,
                yerr=[below, above],
                fmt=f"{style['marker']}-",
                color=style["color"],
                label=style["label"],
                **series_kwargs,
            )

        ax.set_xlabel("Sequence Length (bases)", fontsize=13)
        ax.set_ylabel("Time (ns)", fontsize=13)
        ax.set_title(
            operation.replace("_", " ").title(), fontsize=15, fontweight="bold"
        )
        ax.set_xscale("log", base=10)
        ax.set_yscale("log")
        ax.tick_params(axis="both", which="major", labelsize=11)
        ax.tick_params(axis="both", which="minor", labelsize=9)
        for which, axis, alpha, linestyle, linewidth in grid_layers:
            ax.grid(
                True,
                which=which,
                axis=axis,
                alpha=alpha,
                linestyle=linestyle,
                linewidth=linewidth,
            )
        ax.legend(loc="upper left", fontsize=11, framealpha=0.9)

    # Reserve the top 3% of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plt.savefig(output_file, dpi=200, bbox_inches="tight", facecolor="white")
    print(f"Plot saved to {output_file}")
    plt.show()
def plot_throughput(results: dict, output_file: str = "throughput_plot.png"):
    """Plot throughput (bases per second) with error bars, one panel per operation.

    Throughput is derived from the timings as ``seq_length / time_ns * 1e9``.
    A 4x1 figure is created with one panel each for "encode", "decode",
    "roundtrip" and "reverse_complement", in that order. Within a panel only
    the methods listed in the local style table are drawn. Both axes use a
    log scale and the y tick labels are produced by ``format_throughput``.

    Args:
        results: Nested mapping ``results[operation][method][seq_length]``
            whose leaves provide ``"low"``, ``"median"`` and ``"high"``
            timings in nanoseconds.
        output_file: Path the figure is saved to.

    Side effects:
        Saves the figure, prints the save location and calls ``plt.show()``.
    """
    operations = ["encode", "decode", "roundtrip", "reverse_complement"]
    titles = {
        "encode": "Encode",
        "decode": "Decode",
        "roundtrip": "Roundtrip (Total)",
        "reverse_complement": "Reverse Complement",
    }
    method_styles = {
        "simd_2bit": {"color": "#2ecc71", "marker": "o", "label": "SIMD 2-bit"},
        "simd_4bit": {"color": "#3498db", "marker": "s", "label": "SIMD 4-bit"},
        "scalar_2bit": {"color": "#e74c3c", "marker": "^", "label": "Scalar 2-bit"},
        "scalar_4bit": {"color": "#f39c12", "marker": "D", "label": "Scalar 4-bit"},
        "simd": {"color": "#3498db", "marker": "s", "label": "SIMD (ASCII)"},
        "simd_high_level": {"color": "#3498db", "marker": "s", "label": "SIMD (ASCII)"},
        "simd_encoded": {"color": "#9b59b6", "marker": "p", "label": "SIMD (Encoded)"},
        "scalar": {"color": "#e74c3c", "marker": "^", "label": "Scalar"},
    }
    fig, axes = plt.subplots(4, 1, figsize=(10, 18))
    fig.suptitle(
        "DNA Encoding/Decoding Throughput", fontsize=18, fontweight="bold", y=0.98
    )
    for ax, operation in zip(axes, operations):
        op_data = results.get(operation, {})
        if not op_data:
            ax.set_title(
                f"{titles.get(operation, operation.capitalize())} (no data)",
                fontsize=14,
            )
            continue
        for method, style in method_styles.items():
            if method not in op_data:
                continue
            method_data = op_data[method]
            # Throughput divides by each timing, so a zero timing would raise
            # ZeroDivisionError and a negative one cannot be placed on a log
            # axis. Such points are dropped instead of aborting the plot.
            seq_lengths = sorted(
                sl
                for sl, timing in method_data.items()
                if all(timing[key] > 0 for key in ("low", "median", "high"))
            )
            if not seq_lengths:
                continue
            throughputs = [sl / method_data[sl]["median"] * 1e9 for sl in seq_lengths]
            # The slowest time gives the lowest throughput and vice versa.
            throughput_low = [
                sl / method_data[sl]["high"] * 1e9 for sl in seq_lengths
            ]
            throughput_high = [
                sl / method_data[sl]["low"] * 1e9 for sl in seq_lengths
            ]
            # errorbar takes distances from the centre, not absolute bounds.
            yerr_low = [mid - low for mid, low in zip(throughputs, throughput_low)]
            yerr_high = [high - mid for mid, high in zip(throughputs, throughput_high)]
            ax.errorbar(
                seq_lengths,
                throughputs,
                yerr=[yerr_low, yerr_high],
                fmt=style["marker"] + "-",
                color=style["color"],
                label=style["label"],
                capsize=4,
                capthick=1.5,
                markersize=8,
                linewidth=2,
                alpha=0.85,
            )
        ax.set_xlabel("Sequence Length (bases)", fontsize=13)
        ax.set_ylabel("Throughput (bases/s)", fontsize=13)
        ax.set_title(
            f"{titles.get(operation, operation.capitalize())} Throughput",
            fontsize=15,
            fontweight="bold",
        )
        ax.set_xscale("log", base=10)
        ax.set_yscale("log")
        if operation == "reverse_complement":
            # Sparser minor ticks (2x and 5x only) with their labels blanked.
            ax.yaxis.set_major_locator(LogLocator(base=10, numticks=10))
            ax.yaxis.set_minor_locator(LogLocator(base=10, subs=[2, 5], numticks=10))
            ax.yaxis.set_minor_formatter(FuncFormatter(lambda x, p: ""))
        else:
            # Minor ticks at every multiple 2x..9x, each labelled.
            ax.yaxis.set_major_locator(LogLocator(base=10, numticks=15))
            ax.yaxis.set_minor_locator(
                LogLocator(base=10, subs=[2, 3, 4, 5, 6, 7, 8, 9], numticks=15)
            )
            ax.yaxis.set_minor_formatter(FuncFormatter(format_throughput))
        ax.yaxis.set_major_formatter(FuncFormatter(format_throughput))
        ax.tick_params(axis="both", which="major", labelsize=10)
        ax.tick_params(axis="both", which="minor", labelsize=7)
        ax.grid(True, which="major", axis="y", alpha=0.5, linestyle="-", linewidth=0.8)
        ax.grid(True, which="minor", axis="y", alpha=0.2, linestyle="--", linewidth=0.5)
        ax.grid(True, which="major", axis="x", alpha=0.3, linestyle="--", linewidth=0.5)
        ax.legend(loc="lower right", fontsize=11, framealpha=0.9)
        if operation == "roundtrip":
            # Fixed upper limit for the roundtrip panel.
            ax.set_ylim(top=4.0e9)
    # Reserve the top 3% of the figure for the suptitle.
    plt.tight_layout(rect=[0, 0, 1, 0.97])
    plt.savefig(output_file, dpi=200, bbox_inches="tight", facecolor="white")
    print(f"Throughput plot saved to {output_file}")
    plt.show()
def print_summary(results: dict):
    """Print a plain-text summary of the benchmark timings to stdout.

    After a banner, each of "encode", "decode", "roundtrip" and
    "reverse_complement" that has data gets its own section. Inside a
    section the sequence lengths are listed in ascending order, and for each
    length one line per method (sorted by name) shows the median followed by
    the ``[low - high]`` range in nanoseconds. Methods without a measurement
    at a given length are omitted from that length's listing.

    Args:
        results: Nested mapping ``results[operation][method][seq_length]``
            whose leaves provide ``"low"``, ``"median"`` and ``"high"``.
    """
    banner_rule = "=" * 80
    print("\n" + banner_rule)
    print("BENCHMARK SUMMARY")
    print(banner_rule)

    for operation in ("encode", "decode", "roundtrip", "reverse_complement"):
        op_data = results.get(operation, {})
        if not op_data:
            continue

        print(f"\n{operation.upper()}")
        print("-" * 60)

        methods = sorted(op_data)
        # Every sequence length measured by at least one method.
        every_length = set().union(*(op_data[name].keys() for name in methods))

        for seq_len in sorted(every_length):
            print(f"\n Sequence Length: {seq_len}")
            for method in methods:
                timings = op_data[method]
                if seq_len not in timings:
                    continue
                data = timings[seq_len]
                median, low, high = data["median"], data["low"], data["high"]
                print(f" {method:15s}: {median:10.2f} ns [{low:10.2f} - {high:10.2f}]")
def main():
    """Run the script: load the benchmark CSV, print a summary and draw both plots.

    The CSV is expected at ``artefacts/benchmark_data_optimised.csv`` next to
    the directory that contains this script; both plots are written into the
    same ``artefacts`` directory.

    Returns:
        0 on success, 1 when the CSV file is missing or yields no results.
    """
    artefacts_dir = Path(__file__).parent.parent / "artefacts"
    csv_file = artefacts_dir / "benchmark_data_optimised.csv"

    if not csv_file.exists():
        print(f"Error: CSV file not found at {csv_file}")
        print("Please run convert_benchmark_to_csv.py first")
        return 1

    print(f"Parsing benchmark results from {csv_file}")
    results = parse_csv_results(csv_file)
    if not results:
        print("No benchmark results found!")
        return 1

    print_summary(results)
    # Each plotting function receives the results and its own output path.
    plot_jobs = (
        (plot_benchmarks, "benchmark_plot.png"),
        (plot_throughput, "throughput_plot.png"),
    )
    for plot, filename in plot_jobs:
        plot(results, str(artefacts_dir / filename))
    return 0
if __name__ == "__main__":
    # Raise SystemExit directly instead of calling `exit`: that helper is
    # added by the `site` module for interactive sessions and is not
    # available when Python runs with -S.
    raise SystemExit(main())