import sys
import time
def _elapsed_ms(start: float) -> int:
    """Return the whole milliseconds elapsed since *start*.

    *start* is a previous ``time.perf_counter()`` reading; the fractional
    part of the millisecond count is truncated by ``int()``.
    """
    return int((time.perf_counter() - start) * 1000)


def main() -> int:
    """Time a pandas filter, group-by and inner join, printing the results.

    Expects exactly three command-line arguments (read from ``sys.argv``):
    the base CSV, the left CSV and the right CSV. The base CSV must provide
    the ``active``, ``revenue`` and ``segment`` columns; the left and right
    CSVs must both provide an ``id`` column.

    The CSV files are loaded before any timer starts, so the reported
    timings cover only the in-memory operations.

    Output is written to stdout as ``key=value`` lines: ``filter_ms``,
    ``filter_rows``, ``group_ms``, ``group_rows``, ``join_ms``, ``join_rows``.

    Returns:
        0 on success, 1 when the argument count is wrong, 2 when pandas
        cannot be imported. Errors raised while reading or processing the
        CSV files are not caught here and propagate to the caller.
    """
    if len(sys.argv) != 4:
        print("usage: bench_pandas_ops.py <base_csv> <left_csv> <right_csv>", file=sys.stderr)
        return 1
    base_csv, left_csv, right_csv = sys.argv[1:4]

    # Imported lazily so that a missing pandas install is reported with a
    # dedicated exit code instead of an import-time traceback.
    try:
        import pandas as pd
    except ImportError as exc:
        print(f"pandas import failed: {exc}", file=sys.stderr)
        return 2

    # Load all inputs up front; I/O is deliberately outside the timed regions.
    base = pd.read_csv(base_csv)
    left = pd.read_csv(left_csv)
    right = pd.read_csv(right_csv)

    # --- filter: three-way boolean mask over the base frame ---
    start = time.perf_counter()
    filtered = base[
        # Element-wise comparison that builds a boolean mask; this is not a
        # plain Python truthiness test, so `== True` is kept as written.
        (base["active"] == True)  # noqa: E712
        & (base["revenue"] > 5000.0)
        & (base["segment"].str.startswith("segment_0"))
    ]
    filter_ms = _elapsed_ms(start)

    # --- group-by: per-segment row count, revenue sum and revenue mean ---
    start = time.perf_counter()
    grouped = (
        base.groupby("segment", sort=False)
        .agg(
            rows=("revenue", "size"),
            revenue_sum=("revenue", "sum"),
            revenue_mean=("revenue", "mean"),
        )
        .reset_index()
    )
    group_ms = _elapsed_ms(start)

    # --- join: inner merge of left and right on the shared "id" column ---
    start = time.perf_counter()
    joined = left.merge(right, how="inner", on="id")
    join_ms = _elapsed_ms(start)

    print(f"filter_ms={filter_ms}")
    print(f"filter_rows={len(filtered.index)}")
    print(f"group_ms={group_ms}")
    print(f"group_rows={len(grouped.index)}")
    print(f"join_ms={join_ms}")
    print(f"join_rows={len(joined.index)}")
    return 0
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())