rgwml 2.0.0

Typed, local-first tabular data library with columnar in-memory storage.
Documentation
#!/usr/bin/env python3

import sys
import time


def main() -> int:
    """Time three pandas operations on CSV inputs and print the results.

    Reads three paths from ``sys.argv`` (base, left and right CSV files),
    loads them with pandas, then times a row filter and a group-by on the
    base table and an inner join of left with right on ``id``.

    For each operation one ``<name>_ms=<int>`` line (elapsed wall time,
    truncated to whole milliseconds) and one ``<name>_rows=<int>`` line
    (row count of the result) is written to stdout. CSV loading is not
    part of any timed section.

    Returns:
        0 on success, 1 when the argument count is wrong, 2 when pandas
        cannot be imported.
    """
    argv = sys.argv
    if len(argv) != 4:
        print("usage: bench_pandas_ops.py <base_csv> <left_csv> <right_csv>", file=sys.stderr)
        return 1

    base_path, left_path, right_path = argv[1:4]

    # Imported lazily so a missing pandas yields a dedicated exit code
    # instead of a traceback at module import time.
    try:
        import pandas as pd
    except ImportError as exc:
        print(f"pandas import failed: {exc}", file=sys.stderr)
        return 2

    def elapsed_ms(since: float) -> int:
        """Return whole milliseconds elapsed since the *since* timestamp."""
        return int((time.perf_counter() - since) * 1000)

    base_df = pd.read_csv(base_path)
    left_df = pd.read_csv(left_path)
    right_df = pd.read_csv(right_path)

    # --- filter: active rows with revenue above 5000 in "segment_0*" ---
    t0 = time.perf_counter()
    keep = (
        (base_df["active"] == True)  # noqa: E712 - elementwise comparison
        & (base_df["revenue"] > 5000.0)
        & (base_df["segment"].str.startswith("segment_0"))
    )
    filtered = base_df[keep]
    filter_ms = elapsed_ms(t0)

    # --- group-by: per-segment row count, revenue sum and revenue mean ---
    t0 = time.perf_counter()
    named_aggs = {
        "rows": ("revenue", "size"),
        "revenue_sum": ("revenue", "sum"),
        "revenue_mean": ("revenue", "mean"),
    }
    per_segment = base_df.groupby("segment", sort=False)
    grouped = per_segment.agg(**named_aggs).reset_index()
    group_ms = elapsed_ms(t0)

    # --- join: inner merge of left and right on the "id" column ---
    t0 = time.perf_counter()
    joined = pd.merge(left_df, right_df, how="inner", on="id")
    join_ms = elapsed_ms(t0)

    report = (
        f"filter_ms={filter_ms}",
        f"filter_rows={len(filtered)}",
        f"group_ms={group_ms}",
        f"group_rows={len(grouped)}",
        f"join_ms={join_ms}",
        f"join_rows={len(joined)}",
    )
    for line in report:
        print(line)
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())