rgwml 2.0.0

Typed, local-first tabular data library with columnar in-memory storage.
Documentation
#!/usr/bin/env python3

import resource
import sys
import time


def read_status_bytes(key: str) -> int:
    """Return the value of a ``/proc/self/status`` entry, scaled to bytes.

    The file is scanned top to bottom for lines that start with *key*
    (for example ``"VmRSS:"``). The second whitespace-separated field of
    the first such line that has one is parsed as an integer and
    multiplied by 1024, i.e. the figure is treated as kibibytes.

    Args:
        key: Prefix identifying the status line, including any trailing
            colon the caller wants matched.

    Returns:
        The parsed value multiplied by 1024.

    Raises:
        RuntimeError: If no line starting with *key* carries a value field.
        ValueError: If the value field is not a valid integer.
    """
    with open("/proc/self/status", "r", encoding="utf-8") as status_file:
        candidates = (row.split() for row in status_file if row.startswith(key))
        for fields in candidates:
            # A matching line without a value field is skipped, not an error.
            if len(fields) < 2:
                continue
            return int(fields[1]) * 1024
    raise RuntimeError(f"status entry {key!r} was not present")


def main() -> int:
    """Load a CSV with pandas and report memory and timing figures.

    Expects exactly one command-line argument, the CSV path. Resident and
    peak memory are sampled before and after ``pd.read_csv`` and the
    results are written to stdout as ``key=value`` lines.

    Returns:
        0 on success, 1 when the argument count is wrong, 2 when pandas
        cannot be imported.
    """
    argv = sys.argv
    if len(argv) != 2:
        print("usage: bench_pandas_memory.py <csv_path>", file=sys.stderr)
        return 1

    try:
        import pandas as pd
    except ImportError as exc:
        print(f"pandas import failed: {exc}", file=sys.stderr)
        return 2

    csv_path = argv[1]

    def peak_bytes() -> int:
        # Take the larger of the two peak readings: the VmHWM status entry
        # and getrusage's ru_maxrss scaled by 1024.
        high_water = read_status_bytes("VmHWM:")
        max_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024
        return max(high_water, max_rss)

    # Baselines are taken after the pandas import so that the deltas
    # reflect only the cost of loading the CSV.
    rss_before = read_status_bytes("VmRSS:")
    peak_before = peak_bytes()

    t0 = time.perf_counter()
    frame = pd.read_csv(csv_path)
    t1 = time.perf_counter()
    elapsed_ms = int((t1 - t0) * 1000)

    rss_bytes = read_status_bytes("VmRSS:")
    peak_rss_bytes = peak_bytes()
    payload_bytes = int(frame.memory_usage(deep=True).sum())

    # Deltas are clamped at zero so a shrinking RSS never prints a negative.
    rss_growth = max(0, rss_bytes - rss_before)
    peak_growth = max(0, peak_rss_bytes - peak_before)

    report = (
        "mode=pandas",
        f"rows={len(frame.index)}",
        f"cols={len(frame.columns)}",
        f"rss_bytes={rss_bytes}",
        f"rss_delta_bytes={rss_growth}",
        f"peak_rss_bytes={peak_rss_bytes}",
        f"peak_rss_delta_bytes={peak_growth}",
        f"elapsed_ms={elapsed_ms}",
        f"payload_bytes={payload_bytes}",
    )
    for entry in report:
        print(entry)
    return 0


# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())