import resource
import sys
import time
def read_status_bytes(key: str) -> int:
    """Return a ``/proc/self/status`` entry converted to bytes.

    The file is scanned top to bottom. The first line that starts with
    *key* and has at least two whitespace-separated fields wins: its
    second field is parsed as an integer and multiplied by 1024 (the
    ``Vm*`` entries are listed in kB).

    Args:
        key: Line prefix to look for, e.g. ``"VmRSS:"``.

    Returns:
        The entry's numeric value multiplied by 1024.

    Raises:
        RuntimeError: If no usable line starting with *key* is found.
    """
    with open("/proc/self/status", "r", encoding="utf-8") as status_file:
        for entry in status_file:
            if not entry.startswith(key):
                continue
            fields = entry.split()
            if len(fields) < 2:
                # A matching line without a value is skipped, not fatal.
                continue
            return int(fields[1]) * 1024
    raise RuntimeError(f"status entry {key!r} was not present")
def main() -> int:
    """Load one CSV with pandas and print timing and memory figures.

    Expects exactly one command-line argument, the CSV path. On success
    one ``key=value`` line per metric is written to stdout.

    Returns:
        0 on success, 1 when the argument count is wrong (a usage line is
        printed to stderr), 2 when pandas cannot be imported.
    """
    argv = sys.argv
    if len(argv) != 2:
        print("usage: bench_pandas_memory.py <csv_path>", file=sys.stderr)
        return 1
    csv_path = argv[1]

    try:
        import pandas as pd
    except ImportError as exc:
        print(f"pandas import failed: {exc}", file=sys.stderr)
        return 2

    def sample_peak() -> int:
        # Take the larger of the two peak-RSS readings: VmHWM from
        # /proc/self/status and ru_maxrss scaled by 1024.
        status_peak = read_status_bytes("VmHWM:")
        rusage_peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss * 1024
        return max(status_peak, rusage_peak)

    # Baselines are sampled after the pandas import, so the deltas printed
    # below are relative to a process that already has pandas loaded.
    baseline_rss = read_status_bytes("VmRSS:")
    baseline_peak = sample_peak()

    started = time.perf_counter()
    frame = pd.read_csv(csv_path)
    elapsed_ms = int((time.perf_counter() - started) * 1000)

    rss_bytes = read_status_bytes("VmRSS:")
    peak_rss_bytes = sample_peak()

    # Computed only after the RSS figures above have been sampled.
    payload_bytes = int(frame.memory_usage(deep=True).sum())

    # Deltas are clamped so they never print as negative.
    rss_delta = max(0, rss_bytes - baseline_rss)
    peak_delta = max(0, peak_rss_bytes - baseline_peak)

    print("mode=pandas")
    print(f"rows={len(frame.index)}")
    print(f"cols={len(frame.columns)}")
    print(f"rss_bytes={rss_bytes}")
    print(f"rss_delta_bytes={rss_delta}")
    print(f"peak_rss_bytes={peak_rss_bytes}")
    print(f"peak_rss_delta_bytes={peak_delta}")
    print(f"elapsed_ms={elapsed_ms}")
    print(f"payload_bytes={payload_bytes}")
    return 0
if __name__ == "__main__":
    # Run the benchmark and use main()'s return value as the exit status.
    exit_status = main()
    raise SystemExit(exit_status)