1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# Bench: default source.tuning (profile + batch_size); optional per-export `tuning:` overrides.
# Most exports share source tuning; differences isolate duration / peak RSS / DB connections.
# Example: `bench_content_p4_balanced` sets `tuning.profile: balanced` to compare vs fast in one file.
#
# Run all exports at once:
# --parallel-exports → threads (faster spawn; peak RSS is shared across jobs in one process)
# --parallel-export-processes → one OS process per export (meaningful peak RSS per summary line)
# YAML: `parallel_exports: true` / `parallel_export_processes: true` (same rules; ignored with `--export`).
#
# rivet run --config dev/bench_chunked_p4.yaml --export bench_content_p4
# rivet run --config dev/bench_chunked_p4.yaml --export bench_content_p4_serial
# rivet run --config dev/bench_chunked_p4.yaml --export bench_content_p4_fatchunk
# rivet run --config dev/bench_chunked_p4.yaml --export bench_content_p4_no_meta
#
# Expected: serial → fewer connections, often lower peak RSS, slower wall time.
# fatchunk → fewer chunk round-trips (4 files vs 20 for 200k rows).
# no_meta → less CPU/RSS (no row_hash / exported_at per row).
# Shared Postgres connection for all exports. `tuning` here is the default;
# an export may override it with its own `tuning:` mapping.
# NOTE(review): credentials are inline in the URL — presumably a throwaway
# local bench database; confirm before reusing this file elsewhere.
source:
  type: postgres
  url: "postgresql://rivet:rivet@localhost:5432/rivet"
  tuning:
    profile: fast
    batch_size: 1000
exports:
# Baseline: parallel 10, chunk_size 10000, both meta columns enabled.
# Each other export in this file changes one setting relative to this entry.
- name: bench_content_p4
  query: "SELECT id, title, body, raw_html, tags, author_name, category, status, created_at FROM content_items"
  mode: chunked
  chunk_column: id
  chunk_size: 10000
  chunk_checkpoint: true
  parallel: 10
  format: parquet
  meta_columns:
    exported_at: true
    row_hash: true
  destination:
    type: local
    path: ./dev/output/bench
# Same query + checkpoint as bench_content_p4, but parallel: 1 —
# compare peak RSS / duration against bench_content_p4.
- name: bench_content_p4_serial
  query: "SELECT id, title, body, raw_html, tags, author_name, category, status, created_at FROM content_items"
  mode: chunked
  chunk_column: id
  chunk_size: 10000
  chunk_checkpoint: true
  parallel: 1  # the only setting that differs from bench_content_p4
  format: parquet
  meta_columns:
    exported_at: true
    row_hash: true
  destination:
    type: local
    path: ./dev/output/bench
# Fewer, larger BETWEEN windows (4 chunks for ~200k dense ids vs 20).
- name: bench_content_p4_fatchunk
  query: "SELECT id, title, body, raw_html, tags, author_name, category, status, created_at FROM content_items"
  mode: chunked
  chunk_column: id
  chunk_size: 50000  # the only setting that differs from bench_content_p4 (10000)
  chunk_checkpoint: true
  parallel: 10
  format: parquet
  meta_columns:
    exported_at: true
    row_hash: true
  destination:
    type: local
    path: ./dev/output/bench
# No enrichment — isolates xxHash + timestamp column cost.
- name: bench_content_p4_no_meta
  query: "SELECT id, title, body, raw_html, tags, author_name, category, status, created_at FROM content_items"
  mode: chunked
  chunk_column: id
  chunk_size: 10000
  chunk_checkpoint: true
  parallel: 10
  format: parquet
  # Both meta columns off; everything else matches bench_content_p4.
  meta_columns:
    exported_at: false
    row_hash: false
  destination:
    type: local
    path: ./dev/output/bench
# Same as bench_content_p4 but with the balanced profile
# (source stays fast + batch 1000 for the other exports).
- name: bench_content_p4_balanced
  query: "SELECT id, title, body, raw_html, tags, author_name, category, status, created_at FROM content_items"
  mode: chunked
  chunk_column: id
  chunk_size: 10000
  chunk_checkpoint: true
  parallel: 10
  format: parquet
  # Per-export override of source.tuning; only `profile` is set here.
  tuning:
    profile: balanced
  meta_columns:
    exported_at: true
    row_hash: true
  destination:
    type: local
    path: ./dev/output/bench