---
# Rivet Example: PostgreSQL → Date-Chunked Export → Local
#
# Use case: initial load of a large orders/events table where the natural
# partition boundary is time, not a numeric ID.
# chunk_by_days splits the table into calendar windows
# (e.g. one chunk per year) using >= / < date comparisons, which
# works correctly for TIMESTAMP columns and avoids the unix-epoch
# arithmetic problem common in JDBC-based tools.
#
# When to prefer date chunking over numeric chunking:
# - Your table has no dense numeric PK, or IDs are UUIDs
# - You want even partitions by time rather than by row count
# - The source DB performs better on date-range predicates than on integer BETWEEN ranges
#
# Run: rivet run -c examples/pg_date_chunked_local.yaml --validate
# The database the exports read from (PostgreSQL for this example).
source:
  type: postgres
  # url_env carries the name DATABASE_URL rather than a literal URL --
  # presumably the connection string is read from that environment
  # variable at run time; confirm against Rivet's config reference.
  url_env: DATABASE_URL
  # Tuning values declared at the source level.
  # NOTE(review): nesting under `source` is reconstructed from key order;
  # verify against Rivet's config reference.
  tuning:
    profile: balanced
    batch_size: 10000

# One list entry per named export. Each entry carries its own query,
# chunking settings, output format and destination.
exports:
  - name: orders_by_year
    query: >
      SELECT id, user_id, product, quantity, price, status, ordered_at
      FROM orders
    mode: chunked
    # chunk_column must be a DATE or TIMESTAMP column when chunk_by_days is set.
    # Rivet queries MIN/MAX as text, parses the dates, then generates
    # non-overlapping windows: ordered_at >= 'YYYY-MM-DD' AND ordered_at < 'YYYY-MM-DD'
    chunk_column: ordered_at
    chunk_by_days: 365  # one chunk per ~year
    format: parquet
    compression: zstd
    # Postgres exposes numeric(precision, scale) via the catalog, but only
    # for plain `SELECT ... FROM <table>` queries. In chunked mode Rivet
    # wraps the user query and the catalog lookup loses the qualifier, so
    # `price` (numeric) must be declared explicitly here. The simpler
    # `pg_incremental_local.yaml` does not need this because Rivet reads
    # precision directly from the un-wrapped query metadata.
    columns:
      price: decimal(10,2)
    destination:
      type: local
      path: ./output/orders

  - name: events_by_month
    query: >
      SELECT id, user_id, event_type, ip_address, created_at
      FROM events
    mode: chunked
    chunk_column: created_at
    chunk_by_days: 30  # one chunk per ~month
    parallel: 4  # process 4 months concurrently
    # Enable checkpoint to resume if the export is interrupted mid-run
    chunk_checkpoint: true
    chunk_max_attempts: 3
    format: parquet
    compression: zstd
    destination:
      type: local
      path: ./output/events
    # Tuning for this export only.
    # NOTE(review): presumably takes precedence over the source-level
    # batch_size of 10000 for this export -- confirm.
    tuning:
      batch_size: 5000  # smaller batches for wide rows