Skip to main content

rivet/types/
rivet_type.rs

1//! Rivet's canonical internal type system.
2//!
//! See `rivet_roadmap.md`: Epic 14 (type safety), §5 ("Type Mapping Pipeline"),
//! and §6 ("Internal Type System"). Every database driver maps its native column
5//! metadata into a [`RivetType`] *first*, then a single function maps
6//! [`RivetType`] to `arrow::DataType`. This is the architectural fix for the
7//! status-quo `DB type → Arrow type` shortcut that silently degrades types
8//! to `Utf8` (roadmap §5 "incorrect pipeline").
9//!
10//! Three invariants enforced by this enum:
11//!
12//! 1. **No silent precision loss** — `Decimal { precision, scale }` carries
13//!    the declared precision/scale; constructing one without a known
14//!    precision is impossible by construction. Unbounded numeric columns
15//!    therefore *must* go through [`RivetType::Unsupported`] or be resolved
16//!    by `TypePolicy::decimal.unbounded` (Chunk 4).
17//! 2. **No silent timezone loss** — `Timestamp { unit, timezone }` makes the
18//!    timezone explicit; `timezone: None` means "no timezone semantics" and
19//!    is *not* the same as `Some("UTC")`.
20//! 3. **No silent fallback to string** — anything Rivet can't safely map is
21//!    represented as [`RivetType::Unsupported`] with a reason string, so
22//!    the type-policy layer can decide whether to fail / warn / fallback.
23//!
//! `serde::Serialize` is derived so the type-report CLI (Chunk 5) can emit a
//! stable JSON shape; `serde::Deserialize` is derived alongside it so that
//! same shape can be parsed back.
26
27use serde::{Deserialize, Serialize};
28
29/// Time-resolution unit for [`RivetType::Time`] / [`RivetType::Timestamp`].
30///
31/// Mirrors `arrow::datatypes::TimeUnit` but lives in the Rivet type system
32/// so we don't leak Arrow as a public API surface and so the type-report
33/// CLI can serialize the value without depending on Arrow's types.
34// Only `Microsecond` is produced by current drivers; the remaining variants
35// are live once ADBC / other drivers are added (roadmap §4).
36#[allow(dead_code)]
37#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
38#[serde(rename_all = "snake_case")]
39pub enum TimeUnit {
40    Second,
41    Millisecond,
42    Microsecond,
43    Nanosecond,
44}
45
46impl TimeUnit {
47    /// Stable lowercase string label for persistence and reports.
48    #[allow(dead_code)]
49    pub fn label(self) -> &'static str {
50        match self {
51            TimeUnit::Second => "second",
52            TimeUnit::Millisecond => "millisecond",
53            TimeUnit::Microsecond => "microsecond",
54            TimeUnit::Nanosecond => "nanosecond",
55        }
56    }
57}
58
59/// Canonical Rivet type. Every source-driver column maps into exactly one
60/// of these variants before we ever look at `arrow::DataType`.
61///
62/// Variants are kept narrow on purpose: adding a new variant is a deliberate
63/// architectural choice (it usually means "we figured out how to safely
64/// export a new shape of data"). Anything outside this enum becomes
65/// [`RivetType::Unsupported`] until the type system gains first-class
66/// support for it.
67// `UInt64` and `Decimal` are live in MySQL/PG mappers once column overrides
68// (Chunk 6) and exact decimal (Milestone 2) land; `Second`/`Millisecond`/
69// `Nanosecond` TimeUnit variants serve future drivers.
70#[allow(dead_code)]
71#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
72#[serde(tag = "kind", rename_all = "snake_case")]
73pub enum RivetType {
74    /// Boolean.
75    Bool,
76
77    /// Signed 16-bit integer (PostgreSQL `int2`, MySQL `smallint`).
78    Int16,
79    /// Signed 32-bit integer (PostgreSQL `int4`, MySQL `int`).
80    Int32,
81    /// Signed 64-bit integer (PostgreSQL `int8`, MySQL `bigint signed`).
82    Int64,
83    /// Unsigned 64-bit integer (MySQL `bigint unsigned`).
84    /// Target compatibility check required (BigQuery has no unsigned 64).
85    UInt64,
86
87    /// IEEE-754 32-bit float. Marked `exact-ish` in the roadmap mappings
88    /// because float→float is bit-exact but float→target may not round-trip.
89    Float32,
90    /// IEEE-754 64-bit float.
91    Float64,
92
93    /// Fixed-precision decimal. Precision/scale are *required* — the whole
94    /// point of having a separate variant from `Float64` is that we never
95    /// route money/decimal through floats. Mapped to Arrow Decimal128 when
96    /// `precision <= 38`, Decimal256 otherwise (roadmap §12).
97    Decimal {
98        /// Total number of significant digits.
99        precision: u8,
100        /// Digits to the right of the decimal point. Signed because
101        /// PostgreSQL `numeric` allows negative scale.
102        scale: i8,
103    },
104
105    /// Calendar date (no time, no timezone).
106    Date,
107
108    /// Time-of-day with the given resolution.
109    Time { unit: TimeUnit },
110
111    /// Timestamp with explicit timezone semantics. `timezone: None` means
112    /// "no timezone" (PostgreSQL `timestamp`, MySQL `datetime`);
113    /// `timezone: Some("UTC")` means timezone-normalized to UTC
114    /// (PostgreSQL `timestamptz`, MySQL `timestamp` with session tz=+00:00).
115    Timestamp {
116        unit: TimeUnit,
117        timezone: Option<String>,
118    },
119
120    /// Variable-length string (PostgreSQL `varchar`, `text`, `bpchar`,
121    /// `name`; MySQL `varchar`, `text`).
122    String,
123    /// Long-form text. Currently treated identically to `String` on the
124    /// Arrow layer (both → `Utf8`), but kept as a separate variant so the
125    /// type-report can distinguish "the source declared this as text" from
126    /// "the source declared this as fixed-length char".
127    Text,
128    /// Variable-length binary (PostgreSQL `bytea`, MySQL `varbinary`/`blob`).
129    Binary,
130
131    /// JSON / JSONB. Stored as `Utf8 + metadata logical=json` until proper
132    /// struct inference is implemented (roadmap §14).
133    Json,
134    /// UUID. Stored as `Utf8 + metadata logical=uuid` by default; can be
135    /// switched to FixedSizeBinary(16) by policy later (roadmap §14).
136    Uuid,
137
138    // ── M6: Complex Types ─────────────────────────────────────────────────
139    /// Database enum type (PostgreSQL `ENUM`, MySQL `ENUM`/`SET`).
140    /// Stored as `Utf8 + metadata logical=enum` (roadmap §15).
141    Enum,
142
143    /// Time interval (PostgreSQL `interval`).
144    /// Stored as Arrow `Utf8` (ISO 8601 duration string, e.g. `"P1Y2M3D"`).
145    /// `Interval(MonthDayNano)` cannot be written to Parquet, so lossless
146    /// text serialisation is used instead (roadmap §15).
147    Interval,
148
149    /// One-dimensional array of a scalar Rivet type.
150    /// PostgreSQL `int8[]`, `text[]`, `bool[]`, etc.
151    /// Stored as Arrow `List(inner_type)` (roadmap §15).
152    List { inner: Box<RivetType> },
153
154    /// The driver knows about the type but Rivet does not have a safe
155    /// mapping for it (e.g. PostgreSQL `geometry`, `hstore`).
156    /// Carries enough context for an actionable error message in the
157    /// type-report and policy layer.
158    Unsupported { native_type: String, reason: String },
159}
160
161impl RivetType {
162    /// Stable lowercase string label for persistence and human-readable
163    /// reports. Round-trippable with the JSON shape of the variant when
164    /// applicable (e.g. `decimal(18,2)`, `timestamp_tz(microsecond,UTC)`).
165    /// Used by the type-report CLI (Chunk 5).
166    #[allow(dead_code)]
167    pub fn label(&self) -> String {
168        match self {
169            RivetType::Bool => "bool".into(),
170            RivetType::Int16 => "int16".into(),
171            RivetType::Int32 => "int32".into(),
172            RivetType::Int64 => "int64".into(),
173            RivetType::UInt64 => "uint64".into(),
174            RivetType::Float32 => "float32".into(),
175            RivetType::Float64 => "float64".into(),
176            RivetType::Decimal { precision, scale } => format!("decimal({precision},{scale})"),
177            RivetType::Date => "date".into(),
178            RivetType::Time { unit } => format!("time({})", unit.label()),
179            RivetType::Timestamp {
180                unit,
181                timezone: None,
182            } => format!("timestamp({})", unit.label()),
183            RivetType::Timestamp {
184                unit,
185                timezone: Some(tz),
186            } => format!("timestamp_tz({},{tz})", unit.label()),
187            RivetType::String => "string".into(),
188            RivetType::Text => "text".into(),
189            RivetType::Binary => "binary".into(),
190            RivetType::Json => "json".into(),
191            RivetType::Uuid => "uuid".into(),
192            RivetType::Enum => "enum".into(),
193            RivetType::Interval => "interval".into(),
194            RivetType::List { inner } => format!("list<{}>", inner.label()),
195            RivetType::Unsupported { native_type, .. } => format!("unsupported({native_type})"),
196        }
197    }
198
199    /// True for the `Unsupported` variant — convenience for the strict-mode
200    /// gate so callers don't have to `matches!()` everywhere.
201    #[allow(dead_code)]
202    pub fn is_unsupported(&self) -> bool {
203        match self {
204            RivetType::Unsupported { .. } => true,
205            // A list of an unsupported element is itself unsupported — the run
206            // can't build the field. Keep consistent with `derive_fidelity`.
207            RivetType::List { inner } => inner.is_unsupported(),
208            _ => false,
209        }
210    }
211}
212
#[cfg(test)]
mod tests {
    //! Unit tests pinning the label format, the `Unsupported` helpers and
    //! the JSON shape of the Rivet type system.

    use super::*;

    /// Decimal labels must carry both precision and scale.
    #[test]
    fn label_includes_decimal_precision_and_scale() {
        assert_eq!(
            RivetType::Decimal {
                precision: 18,
                scale: 2,
            }
            .label(),
            "decimal(18,2)"
        );
    }

    /// A naive timestamp and a UTC timestamp differ in label and equality.
    #[test]
    fn label_distinguishes_timestamp_with_and_without_timezone() {
        let naive = RivetType::Timestamp {
            unit: TimeUnit::Microsecond,
            timezone: None,
        };
        let tz = RivetType::Timestamp {
            unit: TimeUnit::Microsecond,
            timezone: Some("UTC".into()),
        };
        assert_eq!(naive.label(), "timestamp(microsecond)");
        assert_eq!(tz.label(), "timestamp_tz(microsecond,UTC)");
        assert_ne!(naive, tz, "tz=None and tz=Some(\"UTC\") must NOT be equal");
    }

    /// `Unsupported` reports itself as such and labels by native type.
    #[test]
    fn unsupported_carries_actionable_context() {
        // `geometry` rather than `interval`: interval is a first-class
        // variant (`RivetType::Interval`), so it is no longer a realistic
        // example of an unsupported native type.
        let t = RivetType::Unsupported {
            native_type: "geometry".into(),
            reason: "no safe Arrow mapping for geometry".into(),
        };
        assert!(t.is_unsupported());
        assert_eq!(t.label(), "unsupported(geometry)");
    }

    /// List labels wrap the element label, including when nested.
    #[test]
    fn list_label_wraps_inner_label() {
        let ints = RivetType::List {
            inner: Box::new(RivetType::Int64),
        };
        assert_eq!(ints.label(), "list<int64>");

        let nested = RivetType::List {
            inner: Box::new(ints),
        };
        assert_eq!(nested.label(), "list<list<int64>>");
    }

    /// A list is unsupported exactly when its element type is.
    #[test]
    fn list_is_unsupported_only_when_element_is() {
        let supported = RivetType::List {
            inner: Box::new(RivetType::Text),
        };
        assert!(!supported.is_unsupported());

        let unsupported = RivetType::List {
            inner: Box::new(RivetType::Unsupported {
                native_type: "hstore".into(),
                reason: "no safe Arrow mapping for hstore".into(),
            }),
        };
        assert!(unsupported.is_unsupported());
    }

    /// The serialized form is an object tagged by `kind`.
    #[test]
    fn json_serialization_uses_kind_tag() {
        let t = RivetType::Decimal {
            precision: 10,
            scale: 3,
        };
        let json: serde_json::Value =
            serde_json::from_str(&serde_json::to_string(&t).expect("serialize")).expect("parse");
        assert_eq!(json["kind"], "decimal");
        assert_eq!(json["precision"], 10);
        assert_eq!(json["scale"], 3);
    }

    /// Serialized values must deserialize back to an equal value, including
    /// through the recursive `List` variant and an optional timezone.
    #[test]
    fn json_round_trips_through_deserialize() {
        let original = RivetType::List {
            inner: Box::new(RivetType::Timestamp {
                unit: TimeUnit::Microsecond,
                timezone: Some("UTC".into()),
            }),
        };
        let encoded = serde_json::to_string(&original).expect("serialize");
        let decoded: RivetType = serde_json::from_str(&encoded).expect("deserialize");
        assert_eq!(decoded, original);
    }

    /// Every `TimeUnit` keeps its lowercase label.
    #[test]
    fn time_unit_labels_are_stable() {
        assert_eq!(TimeUnit::Second.label(), "second");
        assert_eq!(TimeUnit::Millisecond.label(), "millisecond");
        assert_eq!(TimeUnit::Microsecond.label(), "microsecond");
        assert_eq!(TimeUnit::Nanosecond.label(), "nanosecond");
    }
}