vibesql_types/data_type.rs
1//! SQL Data Type definitions
2
3use crate::temporal::IntervalField;
4
5/// SQLite Type Affinity
6///
7/// SQLite uses type affinity to determine how values are compared and stored.
8/// This enum represents the five affinity types defined by SQLite:
9/// - TEXT: String comparison semantics
10/// - NUMERIC: Numeric comparison, converts strings to numbers when possible
11/// - INTEGER: Prefers integer storage
12/// - REAL: Floating-point storage
13/// - BLOB/NONE: No affinity, uses type ordering for cross-type comparisons
14///
15/// See: https://www.sqlite.org/datatype3.html
16#[derive(Debug, Clone, Copy, PartialEq, Eq)]
17pub enum TypeAffinity {
18 /// TEXT affinity - string comparison semantics
19 /// When comparing TEXT vs INTEGER, converts INTEGER to TEXT
20 Text,
21 /// NUMERIC affinity - tries to convert to number first
22 Numeric,
23 /// INTEGER affinity - prefers integer storage
24 Integer,
25 /// REAL affinity - floating-point storage
26 Real,
27 /// BLOB/NONE affinity - no type preference, uses type ordering
28 /// This is used for bare columns with no declared type
29 None,
30}
31
32/// SQL:1999 Data Types
33///
34/// Represents the type of a column or expression in SQL.
35#[derive(Debug, Clone, PartialEq, Eq)]
36pub enum DataType {
37 // Exact numeric types
38 Integer,
39 Smallint,
40 Bigint,
41 Unsigned, // 64-bit unsigned integer (MySQL compatibility)
42 Numeric { precision: u8, scale: u8 },
43 Decimal { precision: u8, scale: u8 },
44
45 // Approximate numeric types
46 Float { precision: u8 }, // SQL:1999 FLOAT(p), default 53 (double precision)
47 Real,
48 DoublePrecision,
49
50 // Character string types
51 Character { length: usize },
52 Varchar { max_length: Option<usize> }, // None = default length (255)
53 CharacterLargeObject, // CLOB
54 Name, /* NAME type for SQL identifiers (SQL:1999), maps to
55 * VARCHAR(128) */
56
57 // Boolean type (SQL:1999)
58 Boolean,
59
60 // Date/time types
61 Date,
62 Time { with_timezone: bool },
63 Timestamp { with_timezone: bool },
64
65 // Interval types
66 // Single field: INTERVAL YEAR, INTERVAL MONTH, etc. (end_field is None)
67 // Multi-field: INTERVAL YEAR TO MONTH, INTERVAL DAY TO SECOND, etc.
68 Interval { start_field: IntervalField, end_field: Option<IntervalField> },
69
70 // Binary types
71 BinaryLargeObject, // BLOB
72 Bit { length: Option<usize> }, // BIT or BIT(n), MySQL compatibility, default length is 1
73
74 // Vector types (for AI/ML workloads)
75 Vector { dimensions: u32 },
76
77 // User-defined types (SQL:1999)
78 UserDefined { type_name: String },
79
80 // Special type for NULL
81 Null,
82}
83
84impl DataType {
85 /// Returns the type precedence for SQL:1999 type coercion
86 ///
87 /// Higher precedence types are preferred in type coercion.
88 /// Based on SQL:1999 Section 9.5 (Result of data type combinations).
89 ///
90 /// Precedence order (highest to lowest):
91 /// 1. Character strings (VARCHAR, CHAR, CLOB, NAME)
92 /// 2. Approximate numerics (DOUBLE PRECISION, REAL, FLOAT)
93 /// 3. Exact numerics with scale (DECIMAL, NUMERIC)
94 /// 4. Exact numerics without scale (BIGINT > INTEGER > SMALLINT, UNSIGNED)
95 /// 5. Boolean
96 /// 6. Temporal types (TIMESTAMP > TIME > DATE)
97 /// 7. Interval types
98 /// 8. Binary types (BLOB)
99 fn type_precedence(&self) -> u8 {
100 match self {
101 // NULL has lowest precedence - coerces to anything
102 DataType::Null => 0,
103
104 // Binary types
105 DataType::BinaryLargeObject => 10,
106 DataType::Bit { .. } => 11, // BIT type, slightly higher than BLOB
107
108 // Interval types
109 DataType::Interval { .. } => 20,
110
111 // Temporal types (ordered by precision)
112 DataType::Date => 30,
113 DataType::Time { .. } => 31,
114 DataType::Timestamp { .. } => 32,
115
116 // Boolean
117 DataType::Boolean => 40,
118
119 // Exact numerics without scale (ordered by size)
120 DataType::Smallint => 50,
121 DataType::Integer => 51,
122 DataType::Bigint => 52,
123 DataType::Unsigned => 52, // Same as BIGINT (both 64-bit)
124
125 // Exact numerics with scale
126 DataType::Decimal { .. } => 60,
127 DataType::Numeric { .. } => 60,
128
129 // Approximate numerics (ordered by precision)
130 DataType::Real => 70,
131 DataType::Float { .. } => 71,
132 DataType::DoublePrecision => 72,
133
134 // Character strings (highest precedence)
135 DataType::Character { .. } => 80,
136 DataType::Varchar { .. } => 81,
137 DataType::Name => 81, // NAME is equivalent to VARCHAR
138 DataType::CharacterLargeObject => 82,
139
140 // Vector types (specialized for AI/ML operations, don't coerce with other types)
141 DataType::Vector { .. } => 65,
142
143 // User-defined types don't participate in implicit coercion
144 DataType::UserDefined { .. } => 255,
145 }
146 }
147
148 /// Determines if implicit type coercion is possible between two types
149 ///
150 /// Returns true if SQL:1999 allows implicit conversion between the types.
151 /// This is more permissive than exact type equality.
152 fn can_implicitly_coerce(&self, other: &DataType) -> bool {
153 // NULL can coerce to/from anything
154 if matches!(self, DataType::Null) || matches!(other, DataType::Null) {
155 return true;
156 }
157
158 match (self, other) {
159 // Same types can always coerce
160 (a, b) if a == b => true,
161
162 // DECIMAL/NUMERIC with different precision/scale can coerce
163 (DataType::Decimal { .. }, DataType::Decimal { .. }) => true,
164 (DataType::Numeric { .. }, DataType::Numeric { .. }) => true,
165
166 // VARCHAR with different lengths can coerce
167 (DataType::Varchar { .. }, DataType::Varchar { .. }) => true,
168
169 // BIT types with different lengths can coerce
170 (DataType::Bit { .. }, DataType::Bit { .. }) => true,
171
172 // BIT can coerce to/from integer types (numeric interpretation)
173 (
174 DataType::Bit { .. },
175 DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
176 ) => true,
177 (
178 DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
179 DataType::Bit { .. },
180 ) => true,
181
182 // BIT can coerce to/from binary types
183 (DataType::Bit { .. }, DataType::BinaryLargeObject) => true,
184 (DataType::BinaryLargeObject, DataType::Bit { .. }) => true,
185
186 // Numeric types can coerce among themselves
187 (DataType::Smallint, DataType::Integer | DataType::Bigint | DataType::Unsigned) => true,
188 (DataType::Integer, DataType::Bigint | DataType::Unsigned) => true,
189 (DataType::Bigint, DataType::Unsigned) => true,
190 (DataType::Unsigned, DataType::Bigint) => true,
191 (
192 DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
193 DataType::Decimal { .. } | DataType::Numeric { .. },
194 ) => true,
195 (
196 DataType::Decimal { .. } | DataType::Numeric { .. },
197 DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
198 ) => true,
199 (
200 DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
201 DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
202 ) => true,
203
204 // Numeric types are bidirectionally coercible (widening and narrowing allowed)
205 (DataType::Bigint | DataType::Unsigned, DataType::Integer | DataType::Smallint) => true,
206 (
207 DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
208 DataType::Decimal { .. } | DataType::Numeric { .. },
209 ) => true,
210 (
211 DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
212 DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
213 ) => true,
214 (
215 DataType::Decimal { .. } | DataType::Numeric { .. },
216 DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
217 ) => true,
218
219 // Character string types can coerce among themselves
220 (
221 DataType::Character { .. },
222 DataType::Varchar { .. } | DataType::Name | DataType::CharacterLargeObject,
223 ) => true,
224 (
225 DataType::Varchar { .. },
226 DataType::Character { .. } | DataType::Name | DataType::CharacterLargeObject,
227 ) => true,
228 (
229 DataType::Name,
230 DataType::Character { .. }
231 | DataType::Varchar { .. }
232 | DataType::CharacterLargeObject,
233 ) => true,
234 (
235 DataType::CharacterLargeObject,
236 DataType::Character { .. } | DataType::Varchar { .. } | DataType::Name,
237 ) => true,
238
239 // Temporal types can coerce among themselves
240 (DataType::Date, DataType::Timestamp { .. }) => true,
241 (DataType::Time { .. }, DataType::Timestamp { .. }) => true,
242 (DataType::Timestamp { .. }, DataType::Date | DataType::Time { .. }) => true,
243
244 // Intervals with different fields can coerce if compatible
245 (DataType::Interval { .. }, DataType::Interval { .. }) => true,
246
247 // Vectors with matching dimensions can coerce
248 (DataType::Vector { dimensions: d1 }, DataType::Vector { dimensions: d2 }) => d1 == d2,
249
250 // User-defined types only coerce to themselves (checked above with ==)
251 // All other combinations cannot coerce
252 _ => false,
253 }
254 }
255
256 /// Coerces two types to their common type according to SQL:1999 rules
257 ///
258 /// Returns the result type that should be used when combining values
259 /// of the two input types, or None if no implicit coercion exists.
260 ///
261 /// Based on SQL:1999 Section 9.5 (Result of data type combinations).
262 pub fn coerce_to_common(&self, other: &DataType) -> Option<DataType> {
263 // NULL coerces to the other type
264 if matches!(self, DataType::Null) {
265 return Some(other.clone());
266 }
267 if matches!(other, DataType::Null) {
268 return Some(self.clone());
269 }
270
271 // Check if coercion is possible
272 if !self.can_implicitly_coerce(other) {
273 return None;
274 }
275
276 // Same types return themselves
277 if self == other {
278 return Some(self.clone());
279 }
280
281 // Special handling for types with precision/scale parameters
282 match (self, other) {
283 // For DECIMAL/NUMERIC, combine precision and scale appropriately
284 (
285 DataType::Decimal { precision: p1, scale: s1 },
286 DataType::Decimal { precision: p2, scale: s2 },
287 ) => {
288 let max_scale = (*s1).max(*s2);
289 let max_precision = (*p1).max(*p2);
290 Some(DataType::Decimal { precision: max_precision, scale: max_scale })
291 }
292 (
293 DataType::Numeric { precision: p1, scale: s1 },
294 DataType::Numeric { precision: p2, scale: s2 },
295 ) => {
296 let max_scale = (*s1).max(*s2);
297 let max_precision = (*p1).max(*p2);
298 Some(DataType::Numeric { precision: max_precision, scale: max_scale })
299 }
300
301 // For VARCHAR, use the larger length (or None for unlimited)
302 (DataType::Varchar { max_length: l1 }, DataType::Varchar { max_length: l2 }) => {
303 let max_length = match (l1, l2) {
304 (None, _) | (_, None) => None,
305 (Some(a), Some(b)) => Some((*a).max(*b)),
306 };
307 Some(DataType::Varchar { max_length })
308 }
309
310 // For BIT, use the larger length (or None for unlimited)
311 (DataType::Bit { length: l1 }, DataType::Bit { length: l2 }) => {
312 let max_length = match (l1, l2) {
313 (None, _) | (_, None) => None,
314 (Some(a), Some(b)) => Some((*a).max(*b)),
315 };
316 Some(DataType::Bit { length: max_length })
317 }
318
319 // Vectors with matching dimensions coerce to themselves
320 (DataType::Vector { dimensions: d1 }, DataType::Vector { dimensions: d2 })
321 if d1 == d2 =>
322 {
323 Some(self.clone())
324 }
325
326 // For all other cases, choose the type with higher precedence
327 _ => {
328 let result = if self.type_precedence() > other.type_precedence() {
329 self.clone()
330 } else {
331 other.clone()
332 };
333 Some(result)
334 }
335 }
336 }
337
338 /// Check if this type is compatible with another type for operations
339 ///
340 /// This now uses SQL:1999 type coercion rules. Types are compatible if
341 /// they can be implicitly coerced to a common type.
342 pub fn is_compatible_with(&self, other: &DataType) -> bool {
343 self.coerce_to_common(other).is_some()
344 }
345
346 /// Returns an estimated size in bytes for values of this type.
347 ///
348 /// This is used for adaptive morsel sizing to maintain cache efficiency.
349 /// Estimates are conservative (may overestimate) to ensure cache fitting.
350 ///
351 /// Returns the estimated size in bytes for a single value of this type,
352 /// including any overhead from the SqlValue enum representation.
353 pub fn estimated_size_bytes(&self) -> usize {
354 // Base overhead for SqlValue enum discriminant + alignment
355 const ENUM_OVERHEAD: usize = 8;
356
357 let value_size = match self {
358 // Fixed-size numeric types
359 DataType::Smallint => 2,
360 DataType::Integer => 4,
361 DataType::Bigint | DataType::Unsigned => 8,
362 DataType::Real => 4,
363 DataType::Float { .. } | DataType::DoublePrecision => 8,
364
365 // Decimal/Numeric - stored as rust_decimal::Decimal (16 bytes)
366 DataType::Decimal { .. } | DataType::Numeric { .. } => 16,
367
368 // Boolean
369 DataType::Boolean => 1,
370
371 // Character types - estimate based on length, with ArcStr overhead
372 DataType::Character { length } => {
373 const ARCSTR_OVERHEAD: usize = 16; // Arc pointer + length
374 ARCSTR_OVERHEAD + length
375 }
376 DataType::Varchar { max_length } => {
377 const ARCSTR_OVERHEAD: usize = 16;
378 // Use max_length if specified, otherwise assume 50 bytes average
379 ARCSTR_OVERHEAD + max_length.unwrap_or(50)
380 }
381 DataType::Name => {
382 const ARCSTR_OVERHEAD: usize = 16;
383 ARCSTR_OVERHEAD + 128 // NAME is VARCHAR(128)
384 }
385 DataType::CharacterLargeObject => {
386 const ARCSTR_OVERHEAD: usize = 16;
387 ARCSTR_OVERHEAD + 1000 // Conservative estimate for CLOB
388 }
389
390 // Date/time types
391 DataType::Date => 4, // i32 for days
392 DataType::Time { .. } => 8, // i64 for nanoseconds
393 DataType::Timestamp { .. } => 8, // i64 for timestamp
394 DataType::Interval { .. } => 16, // IntervalValue struct
395
396 // Binary types
397 DataType::BinaryLargeObject => 1000, // Conservative estimate
398 DataType::Bit { length } => {
399 // Bits stored as bytes, rounded up
400 length.unwrap_or(1).div_ceil(8)
401 }
402
403 // Vector types
404 DataType::Vector { dimensions } => {
405 const VEC_OVERHEAD: usize = 24; // Vec header
406 VEC_OVERHEAD + (*dimensions as usize * 4) // f32 per dimension
407 }
408
409 // Special types
410 DataType::Null => 0,
411 DataType::UserDefined { .. } => 100, // Conservative estimate
412 };
413
414 ENUM_OVERHEAD + value_size
415 }
416
417 /// Returns the SQLite type affinity for this data type.
418 ///
419 /// SQLite determines affinity based on the declared type name:
420 /// 1. If the type contains "INT" → INTEGER affinity
421 /// 2. If the type contains "CHAR", "CLOB", or "TEXT" → TEXT affinity
422 /// 3. If the type contains "BLOB" or has no type → BLOB/NONE affinity
423 /// 4. If the type contains "REAL", "FLOA", or "DOUB" → REAL affinity
424 /// 5. Otherwise → NUMERIC affinity
425 ///
426 /// See: https://www.sqlite.org/datatype3.html#type_affinity
427 pub fn sqlite_affinity(&self) -> TypeAffinity {
428 match self {
429 // Integer types → INTEGER affinity
430 DataType::Integer | DataType::Smallint | DataType::Bigint | DataType::Unsigned => {
431 TypeAffinity::Integer
432 }
433
434 // Character/Text types → TEXT affinity
435 DataType::Character { .. }
436 | DataType::Varchar { .. }
437 | DataType::CharacterLargeObject
438 | DataType::Name => TypeAffinity::Text,
439
440 // Floating-point types → REAL affinity
441 DataType::Real | DataType::Float { .. } | DataType::DoublePrecision => {
442 TypeAffinity::Real
443 }
444
445 // Decimal/Numeric → NUMERIC affinity
446 DataType::Decimal { .. } | DataType::Numeric { .. } => TypeAffinity::Numeric,
447
448 // Binary types → NONE/BLOB affinity
449 DataType::BinaryLargeObject | DataType::Bit { .. } => TypeAffinity::None,
450
451 // Boolean → NUMERIC affinity (SQLite stores as 0/1)
452 DataType::Boolean => TypeAffinity::Numeric,
453
454 // Temporal types → NUMERIC affinity (SQLite stores as numbers or text)
455 DataType::Date | DataType::Time { .. } | DataType::Timestamp { .. } => {
456 TypeAffinity::Numeric
457 }
458
459 // Interval → NUMERIC affinity
460 DataType::Interval { .. } => TypeAffinity::Numeric,
461
462 // Vector → NONE affinity (custom type)
463 DataType::Vector { .. } => TypeAffinity::None,
464
465 // NULL → NONE affinity
466 DataType::Null => TypeAffinity::None,
467
468 // User-defined types: Apply SQLite's affinity rules based on type name.
469 // SQLite determines affinity by checking if the type name contains:
470 // 1. "INT" → INTEGER affinity
471 // 2. "CHAR", "CLOB", or "TEXT" → TEXT affinity
472 // 3. "BLOB" or no type → NONE/BLOB affinity
473 // 4. "REAL", "FLOA", or "DOUB" → REAL affinity
474 // 5. Otherwise → NUMERIC affinity
475 //
476 // This handles multi-word types like "LARGE BLOB", "NATIVE CHARACTER",
477 // "VARYING CHARACTER", "UNSIGNED BIG INT", etc.
478 DataType::UserDefined { type_name } => {
479 let upper = type_name.to_uppercase();
480 if upper.contains("INT") {
481 TypeAffinity::Integer
482 } else if upper.contains("CHAR") || upper.contains("CLOB") || upper.contains("TEXT")
483 {
484 TypeAffinity::Text
485 } else if upper.contains("BLOB") {
486 TypeAffinity::None
487 } else if upper.contains("REAL")
488 || upper.contains("FLOA")
489 || upper.contains("DOUB")
490 {
491 TypeAffinity::Real
492 } else {
493 TypeAffinity::Numeric
494 }
495 }
496 }
497 }
498}