vibesql_types/
data_type.rs

1//! SQL Data Type definitions
2
3use crate::temporal::IntervalField;
4
5/// SQLite Type Affinity
6///
7/// SQLite uses type affinity to determine how values are compared and stored.
8/// This enum represents the five affinity types defined by SQLite:
9/// - TEXT: String comparison semantics
10/// - NUMERIC: Numeric comparison, converts strings to numbers when possible
11/// - INTEGER: Prefers integer storage
12/// - REAL: Floating-point storage
13/// - BLOB/NONE: No affinity, uses type ordering for cross-type comparisons
14///
15/// See: https://www.sqlite.org/datatype3.html
16#[derive(Debug, Clone, Copy, PartialEq, Eq)]
17pub enum TypeAffinity {
18    /// TEXT affinity - string comparison semantics
19    /// When comparing TEXT vs INTEGER, converts INTEGER to TEXT
20    Text,
21    /// NUMERIC affinity - tries to convert to number first
22    Numeric,
23    /// INTEGER affinity - prefers integer storage
24    Integer,
25    /// REAL affinity - floating-point storage
26    Real,
27    /// BLOB/NONE affinity - no type preference, uses type ordering
28    /// This is used for bare columns with no declared type
29    None,
30}
31
32/// SQL:1999 Data Types
33///
34/// Represents the type of a column or expression in SQL.
35#[derive(Debug, Clone, PartialEq, Eq)]
36pub enum DataType {
37    // Exact numeric types
38    Integer,
39    Smallint,
40    Bigint,
41    Unsigned, // 64-bit unsigned integer (MySQL compatibility)
42    Numeric { precision: u8, scale: u8 },
43    Decimal { precision: u8, scale: u8 },
44
45    // Approximate numeric types
46    Float { precision: u8 }, // SQL:1999 FLOAT(p), default 53 (double precision)
47    Real,
48    DoublePrecision,
49
50    // Character string types
51    Character { length: usize },
52    Varchar { max_length: Option<usize> }, // None = default length (255)
53    CharacterLargeObject,                  // CLOB
54    Name,                                  /* NAME type for SQL identifiers (SQL:1999), maps to
55                                            * VARCHAR(128) */
56
57    // Boolean type (SQL:1999)
58    Boolean,
59
60    // Date/time types
61    Date,
62    Time { with_timezone: bool },
63    Timestamp { with_timezone: bool },
64
65    // Interval types
66    // Single field: INTERVAL YEAR, INTERVAL MONTH, etc. (end_field is None)
67    // Multi-field: INTERVAL YEAR TO MONTH, INTERVAL DAY TO SECOND, etc.
68    Interval { start_field: IntervalField, end_field: Option<IntervalField> },
69
70    // Binary types
71    BinaryLargeObject,             // BLOB
72    Bit { length: Option<usize> }, // BIT or BIT(n), MySQL compatibility, default length is 1
73
74    // Vector types (for AI/ML workloads)
75    Vector { dimensions: u32 },
76
77    // User-defined types (SQL:1999)
78    UserDefined { type_name: String },
79
80    // Special type for NULL
81    Null,
82}
83
84impl DataType {
85    /// Returns the type precedence for SQL:1999 type coercion
86    ///
87    /// Higher precedence types are preferred in type coercion.
88    /// Based on SQL:1999 Section 9.5 (Result of data type combinations).
89    ///
90    /// Precedence order (highest to lowest):
91    /// 1. Character strings (VARCHAR, CHAR, CLOB, NAME)
92    /// 2. Approximate numerics (DOUBLE PRECISION, REAL, FLOAT)
93    /// 3. Exact numerics with scale (DECIMAL, NUMERIC)
94    /// 4. Exact numerics without scale (BIGINT > INTEGER > SMALLINT, UNSIGNED)
95    /// 5. Boolean
96    /// 6. Temporal types (TIMESTAMP > TIME > DATE)
97    /// 7. Interval types
98    /// 8. Binary types (BLOB)
99    fn type_precedence(&self) -> u8 {
100        match self {
101            // NULL has lowest precedence - coerces to anything
102            DataType::Null => 0,
103
104            // Binary types
105            DataType::BinaryLargeObject => 10,
106            DataType::Bit { .. } => 11, // BIT type, slightly higher than BLOB
107
108            // Interval types
109            DataType::Interval { .. } => 20,
110
111            // Temporal types (ordered by precision)
112            DataType::Date => 30,
113            DataType::Time { .. } => 31,
114            DataType::Timestamp { .. } => 32,
115
116            // Boolean
117            DataType::Boolean => 40,
118
119            // Exact numerics without scale (ordered by size)
120            DataType::Smallint => 50,
121            DataType::Integer => 51,
122            DataType::Bigint => 52,
123            DataType::Unsigned => 52, // Same as BIGINT (both 64-bit)
124
125            // Exact numerics with scale
126            DataType::Decimal { .. } => 60,
127            DataType::Numeric { .. } => 60,
128
129            // Approximate numerics (ordered by precision)
130            DataType::Real => 70,
131            DataType::Float { .. } => 71,
132            DataType::DoublePrecision => 72,
133
134            // Character strings (highest precedence)
135            DataType::Character { .. } => 80,
136            DataType::Varchar { .. } => 81,
137            DataType::Name => 81, // NAME is equivalent to VARCHAR
138            DataType::CharacterLargeObject => 82,
139
140            // Vector types (specialized for AI/ML operations, don't coerce with other types)
141            DataType::Vector { .. } => 65,
142
143            // User-defined types don't participate in implicit coercion
144            DataType::UserDefined { .. } => 255,
145        }
146    }
147
148    /// Determines if implicit type coercion is possible between two types
149    ///
150    /// Returns true if SQL:1999 allows implicit conversion between the types.
151    /// This is more permissive than exact type equality.
152    fn can_implicitly_coerce(&self, other: &DataType) -> bool {
153        // NULL can coerce to/from anything
154        if matches!(self, DataType::Null) || matches!(other, DataType::Null) {
155            return true;
156        }
157
158        match (self, other) {
159            // Same types can always coerce
160            (a, b) if a == b => true,
161
162            // DECIMAL/NUMERIC with different precision/scale can coerce
163            (DataType::Decimal { .. }, DataType::Decimal { .. }) => true,
164            (DataType::Numeric { .. }, DataType::Numeric { .. }) => true,
165
166            // VARCHAR with different lengths can coerce
167            (DataType::Varchar { .. }, DataType::Varchar { .. }) => true,
168
169            // BIT types with different lengths can coerce
170            (DataType::Bit { .. }, DataType::Bit { .. }) => true,
171
172            // BIT can coerce to/from integer types (numeric interpretation)
173            (
174                DataType::Bit { .. },
175                DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
176            ) => true,
177            (
178                DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
179                DataType::Bit { .. },
180            ) => true,
181
182            // BIT can coerce to/from binary types
183            (DataType::Bit { .. }, DataType::BinaryLargeObject) => true,
184            (DataType::BinaryLargeObject, DataType::Bit { .. }) => true,
185
186            // Numeric types can coerce among themselves
187            (DataType::Smallint, DataType::Integer | DataType::Bigint | DataType::Unsigned) => true,
188            (DataType::Integer, DataType::Bigint | DataType::Unsigned) => true,
189            (DataType::Bigint, DataType::Unsigned) => true,
190            (DataType::Unsigned, DataType::Bigint) => true,
191            (
192                DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
193                DataType::Decimal { .. } | DataType::Numeric { .. },
194            ) => true,
195            (
196                DataType::Decimal { .. } | DataType::Numeric { .. },
197                DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
198            ) => true,
199            (
200                DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
201                DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
202            ) => true,
203
204            // Numeric types are bidirectionally coercible (widening and narrowing allowed)
205            (DataType::Bigint | DataType::Unsigned, DataType::Integer | DataType::Smallint) => true,
206            (
207                DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
208                DataType::Decimal { .. } | DataType::Numeric { .. },
209            ) => true,
210            (
211                DataType::Real | DataType::Float { .. } | DataType::DoublePrecision,
212                DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
213            ) => true,
214            (
215                DataType::Decimal { .. } | DataType::Numeric { .. },
216                DataType::Integer | DataType::Bigint | DataType::Unsigned | DataType::Smallint,
217            ) => true,
218
219            // Character string types can coerce among themselves
220            (
221                DataType::Character { .. },
222                DataType::Varchar { .. } | DataType::Name | DataType::CharacterLargeObject,
223            ) => true,
224            (
225                DataType::Varchar { .. },
226                DataType::Character { .. } | DataType::Name | DataType::CharacterLargeObject,
227            ) => true,
228            (
229                DataType::Name,
230                DataType::Character { .. }
231                | DataType::Varchar { .. }
232                | DataType::CharacterLargeObject,
233            ) => true,
234            (
235                DataType::CharacterLargeObject,
236                DataType::Character { .. } | DataType::Varchar { .. } | DataType::Name,
237            ) => true,
238
239            // Temporal types can coerce among themselves
240            (DataType::Date, DataType::Timestamp { .. }) => true,
241            (DataType::Time { .. }, DataType::Timestamp { .. }) => true,
242            (DataType::Timestamp { .. }, DataType::Date | DataType::Time { .. }) => true,
243
244            // Intervals with different fields can coerce if compatible
245            (DataType::Interval { .. }, DataType::Interval { .. }) => true,
246
247            // Vectors with matching dimensions can coerce
248            (DataType::Vector { dimensions: d1 }, DataType::Vector { dimensions: d2 }) => d1 == d2,
249
250            // User-defined types only coerce to themselves (checked above with ==)
251            // All other combinations cannot coerce
252            _ => false,
253        }
254    }
255
256    /// Coerces two types to their common type according to SQL:1999 rules
257    ///
258    /// Returns the result type that should be used when combining values
259    /// of the two input types, or None if no implicit coercion exists.
260    ///
261    /// Based on SQL:1999 Section 9.5 (Result of data type combinations).
262    pub fn coerce_to_common(&self, other: &DataType) -> Option<DataType> {
263        // NULL coerces to the other type
264        if matches!(self, DataType::Null) {
265            return Some(other.clone());
266        }
267        if matches!(other, DataType::Null) {
268            return Some(self.clone());
269        }
270
271        // Check if coercion is possible
272        if !self.can_implicitly_coerce(other) {
273            return None;
274        }
275
276        // Same types return themselves
277        if self == other {
278            return Some(self.clone());
279        }
280
281        // Special handling for types with precision/scale parameters
282        match (self, other) {
283            // For DECIMAL/NUMERIC, combine precision and scale appropriately
284            (
285                DataType::Decimal { precision: p1, scale: s1 },
286                DataType::Decimal { precision: p2, scale: s2 },
287            ) => {
288                let max_scale = (*s1).max(*s2);
289                let max_precision = (*p1).max(*p2);
290                Some(DataType::Decimal { precision: max_precision, scale: max_scale })
291            }
292            (
293                DataType::Numeric { precision: p1, scale: s1 },
294                DataType::Numeric { precision: p2, scale: s2 },
295            ) => {
296                let max_scale = (*s1).max(*s2);
297                let max_precision = (*p1).max(*p2);
298                Some(DataType::Numeric { precision: max_precision, scale: max_scale })
299            }
300
301            // For VARCHAR, use the larger length (or None for unlimited)
302            (DataType::Varchar { max_length: l1 }, DataType::Varchar { max_length: l2 }) => {
303                let max_length = match (l1, l2) {
304                    (None, _) | (_, None) => None,
305                    (Some(a), Some(b)) => Some((*a).max(*b)),
306                };
307                Some(DataType::Varchar { max_length })
308            }
309
310            // For BIT, use the larger length (or None for unlimited)
311            (DataType::Bit { length: l1 }, DataType::Bit { length: l2 }) => {
312                let max_length = match (l1, l2) {
313                    (None, _) | (_, None) => None,
314                    (Some(a), Some(b)) => Some((*a).max(*b)),
315                };
316                Some(DataType::Bit { length: max_length })
317            }
318
319            // Vectors with matching dimensions coerce to themselves
320            (DataType::Vector { dimensions: d1 }, DataType::Vector { dimensions: d2 })
321                if d1 == d2 =>
322            {
323                Some(self.clone())
324            }
325
326            // For all other cases, choose the type with higher precedence
327            _ => {
328                let result = if self.type_precedence() > other.type_precedence() {
329                    self.clone()
330                } else {
331                    other.clone()
332                };
333                Some(result)
334            }
335        }
336    }
337
338    /// Check if this type is compatible with another type for operations
339    ///
340    /// This now uses SQL:1999 type coercion rules. Types are compatible if
341    /// they can be implicitly coerced to a common type.
342    pub fn is_compatible_with(&self, other: &DataType) -> bool {
343        self.coerce_to_common(other).is_some()
344    }
345
346    /// Returns an estimated size in bytes for values of this type.
347    ///
348    /// This is used for adaptive morsel sizing to maintain cache efficiency.
349    /// Estimates are conservative (may overestimate) to ensure cache fitting.
350    ///
351    /// Returns the estimated size in bytes for a single value of this type,
352    /// including any overhead from the SqlValue enum representation.
353    pub fn estimated_size_bytes(&self) -> usize {
354        // Base overhead for SqlValue enum discriminant + alignment
355        const ENUM_OVERHEAD: usize = 8;
356
357        let value_size = match self {
358            // Fixed-size numeric types
359            DataType::Smallint => 2,
360            DataType::Integer => 4,
361            DataType::Bigint | DataType::Unsigned => 8,
362            DataType::Real => 4,
363            DataType::Float { .. } | DataType::DoublePrecision => 8,
364
365            // Decimal/Numeric - stored as rust_decimal::Decimal (16 bytes)
366            DataType::Decimal { .. } | DataType::Numeric { .. } => 16,
367
368            // Boolean
369            DataType::Boolean => 1,
370
371            // Character types - estimate based on length, with ArcStr overhead
372            DataType::Character { length } => {
373                const ARCSTR_OVERHEAD: usize = 16; // Arc pointer + length
374                ARCSTR_OVERHEAD + length
375            }
376            DataType::Varchar { max_length } => {
377                const ARCSTR_OVERHEAD: usize = 16;
378                // Use max_length if specified, otherwise assume 50 bytes average
379                ARCSTR_OVERHEAD + max_length.unwrap_or(50)
380            }
381            DataType::Name => {
382                const ARCSTR_OVERHEAD: usize = 16;
383                ARCSTR_OVERHEAD + 128 // NAME is VARCHAR(128)
384            }
385            DataType::CharacterLargeObject => {
386                const ARCSTR_OVERHEAD: usize = 16;
387                ARCSTR_OVERHEAD + 1000 // Conservative estimate for CLOB
388            }
389
390            // Date/time types
391            DataType::Date => 4,             // i32 for days
392            DataType::Time { .. } => 8,      // i64 for nanoseconds
393            DataType::Timestamp { .. } => 8, // i64 for timestamp
394            DataType::Interval { .. } => 16, // IntervalValue struct
395
396            // Binary types
397            DataType::BinaryLargeObject => 1000, // Conservative estimate
398            DataType::Bit { length } => {
399                // Bits stored as bytes, rounded up
400                length.unwrap_or(1).div_ceil(8)
401            }
402
403            // Vector types
404            DataType::Vector { dimensions } => {
405                const VEC_OVERHEAD: usize = 24; // Vec header
406                VEC_OVERHEAD + (*dimensions as usize * 4) // f32 per dimension
407            }
408
409            // Special types
410            DataType::Null => 0,
411            DataType::UserDefined { .. } => 100, // Conservative estimate
412        };
413
414        ENUM_OVERHEAD + value_size
415    }
416
417    /// Returns the SQLite type affinity for this data type.
418    ///
419    /// SQLite determines affinity based on the declared type name:
420    /// 1. If the type contains "INT" → INTEGER affinity
421    /// 2. If the type contains "CHAR", "CLOB", or "TEXT" → TEXT affinity
422    /// 3. If the type contains "BLOB" or has no type → BLOB/NONE affinity
423    /// 4. If the type contains "REAL", "FLOA", or "DOUB" → REAL affinity
424    /// 5. Otherwise → NUMERIC affinity
425    ///
426    /// See: https://www.sqlite.org/datatype3.html#type_affinity
427    pub fn sqlite_affinity(&self) -> TypeAffinity {
428        match self {
429            // Integer types → INTEGER affinity
430            DataType::Integer | DataType::Smallint | DataType::Bigint | DataType::Unsigned => {
431                TypeAffinity::Integer
432            }
433
434            // Character/Text types → TEXT affinity
435            DataType::Character { .. }
436            | DataType::Varchar { .. }
437            | DataType::CharacterLargeObject
438            | DataType::Name => TypeAffinity::Text,
439
440            // Floating-point types → REAL affinity
441            DataType::Real | DataType::Float { .. } | DataType::DoublePrecision => {
442                TypeAffinity::Real
443            }
444
445            // Decimal/Numeric → NUMERIC affinity
446            DataType::Decimal { .. } | DataType::Numeric { .. } => TypeAffinity::Numeric,
447
448            // Binary types → NONE/BLOB affinity
449            DataType::BinaryLargeObject | DataType::Bit { .. } => TypeAffinity::None,
450
451            // Boolean → NUMERIC affinity (SQLite stores as 0/1)
452            DataType::Boolean => TypeAffinity::Numeric,
453
454            // Temporal types → NUMERIC affinity (SQLite stores as numbers or text)
455            DataType::Date | DataType::Time { .. } | DataType::Timestamp { .. } => {
456                TypeAffinity::Numeric
457            }
458
459            // Interval → NUMERIC affinity
460            DataType::Interval { .. } => TypeAffinity::Numeric,
461
462            // Vector → NONE affinity (custom type)
463            DataType::Vector { .. } => TypeAffinity::None,
464
465            // NULL → NONE affinity
466            DataType::Null => TypeAffinity::None,
467
468            // User-defined types: Apply SQLite's affinity rules based on type name.
469            // SQLite determines affinity by checking if the type name contains:
470            // 1. "INT" → INTEGER affinity
471            // 2. "CHAR", "CLOB", or "TEXT" → TEXT affinity
472            // 3. "BLOB" or no type → NONE/BLOB affinity
473            // 4. "REAL", "FLOA", or "DOUB" → REAL affinity
474            // 5. Otherwise → NUMERIC affinity
475            //
476            // This handles multi-word types like "LARGE BLOB", "NATIVE CHARACTER",
477            // "VARYING CHARACTER", "UNSIGNED BIG INT", etc.
478            DataType::UserDefined { type_name } => {
479                let upper = type_name.to_uppercase();
480                if upper.contains("INT") {
481                    TypeAffinity::Integer
482                } else if upper.contains("CHAR") || upper.contains("CLOB") || upper.contains("TEXT")
483                {
484                    TypeAffinity::Text
485                } else if upper.contains("BLOB") {
486                    TypeAffinity::None
487                } else if upper.contains("REAL")
488                    || upper.contains("FLOA")
489                    || upper.contains("DOUB")
490                {
491                    TypeAffinity::Real
492                } else {
493                    TypeAffinity::Numeric
494                }
495            }
496        }
497    }
498}