Skip to main content

sql_splitter/duckdb/
types.rs

1//! Type conversion from MySQL/PostgreSQL/SQLite/MSSQL types to DuckDB types.
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5
6/// DuckDB native types
7#[derive(Debug, Clone, PartialEq)]
8pub enum DuckDBType {
9    Boolean,
10    TinyInt,
11    SmallInt,
12    Integer,
13    BigInt,
14    Float,
15    Double,
16    Decimal(Option<u8>, Option<u8>),
17    Varchar(Option<u32>),
18    Text,
19    Blob,
20    Date,
21    Time,
22    Timestamp,
23    Interval,
24    Uuid,
25    Json,
26}
27
28impl std::fmt::Display for DuckDBType {
29    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
30        match self {
31            DuckDBType::Boolean => write!(f, "BOOLEAN"),
32            DuckDBType::TinyInt => write!(f, "TINYINT"),
33            DuckDBType::SmallInt => write!(f, "SMALLINT"),
34            DuckDBType::Integer => write!(f, "INTEGER"),
35            DuckDBType::BigInt => write!(f, "BIGINT"),
36            DuckDBType::Float => write!(f, "FLOAT"),
37            DuckDBType::Double => write!(f, "DOUBLE"),
38            DuckDBType::Decimal(p, s) => match (p, s) {
39                (Some(p), Some(s)) => write!(f, "DECIMAL({}, {})", p, s),
40                (Some(p), None) => write!(f, "DECIMAL({})", p),
41                _ => write!(f, "DECIMAL"),
42            },
43            DuckDBType::Varchar(len) => match len {
44                Some(n) => write!(f, "VARCHAR({})", n),
45                None => write!(f, "VARCHAR"),
46            },
47            DuckDBType::Text => write!(f, "TEXT"),
48            DuckDBType::Blob => write!(f, "BLOB"),
49            DuckDBType::Date => write!(f, "DATE"),
50            DuckDBType::Time => write!(f, "TIME"),
51            DuckDBType::Timestamp => write!(f, "TIMESTAMP"),
52            DuckDBType::Interval => write!(f, "INTERVAL"),
53            DuckDBType::Uuid => write!(f, "UUID"),
54            DuckDBType::Json => write!(f, "JSON"),
55        }
56    }
57}
58
59/// Converter for SQL types to DuckDB-compatible types
60pub struct TypeConverter;
61
62impl TypeConverter {
63    /// Convert a SQL type string to a DuckDB-compatible type string
64    pub fn convert(type_str: &str) -> String {
65        let upper = type_str.to_uppercase();
66        let trimmed = upper.trim();
67
68        // Handle common MySQL types
69        match trimmed {
70            // Boolean
71            "BOOL" | "BOOLEAN" => "BOOLEAN".to_string(),
72            "TINYINT(1)" => "BOOLEAN".to_string(),
73
74            // Integer types
75            "TINYINT" => "TINYINT".to_string(),
76            "SMALLINT" => "SMALLINT".to_string(),
77            "MEDIUMINT" => "INTEGER".to_string(),
78            "INT" | "INTEGER" => "INTEGER".to_string(),
79            "BIGINT" => "BIGINT".to_string(),
80
81            // MySQL UNSIGNED variants - DuckDB has unsigned types
82            "TINYINT UNSIGNED" => "UTINYINT".to_string(),
83            "SMALLINT UNSIGNED" => "USMALLINT".to_string(),
84            "MEDIUMINT UNSIGNED" => "UINTEGER".to_string(),
85            "INT UNSIGNED" | "INTEGER UNSIGNED" => "UINTEGER".to_string(),
86            "BIGINT UNSIGNED" => "UBIGINT".to_string(),
87
88            // PostgreSQL serial types
89            "SERIAL" => "INTEGER".to_string(),
90            "BIGSERIAL" => "BIGINT".to_string(),
91            "SMALLSERIAL" => "SMALLINT".to_string(),
92
93            // Floating point
94            "FLOAT" | "FLOAT4" | "REAL" => "FLOAT".to_string(),
95            "DOUBLE" | "DOUBLE PRECISION" | "FLOAT8" => "DOUBLE".to_string(),
96
97            // Decimal/Numeric
98            "DECIMAL" | "NUMERIC" | "DEC" | "FIXED" => "DECIMAL".to_string(),
99            "MONEY" => "DECIMAL(19,4)".to_string(),
100
101            // String types
102            "CHAR" => "VARCHAR".to_string(),
103            "VARCHAR" | "CHARACTER VARYING" => "VARCHAR".to_string(),
104            "TINYTEXT" => "VARCHAR(255)".to_string(),
105            "TEXT" | "MEDIUMTEXT" | "LONGTEXT" => "TEXT".to_string(),
106
107            // Binary types
108            "BINARY" | "VARBINARY" => "BLOB".to_string(),
109            "TINYBLOB" | "BLOB" | "MEDIUMBLOB" | "LONGBLOB" => "BLOB".to_string(),
110            "BYTEA" => "BLOB".to_string(),
111
112            // Date/Time types
113            "DATE" => "DATE".to_string(),
114            "TIME" | "TIME WITHOUT TIME ZONE" => "TIME".to_string(),
115            "TIMETZ" | "TIME WITH TIME ZONE" => "TIMETZ".to_string(),
116            "DATETIME" => "TIMESTAMP".to_string(),
117            "TIMESTAMP" | "TIMESTAMP WITHOUT TIME ZONE" => "TIMESTAMP".to_string(),
118            "TIMESTAMPTZ" | "TIMESTAMP WITH TIME ZONE" => "TIMESTAMPTZ".to_string(),
119            "YEAR" => "SMALLINT".to_string(),
120            "INTERVAL" => "INTERVAL".to_string(),
121
122            // JSON
123            "JSON" | "JSONB" => "JSON".to_string(),
124
125            // UUID
126            "UUID" => "UUID".to_string(),
127
128            // Bit types
129            "BIT" => "BOOLEAN".to_string(),
130
131            // MSSQL-specific types
132            "NCHAR" => "VARCHAR".to_string(),
133            "NVARCHAR" => "VARCHAR".to_string(),
134            "NTEXT" => "TEXT".to_string(),
135            "DATETIME2" => "TIMESTAMP".to_string(),
136            "SMALLDATETIME" => "TIMESTAMP".to_string(),
137            "DATETIMEOFFSET" => "TIMESTAMPTZ".to_string(),
138            "UNIQUEIDENTIFIER" => "UUID".to_string(),
139            "IMAGE" => "BLOB".to_string(),
140            "ROWVERSION" => "BLOB".to_string(),
141            "SMALLMONEY" => "DECIMAL(10,4)".to_string(),
142            "SQL_VARIANT" => "VARCHAR".to_string(),
143            "XML" => "TEXT".to_string(),
144            "HIERARCHYID" => "VARCHAR".to_string(),
145            "GEOGRAPHY" => "BLOB".to_string(),
146            "GEOMETRY" => "BLOB".to_string(),
147
148            _ => Self::convert_parameterized(trimmed),
149        }
150    }
151
152    /// Handle parameterized types like VARCHAR(255), DECIMAL(10,2), etc.
153    fn convert_parameterized(type_str: &str) -> String {
154        // Pattern for types with parameters
155        static RE_PARAMETERIZED: Lazy<Regex> =
156            Lazy::new(|| Regex::new(r"^(\w+)\s*\(([^)]+)\)(.*)$").unwrap());
157
158        if let Some(caps) = RE_PARAMETERIZED.captures(type_str) {
159            let base_type = caps.get(1).map(|m| m.as_str()).unwrap_or("");
160            let params = caps.get(2).map(|m| m.as_str()).unwrap_or("");
161            let suffix = caps.get(3).map(|m| m.as_str()).unwrap_or("").trim();
162
163            let converted_base = match base_type {
164                // Integer types with display width - strip the width
165                "TINYINT" => {
166                    if params == "1" {
167                        return "BOOLEAN".to_string();
168                    }
169                    if suffix.contains("UNSIGNED") {
170                        "UTINYINT"
171                    } else {
172                        "TINYINT"
173                    }
174                }
175                "SMALLINT" => {
176                    if suffix.contains("UNSIGNED") {
177                        "USMALLINT"
178                    } else {
179                        "SMALLINT"
180                    }
181                }
182                "MEDIUMINT" => {
183                    if suffix.contains("UNSIGNED") {
184                        "UINTEGER"
185                    } else {
186                        "INTEGER"
187                    }
188                }
189                "INT" | "INTEGER" => {
190                    if suffix.contains("UNSIGNED") {
191                        "UINTEGER"
192                    } else {
193                        "INTEGER"
194                    }
195                }
196                "BIGINT" => {
197                    if suffix.contains("UNSIGNED") {
198                        "UBIGINT"
199                    } else {
200                        "BIGINT"
201                    }
202                }
203
204                // String types - preserve length
205                "CHAR" | "CHARACTER" => {
206                    return format!("VARCHAR({})", params);
207                }
208                "VARCHAR" | "CHARACTER VARYING" => {
209                    // Handle MSSQL VARCHAR(MAX)
210                    if params.to_uppercase() == "MAX" {
211                        return "TEXT".to_string();
212                    }
213                    return format!("VARCHAR({})", params);
214                }
215                "BINARY" | "VARBINARY" => {
216                    // Handle MSSQL VARBINARY(MAX)
217                    if params.to_uppercase() == "MAX" {
218                        return "BLOB".to_string();
219                    }
220                    return "BLOB".to_string();
221                }
222
223                // Decimal types - preserve precision and scale
224                "DECIMAL" | "NUMERIC" | "DEC" | "FIXED" => {
225                    return format!("DECIMAL({})", params);
226                }
227
228                // Float types with precision
229                "FLOAT" => {
230                    if let Ok(precision) = params.parse::<u32>() {
231                        if precision <= 24 {
232                            return "FLOAT".to_string();
233                        } else {
234                            return "DOUBLE".to_string();
235                        }
236                    }
237                    return "FLOAT".to_string();
238                }
239                "DOUBLE" => return "DOUBLE".to_string(),
240
241                // Time types with precision - DuckDB supports them
242                "TIME" => return "TIME".to_string(),
243                "TIMESTAMP" | "DATETIME" => return "TIMESTAMP".to_string(),
244
245                // BIT fields
246                "BIT" => {
247                    if params == "1" {
248                        return "BOOLEAN".to_string();
249                    }
250                    return "BITSTRING".to_string();
251                }
252
253                // ENUM - convert to VARCHAR with comment
254                "ENUM" => return "VARCHAR".to_string(),
255
256                // SET - convert to VARCHAR
257                "SET" => return "VARCHAR".to_string(),
258
259                // MSSQL NVARCHAR/NCHAR - treat like VARCHAR
260                "NVARCHAR" => {
261                    if params.to_uppercase() == "MAX" {
262                        return "TEXT".to_string();
263                    }
264                    return format!("VARCHAR({})", params);
265                }
266                "NCHAR" => return format!("VARCHAR({})", params),
267
268                // MSSQL DATETIME2(precision)
269                "DATETIME2" => return "TIMESTAMP".to_string(),
270
271                _ => return type_str.to_string(),
272            };
273
274            return converted_base.to_string();
275        }
276
277        // Unknown type - pass through as-is
278        type_str.to_string()
279    }
280
281    /// Convert an entire column definition
282    pub fn convert_column_def(column_def: &str) -> String {
283        // Handle AUTO_INCREMENT
284        let mut result = column_def.to_string();
285
286        // Replace type with converted type
287        static RE_TYPE: Lazy<Regex> = Lazy::new(|| {
288            Regex::new(r"(?i)^(\s*`?[^`\s]+`?\s+)([A-Z][A-Z0-9_\s(),']+?)(\s+|$)").unwrap()
289        });
290
291        if let Some(caps) = RE_TYPE.captures(&result) {
292            if let Some(type_match) = caps.get(2) {
293                let original_type = type_match.as_str().trim();
294                let converted_type = Self::convert(original_type);
295                result = result.replacen(original_type, &converted_type, 1);
296            }
297        }
298
299        // Remove AUTO_INCREMENT (DuckDB handles this differently)
300        result = result.replace("AUTO_INCREMENT", "");
301        result = result.replace("auto_increment", "");
302
303        // Remove UNSIGNED (already handled in type conversion)
304        result = result.replace(" UNSIGNED", "");
305        result = result.replace(" unsigned", "");
306
307        // Remove ZEROFILL
308        result = result.replace(" ZEROFILL", "");
309        result = result.replace(" zerofill", "");
310
311        // Remove ON UPDATE CURRENT_TIMESTAMP
312        static RE_ON_UPDATE: Lazy<Regex> =
313            Lazy::new(|| Regex::new(r"(?i)\s*ON\s+UPDATE\s+CURRENT_TIMESTAMP").unwrap());
314        result = RE_ON_UPDATE.replace_all(&result, "").to_string();
315
316        // Remove CHARACTER SET
317        static RE_CHARSET: Lazy<Regex> =
318            Lazy::new(|| Regex::new(r"(?i)\s*CHARACTER\s+SET\s+\w+").unwrap());
319        result = RE_CHARSET.replace_all(&result, "").to_string();
320
321        // Remove COLLATE
322        static RE_COLLATE: Lazy<Regex> = Lazy::new(|| Regex::new(r"(?i)\s*COLLATE\s+\w+").unwrap());
323        result = RE_COLLATE.replace_all(&result, "").to_string();
324
325        // Clean up multiple spaces
326        static RE_SPACES: Lazy<Regex> = Lazy::new(|| Regex::new(r"\s+").unwrap());
327        result = RE_SPACES.replace_all(&result, " ").trim().to_string();
328
329        result
330    }
331}
332
333#[cfg(test)]
334mod tests {
335    use super::*;
336
337    #[test]
338    fn test_basic_type_conversion() {
339        assert_eq!(TypeConverter::convert("INT"), "INTEGER");
340        assert_eq!(TypeConverter::convert("BIGINT"), "BIGINT");
341        assert_eq!(TypeConverter::convert("VARCHAR"), "VARCHAR");
342        assert_eq!(TypeConverter::convert("TEXT"), "TEXT");
343        assert_eq!(TypeConverter::convert("DATETIME"), "TIMESTAMP");
344        assert_eq!(TypeConverter::convert("BYTEA"), "BLOB");
345    }
346
347    #[test]
348    fn test_parameterized_types() {
349        assert_eq!(TypeConverter::convert("VARCHAR(255)"), "VARCHAR(255)");
350        assert_eq!(TypeConverter::convert("DECIMAL(10,2)"), "DECIMAL(10,2)");
351        assert_eq!(TypeConverter::convert("CHAR(1)"), "VARCHAR(1)");
352        assert_eq!(TypeConverter::convert("TINYINT(1)"), "BOOLEAN");
353    }
354
355    #[test]
356    fn test_unsigned_types() {
357        assert_eq!(TypeConverter::convert("INT UNSIGNED"), "UINTEGER");
358        assert_eq!(TypeConverter::convert("BIGINT UNSIGNED"), "UBIGINT");
359        assert_eq!(TypeConverter::convert("TINYINT(3) UNSIGNED"), "UTINYINT");
360    }
361
362    #[test]
363    fn test_mysql_specific() {
364        assert_eq!(TypeConverter::convert("MEDIUMINT"), "INTEGER");
365        assert_eq!(TypeConverter::convert("LONGTEXT"), "TEXT");
366        assert_eq!(TypeConverter::convert("MEDIUMBLOB"), "BLOB");
367        assert_eq!(TypeConverter::convert("YEAR"), "SMALLINT");
368    }
369
370    #[test]
371    fn test_postgres_specific() {
372        assert_eq!(TypeConverter::convert("SERIAL"), "INTEGER");
373        assert_eq!(TypeConverter::convert("BIGSERIAL"), "BIGINT");
374        assert_eq!(TypeConverter::convert("JSONB"), "JSON");
375        assert_eq!(TypeConverter::convert("UUID"), "UUID");
376    }
377
378    #[test]
379    fn test_enum_set() {
380        assert_eq!(TypeConverter::convert("ENUM('a','b','c')"), "VARCHAR");
381        assert_eq!(TypeConverter::convert("SET('x','y')"), "VARCHAR");
382    }
383}