Skip to main content

sql_splitter/duckdb/
types.rs

1//! Type conversion from MySQL/PostgreSQL/SQLite/MSSQL types to DuckDB types.
2
3use once_cell::sync::Lazy;
4use regex::Regex;
5
6/// DuckDB native types
7#[derive(Debug, Clone, PartialEq)]
8#[allow(dead_code)]
9pub enum DuckDBType {
10    Boolean,
11    TinyInt,
12    SmallInt,
13    Integer,
14    BigInt,
15    Float,
16    Double,
17    Decimal(Option<u8>, Option<u8>),
18    Varchar(Option<u32>),
19    Text,
20    Blob,
21    Date,
22    Time,
23    Timestamp,
24    Interval,
25    Uuid,
26    Json,
27}
28
29impl std::fmt::Display for DuckDBType {
30    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
31        match self {
32            DuckDBType::Boolean => write!(f, "BOOLEAN"),
33            DuckDBType::TinyInt => write!(f, "TINYINT"),
34            DuckDBType::SmallInt => write!(f, "SMALLINT"),
35            DuckDBType::Integer => write!(f, "INTEGER"),
36            DuckDBType::BigInt => write!(f, "BIGINT"),
37            DuckDBType::Float => write!(f, "FLOAT"),
38            DuckDBType::Double => write!(f, "DOUBLE"),
39            DuckDBType::Decimal(p, s) => match (p, s) {
40                (Some(p), Some(s)) => write!(f, "DECIMAL({}, {})", p, s),
41                (Some(p), None) => write!(f, "DECIMAL({})", p),
42                _ => write!(f, "DECIMAL"),
43            },
44            DuckDBType::Varchar(len) => match len {
45                Some(n) => write!(f, "VARCHAR({})", n),
46                None => write!(f, "VARCHAR"),
47            },
48            DuckDBType::Text => write!(f, "TEXT"),
49            DuckDBType::Blob => write!(f, "BLOB"),
50            DuckDBType::Date => write!(f, "DATE"),
51            DuckDBType::Time => write!(f, "TIME"),
52            DuckDBType::Timestamp => write!(f, "TIMESTAMP"),
53            DuckDBType::Interval => write!(f, "INTERVAL"),
54            DuckDBType::Uuid => write!(f, "UUID"),
55            DuckDBType::Json => write!(f, "JSON"),
56        }
57    }
58}
59
60/// Converter for SQL types to DuckDB-compatible types
61pub struct TypeConverter;
62
63impl TypeConverter {
64    /// Convert a SQL type string to a DuckDB-compatible type string
65    pub fn convert(type_str: &str) -> String {
66        let upper = type_str.to_uppercase();
67        let trimmed = upper.trim();
68
69        // Handle common MySQL types
70        match trimmed {
71            // Boolean
72            "BOOL" | "BOOLEAN" => "BOOLEAN".to_string(),
73            "TINYINT(1)" => "BOOLEAN".to_string(),
74
75            // Integer types
76            "TINYINT" => "TINYINT".to_string(),
77            "SMALLINT" => "SMALLINT".to_string(),
78            "MEDIUMINT" => "INTEGER".to_string(),
79            "INT" | "INTEGER" => "INTEGER".to_string(),
80            "BIGINT" => "BIGINT".to_string(),
81
82            // MySQL UNSIGNED variants - DuckDB has unsigned types
83            "TINYINT UNSIGNED" => "UTINYINT".to_string(),
84            "SMALLINT UNSIGNED" => "USMALLINT".to_string(),
85            "MEDIUMINT UNSIGNED" => "UINTEGER".to_string(),
86            "INT UNSIGNED" | "INTEGER UNSIGNED" => "UINTEGER".to_string(),
87            "BIGINT UNSIGNED" => "UBIGINT".to_string(),
88
89            // PostgreSQL serial types
90            "SERIAL" => "INTEGER".to_string(),
91            "BIGSERIAL" => "BIGINT".to_string(),
92            "SMALLSERIAL" => "SMALLINT".to_string(),
93
94            // Floating point
95            "FLOAT" | "FLOAT4" | "REAL" => "FLOAT".to_string(),
96            "DOUBLE" | "DOUBLE PRECISION" | "FLOAT8" => "DOUBLE".to_string(),
97
98            // Decimal/Numeric
99            "DECIMAL" | "NUMERIC" | "DEC" | "FIXED" => "DECIMAL".to_string(),
100            "MONEY" => "DECIMAL(19,4)".to_string(),
101
102            // String types
103            "CHAR" => "VARCHAR".to_string(),
104            "VARCHAR" | "CHARACTER VARYING" => "VARCHAR".to_string(),
105            "TINYTEXT" => "VARCHAR(255)".to_string(),
106            "TEXT" | "MEDIUMTEXT" | "LONGTEXT" => "TEXT".to_string(),
107
108            // Binary types
109            "BINARY" | "VARBINARY" => "BLOB".to_string(),
110            "TINYBLOB" | "BLOB" | "MEDIUMBLOB" | "LONGBLOB" => "BLOB".to_string(),
111            "BYTEA" => "BLOB".to_string(),
112
113            // Date/Time types
114            "DATE" => "DATE".to_string(),
115            "TIME" | "TIME WITHOUT TIME ZONE" => "TIME".to_string(),
116            "TIMETZ" | "TIME WITH TIME ZONE" => "TIMETZ".to_string(),
117            "DATETIME" => "TIMESTAMP".to_string(),
118            "TIMESTAMP" | "TIMESTAMP WITHOUT TIME ZONE" => "TIMESTAMP".to_string(),
119            "TIMESTAMPTZ" | "TIMESTAMP WITH TIME ZONE" => "TIMESTAMPTZ".to_string(),
120            "YEAR" => "SMALLINT".to_string(),
121            "INTERVAL" => "INTERVAL".to_string(),
122
123            // JSON
124            "JSON" | "JSONB" => "JSON".to_string(),
125
126            // UUID
127            "UUID" => "UUID".to_string(),
128
129            // Bit types
130            "BIT" => "BOOLEAN".to_string(),
131
132            // MSSQL-specific types
133            "NCHAR" => "VARCHAR".to_string(),
134            "NVARCHAR" => "VARCHAR".to_string(),
135            "NTEXT" => "TEXT".to_string(),
136            "DATETIME2" => "TIMESTAMP".to_string(),
137            "SMALLDATETIME" => "TIMESTAMP".to_string(),
138            "DATETIMEOFFSET" => "TIMESTAMPTZ".to_string(),
139            "UNIQUEIDENTIFIER" => "UUID".to_string(),
140            "IMAGE" => "BLOB".to_string(),
141            "ROWVERSION" => "BLOB".to_string(),
142            "SMALLMONEY" => "DECIMAL(10,4)".to_string(),
143            "SQL_VARIANT" => "VARCHAR".to_string(),
144            "XML" => "TEXT".to_string(),
145            "HIERARCHYID" => "VARCHAR".to_string(),
146            "GEOGRAPHY" => "BLOB".to_string(),
147            "GEOMETRY" => "BLOB".to_string(),
148
149            _ => Self::convert_parameterized(trimmed),
150        }
151    }
152
153    /// Handle parameterized types like VARCHAR(255), DECIMAL(10,2), etc.
154    fn convert_parameterized(type_str: &str) -> String {
155        // Pattern for types with parameters
156        static RE_PARAMETERIZED: Lazy<Regex> =
157            Lazy::new(|| Regex::new(r"^(\w+)\s*\(([^)]+)\)(.*)$").unwrap());
158
159        if let Some(caps) = RE_PARAMETERIZED.captures(type_str) {
160            let base_type = caps.get(1).map(|m| m.as_str()).unwrap_or("");
161            let params = caps.get(2).map(|m| m.as_str()).unwrap_or("");
162            let suffix = caps.get(3).map(|m| m.as_str()).unwrap_or("").trim();
163
164            let converted_base = match base_type {
165                // Integer types with display width - strip the width
166                "TINYINT" => {
167                    if params == "1" {
168                        return "BOOLEAN".to_string();
169                    }
170                    if suffix.contains("UNSIGNED") {
171                        "UTINYINT"
172                    } else {
173                        "TINYINT"
174                    }
175                }
176                "SMALLINT" => {
177                    if suffix.contains("UNSIGNED") {
178                        "USMALLINT"
179                    } else {
180                        "SMALLINT"
181                    }
182                }
183                "MEDIUMINT" => {
184                    if suffix.contains("UNSIGNED") {
185                        "UINTEGER"
186                    } else {
187                        "INTEGER"
188                    }
189                }
190                "INT" | "INTEGER" => {
191                    if suffix.contains("UNSIGNED") {
192                        "UINTEGER"
193                    } else {
194                        "INTEGER"
195                    }
196                }
197                "BIGINT" => {
198                    if suffix.contains("UNSIGNED") {
199                        "UBIGINT"
200                    } else {
201                        "BIGINT"
202                    }
203                }
204
205                // String types - preserve length
206                "CHAR" | "CHARACTER" => {
207                    return format!("VARCHAR({})", params);
208                }
209                "VARCHAR" | "CHARACTER VARYING" => {
210                    // Handle MSSQL VARCHAR(MAX)
211                    if params.to_uppercase() == "MAX" {
212                        return "TEXT".to_string();
213                    }
214                    return format!("VARCHAR({})", params);
215                }
216                "BINARY" | "VARBINARY" => {
217                    // Handle MSSQL VARBINARY(MAX)
218                    if params.to_uppercase() == "MAX" {
219                        return "BLOB".to_string();
220                    }
221                    return "BLOB".to_string();
222                }
223
224                // Decimal types - preserve precision and scale
225                "DECIMAL" | "NUMERIC" | "DEC" | "FIXED" => {
226                    return format!("DECIMAL({})", params);
227                }
228
229                // Float types with precision
230                "FLOAT" => {
231                    if let Ok(precision) = params.parse::<u32>() {
232                        if precision <= 24 {
233                            return "FLOAT".to_string();
234                        } else {
235                            return "DOUBLE".to_string();
236                        }
237                    }
238                    return "FLOAT".to_string();
239                }
240                "DOUBLE" => return "DOUBLE".to_string(),
241
242                // Time types with precision - DuckDB supports them
243                "TIME" => return "TIME".to_string(),
244                "TIMESTAMP" | "DATETIME" => return "TIMESTAMP".to_string(),
245
246                // BIT fields
247                "BIT" => {
248                    if params == "1" {
249                        return "BOOLEAN".to_string();
250                    }
251                    return "BITSTRING".to_string();
252                }
253
254                // ENUM - convert to VARCHAR with comment
255                "ENUM" => return "VARCHAR".to_string(),
256
257                // SET - convert to VARCHAR
258                "SET" => return "VARCHAR".to_string(),
259
260                // MSSQL NVARCHAR/NCHAR - treat like VARCHAR
261                "NVARCHAR" => {
262                    if params.to_uppercase() == "MAX" {
263                        return "TEXT".to_string();
264                    }
265                    return format!("VARCHAR({})", params);
266                }
267                "NCHAR" => return format!("VARCHAR({})", params),
268
269                // MSSQL DATETIME2(precision)
270                "DATETIME2" => return "TIMESTAMP".to_string(),
271
272                _ => return type_str.to_string(),
273            };
274
275            return converted_base.to_string();
276        }
277
278        // Unknown type - pass through as-is
279        type_str.to_string()
280    }
281}
282
283#[cfg(test)]
284mod tests {
285    use super::*;
286
287    #[test]
288    fn test_basic_type_conversion() {
289        assert_eq!(TypeConverter::convert("INT"), "INTEGER");
290        assert_eq!(TypeConverter::convert("BIGINT"), "BIGINT");
291        assert_eq!(TypeConverter::convert("VARCHAR"), "VARCHAR");
292        assert_eq!(TypeConverter::convert("TEXT"), "TEXT");
293        assert_eq!(TypeConverter::convert("DATETIME"), "TIMESTAMP");
294        assert_eq!(TypeConverter::convert("BYTEA"), "BLOB");
295    }
296
297    #[test]
298    fn test_parameterized_types() {
299        assert_eq!(TypeConverter::convert("VARCHAR(255)"), "VARCHAR(255)");
300        assert_eq!(TypeConverter::convert("DECIMAL(10,2)"), "DECIMAL(10,2)");
301        assert_eq!(TypeConverter::convert("CHAR(1)"), "VARCHAR(1)");
302        assert_eq!(TypeConverter::convert("TINYINT(1)"), "BOOLEAN");
303    }
304
305    #[test]
306    fn test_unsigned_types() {
307        assert_eq!(TypeConverter::convert("INT UNSIGNED"), "UINTEGER");
308        assert_eq!(TypeConverter::convert("BIGINT UNSIGNED"), "UBIGINT");
309        assert_eq!(TypeConverter::convert("TINYINT(3) UNSIGNED"), "UTINYINT");
310    }
311
312    #[test]
313    fn test_mysql_specific() {
314        assert_eq!(TypeConverter::convert("MEDIUMINT"), "INTEGER");
315        assert_eq!(TypeConverter::convert("LONGTEXT"), "TEXT");
316        assert_eq!(TypeConverter::convert("MEDIUMBLOB"), "BLOB");
317        assert_eq!(TypeConverter::convert("YEAR"), "SMALLINT");
318    }
319
320    #[test]
321    fn test_postgres_specific() {
322        assert_eq!(TypeConverter::convert("SERIAL"), "INTEGER");
323        assert_eq!(TypeConverter::convert("BIGSERIAL"), "BIGINT");
324        assert_eq!(TypeConverter::convert("JSONB"), "JSON");
325        assert_eq!(TypeConverter::convert("UUID"), "UUID");
326    }
327
328    #[test]
329    fn test_enum_set() {
330        assert_eq!(TypeConverter::convert("ENUM('a','b','c')"), "VARCHAR");
331        assert_eq!(TypeConverter::convert("SET('x','y')"), "VARCHAR");
332    }
333}