sql_cli/sql/generators/
string_generators.rs

1use crate::data::datatable::{DataColumn, DataRow, DataTable, DataType, DataValue};
2use crate::sql::generators::TableGenerator;
3use anyhow::{anyhow, Result};
4use std::collections::HashMap;
5use std::sync::Arc;
6
7/// SPLIT - Split a string into rows based on delimiter
8pub struct Split;
9
10impl TableGenerator for Split {
11    fn name(&self) -> &str {
12        "SPLIT"
13    }
14
15    fn columns(&self) -> Vec<DataColumn> {
16        vec![
17            DataColumn {
18                name: "value".to_string(),
19                data_type: DataType::String,
20                nullable: false,
21                unique_values: Some(0),
22                null_count: 0,
23                metadata: HashMap::new(),
24                qualified_name: None,
25                source_table: None,
26            },
27            DataColumn {
28                name: "index".to_string(),
29                data_type: DataType::Integer,
30                nullable: false,
31                unique_values: Some(0),
32                null_count: 0,
33                metadata: HashMap::new(),
34                qualified_name: None,
35                source_table: None,
36            },
37        ]
38    }
39
40    fn generate(&self, args: Vec<DataValue>) -> Result<Arc<DataTable>> {
41        if args.is_empty() {
42            return Err(anyhow!(
43                "SPLIT requires at least 1 argument (text to split)"
44            ));
45        }
46
47        // Get text to split
48        let text = match &args[0] {
49            DataValue::String(s) => s.clone(),
50            DataValue::Null => return Err(anyhow!("SPLIT text cannot be NULL")),
51            other => other.to_string(),
52        };
53
54        // Get delimiter (default to space)
55        let delimiter = if args.len() > 1 {
56            match &args[1] {
57                DataValue::String(s) => s.clone(),
58                DataValue::Null => " ".to_string(),
59                other => other.to_string(),
60            }
61        } else {
62            " ".to_string()
63        };
64
65        let mut table = DataTable::new("split");
66        table.add_column(DataColumn::new("value"));
67        table.add_column(DataColumn::new("index"));
68
69        // Split the string and create rows
70        if delimiter.is_empty() {
71            // Split into individual characters
72            for (idx, ch) in text.chars().enumerate() {
73                table
74                    .add_row(DataRow::new(vec![
75                        DataValue::String(ch.to_string()),
76                        DataValue::Integer((idx + 1) as i64),
77                    ]))
78                    .map_err(|e| anyhow!(e))?;
79            }
80        } else {
81            // Split by delimiter
82            for (idx, part) in text.split(&delimiter).enumerate() {
83                // Skip empty parts
84                if part.is_empty() {
85                    continue;
86                }
87
88                table
89                    .add_row(DataRow::new(vec![
90                        DataValue::String(part.to_string()),
91                        DataValue::Integer((idx + 1) as i64),
92                    ]))
93                    .map_err(|e| anyhow!(e))?;
94            }
95        }
96
97        Ok(Arc::new(table))
98    }
99
100    fn description(&self) -> &str {
101        "Split a string into rows based on delimiter"
102    }
103
104    fn arg_count(&self) -> usize {
105        2 // text and optional delimiter
106    }
107}
108
109/// TOKENIZE - Extract words/tokens from text (similar to SPLIT but with normalization)
110pub struct Tokenize;
111
112impl TableGenerator for Tokenize {
113    fn name(&self) -> &str {
114        "TOKENIZE"
115    }
116
117    fn columns(&self) -> Vec<DataColumn> {
118        vec![
119            DataColumn {
120                name: "token".to_string(),
121                data_type: DataType::String,
122                nullable: false,
123                unique_values: Some(0),
124                null_count: 0,
125                metadata: HashMap::new(),
126                qualified_name: None,
127                source_table: None,
128            },
129            DataColumn {
130                name: "position".to_string(),
131                data_type: DataType::Integer,
132                nullable: false,
133                unique_values: Some(0),
134                null_count: 0,
135                metadata: HashMap::new(),
136                qualified_name: None,
137                source_table: None,
138            },
139        ]
140    }
141
142    fn generate(&self, args: Vec<DataValue>) -> Result<Arc<DataTable>> {
143        if args.is_empty() {
144            return Err(anyhow!(
145                "TOKENIZE requires at least 1 argument (text to tokenize)"
146            ));
147        }
148
149        // Get text to tokenize
150        let text = match &args[0] {
151            DataValue::String(s) => s.clone(),
152            DataValue::Null => return Err(anyhow!("TOKENIZE text cannot be NULL")),
153            other => other.to_string(),
154        };
155
156        // Get case option (default to preserve case)
157        let case_option = if args.len() > 1 {
158            match &args[1] {
159                DataValue::String(s) => s.to_lowercase(),
160                _ => "preserve".to_string(),
161            }
162        } else {
163            "preserve".to_string()
164        };
165
166        let mut table = DataTable::new("tokenize");
167        table.add_column(DataColumn::new("token"));
168        table.add_column(DataColumn::new("position"));
169
170        // Tokenize by splitting on non-alphanumeric characters
171        let mut tokens = Vec::new();
172        let mut current_token = String::new();
173
174        for ch in text.chars() {
175            if ch.is_alphanumeric() {
176                current_token.push(ch);
177            } else if !current_token.is_empty() {
178                tokens.push(current_token.clone());
179                current_token.clear();
180            }
181        }
182
183        // Don't forget the last token
184        if !current_token.is_empty() {
185            tokens.push(current_token);
186        }
187
188        // Apply case transformation
189        let tokens = match case_option.as_str() {
190            "lower" | "lowercase" => tokens.iter().map(|t| t.to_lowercase()).collect(),
191            "upper" | "uppercase" => tokens.iter().map(|t| t.to_uppercase()).collect(),
192            _ => tokens,
193        };
194
195        // Create rows
196        for (idx, token) in tokens.iter().enumerate() {
197            table
198                .add_row(DataRow::new(vec![
199                    DataValue::String(token.clone()),
200                    DataValue::Integer((idx + 1) as i64),
201                ]))
202                .map_err(|e| anyhow!(e))?;
203        }
204
205        Ok(Arc::new(table))
206    }
207
208    fn description(&self) -> &str {
209        "Extract alphanumeric tokens from text"
210    }
211
212    fn arg_count(&self) -> usize {
213        2 // text and optional case option
214    }
215}
216
217/// CHARS - Split string into individual characters
218pub struct Chars;
219
220impl TableGenerator for Chars {
221    fn name(&self) -> &str {
222        "CHARS"
223    }
224
225    fn columns(&self) -> Vec<DataColumn> {
226        vec![
227            DataColumn {
228                name: "char".to_string(),
229                data_type: DataType::String,
230                nullable: false,
231                unique_values: Some(0),
232                null_count: 0,
233                metadata: HashMap::new(),
234                qualified_name: None,
235                source_table: None,
236            },
237            DataColumn {
238                name: "position".to_string(),
239                data_type: DataType::Integer,
240                nullable: false,
241                unique_values: Some(0),
242                null_count: 0,
243                metadata: HashMap::new(),
244                qualified_name: None,
245                source_table: None,
246            },
247            DataColumn {
248                name: "ascii".to_string(),
249                data_type: DataType::Integer,
250                nullable: false,
251                unique_values: Some(0),
252                null_count: 0,
253                metadata: HashMap::new(),
254                qualified_name: None,
255                source_table: None,
256            },
257        ]
258    }
259
260    fn generate(&self, args: Vec<DataValue>) -> Result<Arc<DataTable>> {
261        if args.is_empty() {
262            return Err(anyhow!("CHARS requires 1 argument (text)"));
263        }
264
265        // Get text
266        let text = match &args[0] {
267            DataValue::String(s) => s.clone(),
268            DataValue::Null => return Err(anyhow!("CHARS text cannot be NULL")),
269            other => other.to_string(),
270        };
271
272        let mut table = DataTable::new("chars");
273        table.add_column(DataColumn::new("char"));
274        table.add_column(DataColumn::new("position"));
275        table.add_column(DataColumn::new("ascii"));
276
277        // Create a row for each character
278        for (idx, ch) in text.chars().enumerate() {
279            table
280                .add_row(DataRow::new(vec![
281                    DataValue::String(ch.to_string()),
282                    DataValue::Integer((idx + 1) as i64),
283                    DataValue::Integer(ch as i64),
284                ]))
285                .map_err(|e| anyhow!(e))?;
286        }
287
288        Ok(Arc::new(table))
289    }
290
291    fn description(&self) -> &str {
292        "Split string into individual characters with ASCII codes"
293    }
294
295    fn arg_count(&self) -> usize {
296        1
297    }
298}
299
300/// LINES - Split text into lines
301pub struct Lines;
302
303impl TableGenerator for Lines {
304    fn name(&self) -> &str {
305        "LINES"
306    }
307
308    fn columns(&self) -> Vec<DataColumn> {
309        vec![
310            DataColumn {
311                name: "line".to_string(),
312                data_type: DataType::String,
313                nullable: false,
314                unique_values: Some(0),
315                null_count: 0,
316                metadata: HashMap::new(),
317                qualified_name: None,
318                source_table: None,
319            },
320            DataColumn {
321                name: "line_number".to_string(),
322                data_type: DataType::Integer,
323                nullable: false,
324                unique_values: Some(0),
325                null_count: 0,
326                metadata: HashMap::new(),
327                qualified_name: None,
328                source_table: None,
329            },
330        ]
331    }
332
333    fn generate(&self, args: Vec<DataValue>) -> Result<Arc<DataTable>> {
334        if args.is_empty() {
335            return Err(anyhow!("LINES requires 1 argument (text)"));
336        }
337
338        // Get text
339        let text = match &args[0] {
340            DataValue::String(s) => s.clone(),
341            DataValue::Null => return Err(anyhow!("LINES text cannot be NULL")),
342            other => other.to_string(),
343        };
344
345        let mut table = DataTable::new("lines");
346        table.add_column(DataColumn::new("line"));
347        table.add_column(DataColumn::new("line_number"));
348
349        // Split into lines
350        for (idx, line) in text.lines().enumerate() {
351            table
352                .add_row(DataRow::new(vec![
353                    DataValue::String(line.to_string()),
354                    DataValue::Integer((idx + 1) as i64),
355                ]))
356                .map_err(|e| anyhow!(e))?;
357        }
358
359        Ok(Arc::new(table))
360    }
361
362    fn description(&self) -> &str {
363        "Split text into lines"
364    }
365
366    fn arg_count(&self) -> usize {
367        1
368    }
369}