sql_cli/sql/functions/
text_processing.rs

1use crate::data::datatable::DataValue;
2use crate::sql::functions::{ArgCount, FunctionCategory, FunctionSignature, SqlFunction};
3use anyhow::Result;
4
5/// Strip all punctuation from text
6pub struct StripPunctuation;
7
8impl SqlFunction for StripPunctuation {
9    fn signature(&self) -> FunctionSignature {
10        FunctionSignature {
11            name: "STRIP_PUNCTUATION",
12            category: FunctionCategory::String,
13            arg_count: ArgCount::Range(1, 2),
14            description: "Remove punctuation from text, optionally replacing with a character",
15            returns: "String with punctuation removed",
16            examples: vec![
17                "SELECT STRIP_PUNCTUATION('Hello, world!')           -- Returns 'Hello world'",
18                "SELECT STRIP_PUNCTUATION('foo.bar@baz', ' ')        -- Returns 'foo bar baz'",
19                "SELECT STRIP_PUNCTUATION('data-file_v2.0', '')      -- Returns 'datafilev20'",
20            ],
21        }
22    }
23
24    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
25        if args.is_empty() || args.len() > 2 {
26            return Ok(DataValue::Null);
27        }
28
29        let text = match &args[0] {
30            DataValue::String(s) => s.clone(),
31            DataValue::InternedString(s) => s.to_string(),
32            DataValue::Null => return Ok(DataValue::Null),
33            _ => return Ok(DataValue::Null),
34        };
35
36        let replacement = if args.len() == 2 {
37            match &args[1] {
38                DataValue::String(s) => s.clone(),
39                DataValue::InternedString(s) => s.to_string(),
40                _ => " ".to_string(),
41            }
42        } else {
43            " ".to_string()
44        };
45
46        // Replace all punctuation with the replacement character
47        let result = text
48            .chars()
49            .map(|c| {
50                if c.is_ascii_punctuation() || !c.is_alphanumeric() && !c.is_whitespace() {
51                    replacement.chars().next().unwrap_or(' ')
52                } else {
53                    c
54                }
55            })
56            .collect::<String>();
57
58        // Clean up multiple spaces if replacement was space
59        let result = if replacement == " " {
60            result.split_whitespace().collect::<Vec<_>>().join(" ")
61        } else {
62            result
63        };
64
65        Ok(DataValue::String(result))
66    }
67}
68
69/// Tokenize text into words
70pub struct Tokenize;
71
72impl SqlFunction for Tokenize {
73    fn signature(&self) -> FunctionSignature {
74        FunctionSignature {
75            name: "TOKENIZE",
76            category: FunctionCategory::String,
77            arg_count: ArgCount::Range(1, 2),
78            description: "Split text into words, removing punctuation and optionally converting case",
79            returns: "Space-separated words",
80            examples: vec![
81                "SELECT TOKENIZE('Hello, world!')                    -- Returns 'Hello world'",
82                "SELECT TOKENIZE('The quick brown fox.', 'lower')    -- Returns 'the quick brown fox'",
83                "SELECT TOKENIZE('foo-bar_baz', 'upper')             -- Returns 'FOO BAR BAZ'",
84            ],
85        }
86    }
87
88    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
89        if args.is_empty() || args.len() > 2 {
90            return Ok(DataValue::Null);
91        }
92
93        let text = match &args[0] {
94            DataValue::String(s) => s.clone(),
95            DataValue::InternedString(s) => s.to_string(),
96            DataValue::Null => return Ok(DataValue::Null),
97            _ => return Ok(DataValue::Null),
98        };
99
100        let case_option = if args.len() == 2 {
101            match &args[1] {
102                DataValue::String(s) => s.to_lowercase(),
103                DataValue::InternedString(s) => s.to_lowercase(),
104                _ => String::new(),
105            }
106        } else {
107            String::new()
108        };
109
110        // Split on any non-alphanumeric character
111        let words: Vec<String> = text
112            .split(|c: char| !c.is_alphanumeric())
113            .filter(|s| !s.is_empty())
114            .map(|s| match case_option.as_str() {
115                "lower" | "lowercase" => s.to_lowercase(),
116                "upper" | "uppercase" => s.to_uppercase(),
117                _ => s.to_string(),
118            })
119            .collect();
120
121        Ok(DataValue::String(words.join(" ")))
122    }
123}
124
125/// Clean text by removing extra whitespace and normalizing
126pub struct CleanText;
127
128impl SqlFunction for CleanText {
129    fn signature(&self) -> FunctionSignature {
130        FunctionSignature {
131            name: "CLEAN_TEXT",
132            category: FunctionCategory::String,
133            arg_count: ArgCount::Fixed(1),
134            description: "Clean text by normalizing whitespace and removing control characters",
135            returns: "Cleaned string",
136            examples: vec![
137                "SELECT CLEAN_TEXT('  hello   world  ')              -- Returns 'hello world'",
138                "SELECT CLEAN_TEXT('line1\\nline2\\tline3')            -- Returns 'line1 line2 line3'",
139            ],
140        }
141    }
142
143    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
144        if args.len() != 1 {
145            return Ok(DataValue::Null);
146        }
147
148        let text = match &args[0] {
149            DataValue::String(s) => s.clone(),
150            DataValue::InternedString(s) => s.to_string(),
151            DataValue::Null => return Ok(DataValue::Null),
152            _ => return Ok(DataValue::Null),
153        };
154
155        // Replace all whitespace (including newlines, tabs) with single spaces
156        // Remove control characters
157        let cleaned: String = text
158            .chars()
159            .filter(|c| !c.is_control() || c.is_whitespace())
160            .map(|c| if c.is_whitespace() { ' ' } else { c })
161            .collect();
162
163        // Collapse multiple spaces and trim
164        let result = cleaned.split_whitespace().collect::<Vec<_>>().join(" ");
165
166        Ok(DataValue::String(result))
167    }
168}
169
170/// Extract words from text (returns as a single string, use with SPLIT for table)
171pub struct ExtractWords;
172
173impl SqlFunction for ExtractWords {
174    fn signature(&self) -> FunctionSignature {
175        FunctionSignature {
176            name: "EXTRACT_WORDS",
177            category: FunctionCategory::String,
178            arg_count: ArgCount::Range(1, 3),
179            description: "Extract words from text, with optional min length and case conversion",
180            returns: "Comma-separated list of words",
181            examples: vec![
182                "SELECT EXTRACT_WORDS('Hello, world!')               -- Returns 'Hello,world'",
183                "SELECT EXTRACT_WORDS('The quick fox', 4)            -- Returns 'quick'",
184                "SELECT EXTRACT_WORDS('The Fox', 2, 'lower')         -- Returns 'the,fox'",
185            ],
186        }
187    }
188
189    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
190        if args.is_empty() || args.len() > 3 {
191            return Ok(DataValue::Null);
192        }
193
194        let text = match &args[0] {
195            DataValue::String(s) => s.clone(),
196            DataValue::InternedString(s) => s.to_string(),
197            DataValue::Null => return Ok(DataValue::Null),
198            _ => return Ok(DataValue::Null),
199        };
200
201        let min_length = if args.len() >= 2 {
202            match &args[1] {
203                DataValue::Integer(n) => *n as usize,
204                DataValue::Float(n) => *n as usize,
205                _ => 1,
206            }
207        } else {
208            1
209        };
210
211        let case_option = if args.len() == 3 {
212            match &args[2] {
213                DataValue::String(s) => s.to_lowercase(),
214                DataValue::InternedString(s) => s.to_lowercase(),
215                _ => String::new(),
216            }
217        } else {
218            String::new()
219        };
220
221        // Extract words
222        let words: Vec<String> = text
223            .split(|c: char| !c.is_alphanumeric())
224            .filter(|s| !s.is_empty() && s.len() >= min_length)
225            .map(|s| match case_option.as_str() {
226                "lower" | "lowercase" => s.to_lowercase(),
227                "upper" | "uppercase" => s.to_uppercase(),
228                _ => s.to_string(),
229            })
230            .collect();
231
232        Ok(DataValue::String(words.join(",")))
233    }
234}
235
236/// Count words in text
237pub struct WordCount;
238
239impl SqlFunction for WordCount {
240    fn signature(&self) -> FunctionSignature {
241        FunctionSignature {
242            name: "WORD_COUNT",
243            category: FunctionCategory::String,
244            arg_count: ArgCount::Fixed(1),
245            description: "Count the number of words in text",
246            returns: "Integer count of words",
247            examples: vec![
248                "SELECT WORD_COUNT('Hello world')                    -- Returns 2",
249                "SELECT WORD_COUNT('The quick brown fox')            -- Returns 4",
250                "SELECT WORD_COUNT('one,two;three')                  -- Returns 3",
251            ],
252        }
253    }
254
255    fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
256        if args.len() != 1 {
257            return Ok(DataValue::Null);
258        }
259
260        let text = match &args[0] {
261            DataValue::String(s) => s.clone(),
262            DataValue::InternedString(s) => s.to_string(),
263            DataValue::Null => return Ok(DataValue::Null),
264            _ => return Ok(DataValue::Null),
265        };
266
267        let count = text
268            .split(|c: char| !c.is_alphanumeric())
269            .filter(|s| !s.is_empty())
270            .count();
271
272        Ok(DataValue::Integer(count as i64))
273    }
274}
275
276#[cfg(test)]
277mod tests {
278    use super::*;
279
280    #[test]
281    fn test_strip_punctuation() {
282        let func = StripPunctuation;
283
284        assert_eq!(
285            func.evaluate(&[DataValue::String("Hello, world!".to_string())])
286                .unwrap(),
287            DataValue::String("Hello world".to_string())
288        );
289
290        assert_eq!(
291            func.evaluate(&[
292                DataValue::String("foo.bar@baz".to_string()),
293                DataValue::String("_".to_string())
294            ])
295            .unwrap(),
296            DataValue::String("foo_bar_baz".to_string())
297        );
298    }
299
300    #[test]
301    fn test_tokenize() {
302        let func = Tokenize;
303
304        assert_eq!(
305            func.evaluate(&[
306                DataValue::String("The quick brown fox.".to_string()),
307                DataValue::String("lower".to_string())
308            ])
309            .unwrap(),
310            DataValue::String("the quick brown fox".to_string())
311        );
312    }
313
314    #[test]
315    fn test_word_count() {
316        let func = WordCount;
317
318        assert_eq!(
319            func.evaluate(&[DataValue::String("Hello world".to_string())])
320                .unwrap(),
321            DataValue::Integer(2)
322        );
323
324        assert_eq!(
325            func.evaluate(&[DataValue::String("one,two;three".to_string())])
326                .unwrap(),
327            DataValue::Integer(3)
328        );
329    }
330
331    #[test]
332    fn test_extract_words() {
333        let func = ExtractWords;
334
335        assert_eq!(
336            func.evaluate(&[
337                DataValue::String("The quick brown fox".to_string()),
338                DataValue::Integer(4)
339            ])
340            .unwrap(),
341            DataValue::String("quick,brown".to_string())
342        );
343    }
344}