1use crate::data::datatable::DataValue;
2use crate::sql::functions::{ArgCount, FunctionCategory, FunctionSignature, SqlFunction};
3use anyhow::Result;
4
5pub struct StripPunctuation;
7
8impl SqlFunction for StripPunctuation {
9 fn signature(&self) -> FunctionSignature {
10 FunctionSignature {
11 name: "STRIP_PUNCTUATION",
12 category: FunctionCategory::String,
13 arg_count: ArgCount::Range(1, 2),
14 description: "Remove punctuation from text, optionally replacing with a character",
15 returns: "String with punctuation removed",
16 examples: vec![
17 "SELECT STRIP_PUNCTUATION('Hello, world!') -- Returns 'Hello world'",
18 "SELECT STRIP_PUNCTUATION('foo.bar@baz', ' ') -- Returns 'foo bar baz'",
19 "SELECT STRIP_PUNCTUATION('data-file_v2.0', '') -- Returns 'datafilev20'",
20 ],
21 }
22 }
23
24 fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
25 if args.is_empty() || args.len() > 2 {
26 return Ok(DataValue::Null);
27 }
28
29 let text = match &args[0] {
30 DataValue::String(s) => s.clone(),
31 DataValue::InternedString(s) => s.to_string(),
32 DataValue::Null => return Ok(DataValue::Null),
33 _ => return Ok(DataValue::Null),
34 };
35
36 let replacement = if args.len() == 2 {
37 match &args[1] {
38 DataValue::String(s) => s.clone(),
39 DataValue::InternedString(s) => s.to_string(),
40 _ => " ".to_string(),
41 }
42 } else {
43 " ".to_string()
44 };
45
46 let result = text
48 .chars()
49 .map(|c| {
50 if c.is_ascii_punctuation() || !c.is_alphanumeric() && !c.is_whitespace() {
51 replacement.chars().next().unwrap_or(' ')
52 } else {
53 c
54 }
55 })
56 .collect::<String>();
57
58 let result = if replacement == " " {
60 result.split_whitespace().collect::<Vec<_>>().join(" ")
61 } else {
62 result
63 };
64
65 Ok(DataValue::String(result))
66 }
67}
68
69pub struct Tokenize;
71
72impl SqlFunction for Tokenize {
73 fn signature(&self) -> FunctionSignature {
74 FunctionSignature {
75 name: "TOKENIZE",
76 category: FunctionCategory::String,
77 arg_count: ArgCount::Range(1, 2),
78 description: "Split text into words, removing punctuation and optionally converting case",
79 returns: "Space-separated words",
80 examples: vec![
81 "SELECT TOKENIZE('Hello, world!') -- Returns 'Hello world'",
82 "SELECT TOKENIZE('The quick brown fox.', 'lower') -- Returns 'the quick brown fox'",
83 "SELECT TOKENIZE('foo-bar_baz', 'upper') -- Returns 'FOO BAR BAZ'",
84 ],
85 }
86 }
87
88 fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
89 if args.is_empty() || args.len() > 2 {
90 return Ok(DataValue::Null);
91 }
92
93 let text = match &args[0] {
94 DataValue::String(s) => s.clone(),
95 DataValue::InternedString(s) => s.to_string(),
96 DataValue::Null => return Ok(DataValue::Null),
97 _ => return Ok(DataValue::Null),
98 };
99
100 let case_option = if args.len() == 2 {
101 match &args[1] {
102 DataValue::String(s) => s.to_lowercase(),
103 DataValue::InternedString(s) => s.to_lowercase(),
104 _ => String::new(),
105 }
106 } else {
107 String::new()
108 };
109
110 let words: Vec<String> = text
112 .split(|c: char| !c.is_alphanumeric())
113 .filter(|s| !s.is_empty())
114 .map(|s| match case_option.as_str() {
115 "lower" | "lowercase" => s.to_lowercase(),
116 "upper" | "uppercase" => s.to_uppercase(),
117 _ => s.to_string(),
118 })
119 .collect();
120
121 Ok(DataValue::String(words.join(" ")))
122 }
123}
124
125pub struct CleanText;
127
128impl SqlFunction for CleanText {
129 fn signature(&self) -> FunctionSignature {
130 FunctionSignature {
131 name: "CLEAN_TEXT",
132 category: FunctionCategory::String,
133 arg_count: ArgCount::Fixed(1),
134 description: "Clean text by normalizing whitespace and removing control characters",
135 returns: "Cleaned string",
136 examples: vec![
137 "SELECT CLEAN_TEXT(' hello world ') -- Returns 'hello world'",
138 "SELECT CLEAN_TEXT('line1\\nline2\\tline3') -- Returns 'line1 line2 line3'",
139 ],
140 }
141 }
142
143 fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
144 if args.len() != 1 {
145 return Ok(DataValue::Null);
146 }
147
148 let text = match &args[0] {
149 DataValue::String(s) => s.clone(),
150 DataValue::InternedString(s) => s.to_string(),
151 DataValue::Null => return Ok(DataValue::Null),
152 _ => return Ok(DataValue::Null),
153 };
154
155 let cleaned: String = text
158 .chars()
159 .filter(|c| !c.is_control() || c.is_whitespace())
160 .map(|c| if c.is_whitespace() { ' ' } else { c })
161 .collect();
162
163 let result = cleaned.split_whitespace().collect::<Vec<_>>().join(" ");
165
166 Ok(DataValue::String(result))
167 }
168}
169
170pub struct ExtractWords;
172
173impl SqlFunction for ExtractWords {
174 fn signature(&self) -> FunctionSignature {
175 FunctionSignature {
176 name: "EXTRACT_WORDS",
177 category: FunctionCategory::String,
178 arg_count: ArgCount::Range(1, 3),
179 description: "Extract words from text, with optional min length and case conversion",
180 returns: "Comma-separated list of words",
181 examples: vec![
182 "SELECT EXTRACT_WORDS('Hello, world!') -- Returns 'Hello,world'",
183 "SELECT EXTRACT_WORDS('The quick fox', 4) -- Returns 'quick'",
184 "SELECT EXTRACT_WORDS('The Fox', 2, 'lower') -- Returns 'the,fox'",
185 ],
186 }
187 }
188
189 fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
190 if args.is_empty() || args.len() > 3 {
191 return Ok(DataValue::Null);
192 }
193
194 let text = match &args[0] {
195 DataValue::String(s) => s.clone(),
196 DataValue::InternedString(s) => s.to_string(),
197 DataValue::Null => return Ok(DataValue::Null),
198 _ => return Ok(DataValue::Null),
199 };
200
201 let min_length = if args.len() >= 2 {
202 match &args[1] {
203 DataValue::Integer(n) => *n as usize,
204 DataValue::Float(n) => *n as usize,
205 _ => 1,
206 }
207 } else {
208 1
209 };
210
211 let case_option = if args.len() == 3 {
212 match &args[2] {
213 DataValue::String(s) => s.to_lowercase(),
214 DataValue::InternedString(s) => s.to_lowercase(),
215 _ => String::new(),
216 }
217 } else {
218 String::new()
219 };
220
221 let words: Vec<String> = text
223 .split(|c: char| !c.is_alphanumeric())
224 .filter(|s| !s.is_empty() && s.len() >= min_length)
225 .map(|s| match case_option.as_str() {
226 "lower" | "lowercase" => s.to_lowercase(),
227 "upper" | "uppercase" => s.to_uppercase(),
228 _ => s.to_string(),
229 })
230 .collect();
231
232 Ok(DataValue::String(words.join(",")))
233 }
234}
235
236pub struct WordCount;
238
239impl SqlFunction for WordCount {
240 fn signature(&self) -> FunctionSignature {
241 FunctionSignature {
242 name: "WORD_COUNT",
243 category: FunctionCategory::String,
244 arg_count: ArgCount::Fixed(1),
245 description: "Count the number of words in text",
246 returns: "Integer count of words",
247 examples: vec![
248 "SELECT WORD_COUNT('Hello world') -- Returns 2",
249 "SELECT WORD_COUNT('The quick brown fox') -- Returns 4",
250 "SELECT WORD_COUNT('one,two;three') -- Returns 3",
251 ],
252 }
253 }
254
255 fn evaluate(&self, args: &[DataValue]) -> Result<DataValue> {
256 if args.len() != 1 {
257 return Ok(DataValue::Null);
258 }
259
260 let text = match &args[0] {
261 DataValue::String(s) => s.clone(),
262 DataValue::InternedString(s) => s.to_string(),
263 DataValue::Null => return Ok(DataValue::Null),
264 _ => return Ok(DataValue::Null),
265 };
266
267 let count = text
268 .split(|c: char| !c.is_alphanumeric())
269 .filter(|s| !s.is_empty())
270 .count();
271
272 Ok(DataValue::Integer(count as i64))
273 }
274}
275
276#[cfg(test)]
277mod tests {
278 use super::*;
279
280 #[test]
281 fn test_strip_punctuation() {
282 let func = StripPunctuation;
283
284 assert_eq!(
285 func.evaluate(&[DataValue::String("Hello, world!".to_string())])
286 .unwrap(),
287 DataValue::String("Hello world".to_string())
288 );
289
290 assert_eq!(
291 func.evaluate(&[
292 DataValue::String("foo.bar@baz".to_string()),
293 DataValue::String("_".to_string())
294 ])
295 .unwrap(),
296 DataValue::String("foo_bar_baz".to_string())
297 );
298 }
299
300 #[test]
301 fn test_tokenize() {
302 let func = Tokenize;
303
304 assert_eq!(
305 func.evaluate(&[
306 DataValue::String("The quick brown fox.".to_string()),
307 DataValue::String("lower".to_string())
308 ])
309 .unwrap(),
310 DataValue::String("the quick brown fox".to_string())
311 );
312 }
313
314 #[test]
315 fn test_word_count() {
316 let func = WordCount;
317
318 assert_eq!(
319 func.evaluate(&[DataValue::String("Hello world".to_string())])
320 .unwrap(),
321 DataValue::Integer(2)
322 );
323
324 assert_eq!(
325 func.evaluate(&[DataValue::String("one,two;three".to_string())])
326 .unwrap(),
327 DataValue::Integer(3)
328 );
329 }
330
331 #[test]
332 fn test_extract_words() {
333 let func = ExtractWords;
334
335 assert_eq!(
336 func.evaluate(&[
337 DataValue::String("The quick brown fox".to_string()),
338 DataValue::Integer(4)
339 ])
340 .unwrap(),
341 DataValue::String("quick,brown".to_string())
342 );
343 }
344}