dsq_functions/builtin/
split.rs

1use dsq_shared::value::Value;
2use dsq_shared::Result;
3use inventory;
4use polars::prelude::*;
5use serde_json;
6use std::borrow::Cow;
7
8pub fn builtin_split(args: &[Value]) -> Result<Value> {
9    if args.is_empty() || args.len() > 2 {
10        return Err(dsq_shared::error::operation_error(
11            "split() expects 1 or 2 arguments",
12        ));
13    }
14
15    let separator = if args.len() == 2 {
16        match &args[1] {
17            Value::String(s) => s.clone(),
18            _ => {
19                return Err(dsq_shared::error::operation_error(
20                    "split() separator must be a string",
21                ));
22            }
23        }
24    } else {
25        " ".to_string() // default separator is whitespace
26    };
27
28    match &args[0] {
29        Value::String(s) => {
30            let parts: Vec<Value> = if separator.is_empty() {
31                s.chars().map(|c| Value::String(c.to_string())).collect()
32            } else {
33                s.split(&separator)
34                    .map(|part| Value::String(part.to_string()))
35                    .collect()
36            };
37            Ok(Value::Array(parts))
38        }
39        Value::Array(arr) => {
40            let split_arrays: Result<Vec<Value>> = arr
41                .iter()
42                .map(|v| match v {
43                    Value::String(s) => {
44                        let parts: Vec<Value> = if separator.is_empty() {
45                            s.chars().map(|c| Value::String(c.to_string())).collect()
46                        } else {
47                            s.split(&separator)
48                                .map(|part| Value::String(part.to_string()))
49                                .collect()
50                        };
51                        Ok(Value::Array(parts))
52                    }
53                    _ => Err(dsq_shared::error::operation_error(
54                        "split() requires string elements in array",
55                    )),
56                })
57                .collect();
58            Ok(Value::Array(split_arrays?))
59        }
60        Value::DataFrame(df) => {
61            let mut new_series = Vec::new();
62            for col_name in df.get_column_names() {
63                if let Ok(series) = df.column(col_name) {
64                    if series.dtype() == &DataType::String {
65                        let split_series = series
66                            .str()
67                            .unwrap()
68                            .apply(|s| {
69                                s.map(|s| {
70                                    let parts: Vec<String> = if separator.is_empty() {
71                                        s.chars().map(|c| c.to_string()).collect()
72                                    } else {
73                                        s.split(&separator).map(|part| part.to_string()).collect()
74                                    };
75                                    Cow::Owned(
76                                        serde_json::to_string(&Value::Array(
77                                            parts.into_iter().map(Value::String).collect(),
78                                        ))
79                                        .unwrap_or("null".to_string()),
80                                    )
81                                })
82                            })
83                            .into_series();
84                        let mut s = split_series;
85                        s.rename(col_name.clone());
86                        new_series.push(s.into());
87                    } else {
88                        let mut s = series.clone();
89                        s.rename(col_name.clone());
90                        new_series.push(s);
91                    }
92                }
93            }
94            match DataFrame::new(new_series) {
95                Ok(new_df) => Ok(Value::DataFrame(new_df)),
96                Err(e) => Err(dsq_shared::error::operation_error(format!(
97                    "split() failed on DataFrame: {}",
98                    e
99                ))),
100            }
101        }
102        Value::Series(series) => {
103            if series.dtype() == &DataType::String {
104                let split_series = series
105                    .str()
106                    .unwrap()
107                    .apply(|s| {
108                        s.map(|s| {
109                            let parts: Vec<String> = if separator.is_empty() {
110                                s.chars().map(|c| c.to_string()).collect()
111                            } else {
112                                s.split(&separator).map(|part| part.to_string()).collect()
113                            };
114                            Cow::Owned(
115                                serde_json::to_string(&Value::Array(
116                                    parts.into_iter().map(Value::String).collect(),
117                                ))
118                                .unwrap_or("null".to_string()),
119                            )
120                        })
121                    })
122                    .into_series();
123                Ok(Value::Series(split_series))
124            } else {
125                Ok(Value::Series(series.clone()))
126            }
127        }
128        _ => Err(dsq_shared::error::operation_error(
129            "split() requires string, array, DataFrame, or Series",
130        )),
131    }
132}
133
134inventory::submit! {
135    crate::FunctionRegistration {
136        name: "split",
137        func: builtin_split,
138    }
139}
140
#[cfg(test)]
mod tests {
    use super::*;
    use dsq_shared::value::Value;

    /// Builds a `Value::String` from a string slice.
    fn text(s: &str) -> Value {
        Value::String(s.to_string())
    }

    /// Builds the list of `Value::String`s expected from a split.
    fn texts(items: &[&str]) -> Vec<Value> {
        items.iter().copied().map(text).collect()
    }

    /// Unwraps a `Value::Array`, panicking on any other variant.
    fn expect_array(value: Value) -> Vec<Value> {
        match value {
            Value::Array(items) => items,
            _ => panic!("Expected Array"),
        }
    }

    #[test]
    fn test_split_string_default_separator() {
        let result = builtin_split(&[text("hello world")]).unwrap();
        assert_eq!(expect_array(result), texts(&["hello", "world"]));
    }

    #[test]
    fn test_split_string_custom_separator() {
        let result = builtin_split(&[text("a,b,c"), text(",")]).unwrap();
        assert_eq!(expect_array(result), texts(&["a", "b", "c"]));
    }

    #[test]
    fn test_split_string_multi_char_separator() {
        let result = builtin_split(&[text("a::b::c"), text("::")]).unwrap();
        assert_eq!(expect_array(result), texts(&["a", "b", "c"]));
    }

    #[test]
    fn test_split_string_empty_separator() {
        let result = builtin_split(&[text("abc"), text("")]).unwrap();
        assert_eq!(expect_array(result), texts(&["a", "b", "c"]));
    }

    #[test]
    fn test_split_string_empty_separator_is_char_aware() {
        // Splitting happens per `char`, not per byte, so multi-byte characters
        // stay intact.
        let result = builtin_split(&[text("añb"), text("")]).unwrap();
        assert_eq!(expect_array(result), texts(&["a", "ñ", "b"]));
    }

    #[test]
    fn test_split_consecutive_separators_keep_empty_parts() {
        // The default separator is a single space, so two spaces in a row
        // yield an empty piece between them.
        let result = builtin_split(&[text("a  b")]).unwrap();
        assert_eq!(expect_array(result), texts(&["a", "", "b"]));
    }

    #[test]
    fn test_split_array() {
        let arr = Value::Array(texts(&["a b", "c d"]));
        let result = builtin_split(&[arr]).unwrap();
        let nested: Vec<Vec<Value>> = expect_array(result)
            .into_iter()
            .map(expect_array)
            .collect();
        assert_eq!(nested, vec![texts(&["a", "b"]), texts(&["c", "d"])]);
    }

    #[test]
    fn test_split_empty_array() {
        let result = builtin_split(&[Value::Array(Vec::new())]).unwrap();
        assert!(expect_array(result).is_empty());
    }

    #[test]
    fn test_split_wrong_args() {
        assert!(builtin_split(&[]).is_err());
        assert!(builtin_split(&[Value::Int(1), text(","), Value::Int(2)]).is_err());
    }

    #[test]
    fn test_split_non_string_separator() {
        assert!(builtin_split(&[text("a,b"), Value::Int(1)]).is_err());
    }

    #[test]
    fn test_split_unsupported_input() {
        assert!(builtin_split(&[Value::Int(1)]).is_err());
    }

    #[test]
    fn test_split_non_string_in_array() {
        let arr = Value::Array(vec![Value::Int(1)]);
        assert!(builtin_split(&[arr]).is_err());
    }

    #[test]
    fn test_split_registered_via_inventory() {
        let registration = inventory::iter::<crate::FunctionRegistration>
            .into_iter()
            .find(|entry| entry.name == "split")
            .expect("split function not found in inventory");

        // The registered function pointer must behave like `builtin_split`.
        let result = (registration.func)(&[text("test split")]).unwrap();
        assert_eq!(expect_array(result), texts(&["test", "split"]));
    }
}