Skip to main content

runmat_runtime/builtins/strings/transform/
split.rs

1//! MATLAB-compatible `split` and `strsplit` builtins with GPU-aware semantics for RunMat.
2
3use std::collections::HashSet;
4
5use regex::RegexBuilder;
6use runmat_builtins::{
7    BuiltinCompletionPolicy, BuiltinDescriptor, BuiltinErrorDescriptor, BuiltinOutputMode,
8    BuiltinParamArity, BuiltinParamDescriptor, BuiltinParamType, BuiltinSignatureDescriptor,
9    CellArray, CharArray, StringArray, Value,
10};
11use runmat_macros::runtime_builtin;
12
13use crate::builtins::common::map_control_flow_with_builtin;
14use crate::builtins::common::spec::{
15    BroadcastSemantics, BuiltinFusionSpec, BuiltinGpuSpec, ConstantStrategy, GpuOpKind,
16    ReductionNaN, ResidencyPolicy, ShapeRequirements,
17};
18use crate::builtins::strings::common::{char_row_to_string_slice, is_missing_string};
19use crate::builtins::strings::type_resolvers::{string_array_type, unknown_type};
20use crate::{build_runtime_error, gather_if_needed_async, make_cell, BuiltinResult, RuntimeError};
21
22#[runmat_macros::register_gpu_spec(builtin_path = "crate::builtins::strings::transform::split")]
23pub const GPU_SPEC: BuiltinGpuSpec = BuiltinGpuSpec {
24    name: "split",
25    op_kind: GpuOpKind::Custom("string-transform"),
26    supported_precisions: &[],
27    broadcast: BroadcastSemantics::None,
28    provider_hooks: &[],
29    constant_strategy: ConstantStrategy::InlineLiteral,
30    residency: ResidencyPolicy::GatherImmediately,
31    nan_mode: ReductionNaN::Include,
32    two_pass_threshold: None,
33    workgroup_size: None,
34    accepts_nan_mode: false,
35    notes: "Executes on the CPU; GPU-resident inputs are gathered to host memory before splitting.",
36};
37
38#[runmat_macros::register_fusion_spec(builtin_path = "crate::builtins::strings::transform::split")]
39pub const FUSION_SPEC: BuiltinFusionSpec = BuiltinFusionSpec {
40    name: "split",
41    shape: ShapeRequirements::Any,
42    constant_strategy: ConstantStrategy::InlineLiteral,
43    elementwise: None,
44    reduction: None,
45    emits_nan: false,
46    notes: "String transformation builtin; not eligible for fusion planning and always gathers GPU inputs.",
47};
48
/// Builtin name attached to errors raised by the `split` entry point.
const BUILTIN_NAME: &str = "split";
/// Builtin name attached to errors raised by the `strsplit` entry point.
const STRSPLIT_BUILTIN_NAME: &str = "strsplit";
51
52const SPLIT_OUTPUT: [BuiltinParamDescriptor; 1] = [BuiltinParamDescriptor {
53    name: "newStr",
54    ty: BuiltinParamType::Any,
55    arity: BuiltinParamArity::Required,
56    default: None,
57    description: "String array containing split tokens.",
58}];
59
60const SPLIT_INPUTS_BASE: [BuiltinParamDescriptor; 1] = [BuiltinParamDescriptor {
61    name: "str",
62    ty: BuiltinParamType::Any,
63    arity: BuiltinParamArity::Required,
64    default: None,
65    description: "Input text scalar/array/cell to split.",
66}];
67
68const SPLIT_INPUTS_DELIMITER: [BuiltinParamDescriptor; 2] = [
69    BuiltinParamDescriptor {
70        name: "str",
71        ty: BuiltinParamType::Any,
72        arity: BuiltinParamArity::Required,
73        default: None,
74        description: "Input text scalar/array/cell to split.",
75    },
76    BuiltinParamDescriptor {
77        name: "delimiter",
78        ty: BuiltinParamType::Any,
79        arity: BuiltinParamArity::Required,
80        default: None,
81        description: "Delimiter scalar/array/cell.",
82    },
83];
84
85const SPLIT_INPUTS_DELIMITER_NAMEVALUE: [BuiltinParamDescriptor; 4] = [
86    BuiltinParamDescriptor {
87        name: "str",
88        ty: BuiltinParamType::Any,
89        arity: BuiltinParamArity::Required,
90        default: None,
91        description: "Input text scalar/array/cell to split.",
92    },
93    BuiltinParamDescriptor {
94        name: "delimiter",
95        ty: BuiltinParamType::Any,
96        arity: BuiltinParamArity::Required,
97        default: None,
98        description: "Delimiter scalar/array/cell.",
99    },
100    BuiltinParamDescriptor {
101        name: "Name",
102        ty: BuiltinParamType::StringScalar,
103        arity: BuiltinParamArity::Required,
104        default: None,
105        description: "Option name (`CollapseDelimiters` or `IncludeDelimiters`).",
106    },
107    BuiltinParamDescriptor {
108        name: "Value",
109        ty: BuiltinParamType::Any,
110        arity: BuiltinParamArity::Variadic,
111        default: None,
112        description: "Option values and additional Name/Value pairs.",
113    },
114];
115
116const SPLIT_INPUTS_NAMEVALUE: [BuiltinParamDescriptor; 3] = [
117    BuiltinParamDescriptor {
118        name: "str",
119        ty: BuiltinParamType::Any,
120        arity: BuiltinParamArity::Required,
121        default: None,
122        description: "Input text scalar/array/cell to split.",
123    },
124    BuiltinParamDescriptor {
125        name: "Name",
126        ty: BuiltinParamType::StringScalar,
127        arity: BuiltinParamArity::Required,
128        default: None,
129        description: "Option name (`CollapseDelimiters` or `IncludeDelimiters`).",
130    },
131    BuiltinParamDescriptor {
132        name: "Value",
133        ty: BuiltinParamType::Any,
134        arity: BuiltinParamArity::Variadic,
135        default: None,
136        description: "Option values and additional Name/Value pairs.",
137    },
138];
139
140const SPLIT_SIGNATURES: [BuiltinSignatureDescriptor; 4] = [
141    BuiltinSignatureDescriptor {
142        label: "newStr = split(str)",
143        inputs: &SPLIT_INPUTS_BASE,
144        outputs: &SPLIT_OUTPUT,
145    },
146    BuiltinSignatureDescriptor {
147        label: "newStr = split(str, delimiter)",
148        inputs: &SPLIT_INPUTS_DELIMITER,
149        outputs: &SPLIT_OUTPUT,
150    },
151    BuiltinSignatureDescriptor {
152        label: "newStr = split(str, delimiter, Name, Value, ...)",
153        inputs: &SPLIT_INPUTS_DELIMITER_NAMEVALUE,
154        outputs: &SPLIT_OUTPUT,
155    },
156    BuiltinSignatureDescriptor {
157        label: "newStr = split(str, Name, Value, ...)",
158        inputs: &SPLIT_INPUTS_NAMEVALUE,
159        outputs: &SPLIT_OUTPUT,
160    },
161];
162
163const SPLIT_ERROR_INVALID_INPUT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
164    code: "RM.SPLIT.INVALID_INPUT",
165    identifier: Some("RunMat:split:InvalidInput"),
166    when: "First argument is not a string scalar/array, char array, or cell array of text scalars.",
167    message:
168        "split: first argument must be a string scalar, string array, character array, or cell array of character vectors",
169};
170
171const SPLIT_ERROR_DELIMITER_TYPE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
172    code: "RM.SPLIT.DELIMITER_TYPE",
173    identifier: Some("RunMat:split:DelimiterType"),
174    when: "Delimiter input is not a supported text scalar/array/cell.",
175    message:
176        "split: delimiter input must be a string scalar, string array, character array, or cell array of character vectors",
177};
178
179const SPLIT_ERROR_NAME_VALUE_PAIR: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
180    code: "RM.SPLIT.NAME_VALUE_PAIR",
181    identifier: Some("RunMat:split:NameValuePair"),
182    when: "Name-value options are not supplied in complete pairs.",
183    message: "split: name-value arguments must be supplied in pairs",
184};
185
186const SPLIT_ERROR_UNKNOWN_NAME: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
187    code: "RM.SPLIT.UNKNOWN_NAME",
188    identifier: Some("RunMat:split:UnknownName"),
189    when: "An option name is not recognized.",
190    message:
191        "split: unrecognized name-value argument; supported names are 'CollapseDelimiters' and 'IncludeDelimiters'",
192};
193
194const SPLIT_ERROR_EMPTY_DELIMITER: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
195    code: "RM.SPLIT.EMPTY_DELIMITER",
196    identifier: Some("RunMat:split:EmptyDelimiter"),
197    when: "Delimiter list is empty or contains empty delimiter entries.",
198    message: "split: delimiters must contain at least one character",
199};
200
201const SPLIT_ERROR_CELL_ELEMENT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
202    code: "RM.SPLIT.CELL_ELEMENT",
203    identifier: Some("RunMat:split:CellElement"),
204    when: "Cell arrays contain non-text elements or non-row char arrays.",
205    message: "split: cell array elements must be string scalars or character vectors",
206};
207
208const SPLIT_ERROR_OPTION_VALUE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
209    code: "RM.SPLIT.OPTION_VALUE",
210    identifier: Some("RunMat:split:OptionValue"),
211    when: "Option values are not logical true/false values.",
212    message: "split: option values must be logical true or false",
213};
214
215const SPLIT_ERROR_INTERNAL: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
216    code: "RM.SPLIT.INTERNAL",
217    identifier: Some("RunMat:split:InternalError"),
218    when: "Internal output container construction failed.",
219    message: "split: internal error",
220};
221
222const SPLIT_ERRORS: [BuiltinErrorDescriptor; 8] = [
223    SPLIT_ERROR_INVALID_INPUT,
224    SPLIT_ERROR_DELIMITER_TYPE,
225    SPLIT_ERROR_NAME_VALUE_PAIR,
226    SPLIT_ERROR_UNKNOWN_NAME,
227    SPLIT_ERROR_EMPTY_DELIMITER,
228    SPLIT_ERROR_CELL_ELEMENT,
229    SPLIT_ERROR_OPTION_VALUE,
230    SPLIT_ERROR_INTERNAL,
231];
232
233pub const SPLIT_DESCRIPTOR: BuiltinDescriptor = BuiltinDescriptor {
234    signatures: &SPLIT_SIGNATURES,
235    output_mode: BuiltinOutputMode::Fixed,
236    completion_policy: BuiltinCompletionPolicy::Public,
237    errors: &SPLIT_ERRORS,
238};
239
240const STRSPLIT_OUTPUT: [BuiltinParamDescriptor; 2] = [
241    BuiltinParamDescriptor {
242        name: "parts",
243        ty: BuiltinParamType::Any,
244        arity: BuiltinParamArity::Required,
245        default: None,
246        description: "Split tokens.",
247    },
248    BuiltinParamDescriptor {
249        name: "matches",
250        ty: BuiltinParamType::Any,
251        arity: BuiltinParamArity::Optional,
252        default: None,
253        description: "Matched delimiters when requested as second output.",
254    },
255];
256
257const STRSPLIT_INPUTS_BASE: [BuiltinParamDescriptor; 1] = [BuiltinParamDescriptor {
258    name: "str",
259    ty: BuiltinParamType::Any,
260    arity: BuiltinParamArity::Required,
261    default: None,
262    description: "String scalar or character vector input.",
263}];
264
265const STRSPLIT_INPUTS_DELIMITER: [BuiltinParamDescriptor; 2] = [
266    BuiltinParamDescriptor {
267        name: "str",
268        ty: BuiltinParamType::Any,
269        arity: BuiltinParamArity::Required,
270        default: None,
271        description: "String scalar or character vector input.",
272    },
273    BuiltinParamDescriptor {
274        name: "delimiter",
275        ty: BuiltinParamType::Any,
276        arity: BuiltinParamArity::Required,
277        default: None,
278        description: "Delimiter scalar/array/cell.",
279    },
280];
281
282const STRSPLIT_INPUTS_DELIMITER_NAMEVALUE: [BuiltinParamDescriptor; 4] = [
283    BuiltinParamDescriptor {
284        name: "str",
285        ty: BuiltinParamType::Any,
286        arity: BuiltinParamArity::Required,
287        default: None,
288        description: "String scalar or character vector input.",
289    },
290    BuiltinParamDescriptor {
291        name: "delimiter",
292        ty: BuiltinParamType::Any,
293        arity: BuiltinParamArity::Required,
294        default: None,
295        description: "Delimiter scalar/array/cell.",
296    },
297    BuiltinParamDescriptor {
298        name: "Name",
299        ty: BuiltinParamType::StringScalar,
300        arity: BuiltinParamArity::Required,
301        default: None,
302        description: "Option name (`CollapseDelimiters` or `DelimiterType`).",
303    },
304    BuiltinParamDescriptor {
305        name: "Value",
306        ty: BuiltinParamType::Any,
307        arity: BuiltinParamArity::Variadic,
308        default: None,
309        description: "Option values and additional Name/Value pairs.",
310    },
311];
312
313const STRSPLIT_INPUTS_NAMEVALUE: [BuiltinParamDescriptor; 3] = [
314    BuiltinParamDescriptor {
315        name: "str",
316        ty: BuiltinParamType::Any,
317        arity: BuiltinParamArity::Required,
318        default: None,
319        description: "String scalar or character vector input.",
320    },
321    BuiltinParamDescriptor {
322        name: "Name",
323        ty: BuiltinParamType::StringScalar,
324        arity: BuiltinParamArity::Required,
325        default: None,
326        description: "Option name (`CollapseDelimiters` or `DelimiterType`).",
327    },
328    BuiltinParamDescriptor {
329        name: "Value",
330        ty: BuiltinParamType::Any,
331        arity: BuiltinParamArity::Variadic,
332        default: None,
333        description: "Option values and additional Name/Value pairs.",
334    },
335];
336
337const STRSPLIT_SIGNATURES: [BuiltinSignatureDescriptor; 4] = [
338    BuiltinSignatureDescriptor {
339        label: "[parts, matches] = strsplit(str)",
340        inputs: &STRSPLIT_INPUTS_BASE,
341        outputs: &STRSPLIT_OUTPUT,
342    },
343    BuiltinSignatureDescriptor {
344        label: "[parts, matches] = strsplit(str, delimiter)",
345        inputs: &STRSPLIT_INPUTS_DELIMITER,
346        outputs: &STRSPLIT_OUTPUT,
347    },
348    BuiltinSignatureDescriptor {
349        label: "[parts, matches] = strsplit(str, delimiter, Name, Value, ...)",
350        inputs: &STRSPLIT_INPUTS_DELIMITER_NAMEVALUE,
351        outputs: &STRSPLIT_OUTPUT,
352    },
353    BuiltinSignatureDescriptor {
354        label: "[parts, matches] = strsplit(str, Name, Value, ...)",
355        inputs: &STRSPLIT_INPUTS_NAMEVALUE,
356        outputs: &STRSPLIT_OUTPUT,
357    },
358];
359
360const STRSPLIT_ERROR_INVALID_INPUT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
361    code: "RM.STRSPLIT.INVALID_INPUT",
362    identifier: Some("RunMat:strsplit:InvalidInput"),
363    when: "First argument is not a string scalar or character vector.",
364    message: "strsplit: first argument must be a string scalar or character vector",
365};
366
367const STRSPLIT_ERROR_DELIMITER_TYPE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
368    code: "RM.STRSPLIT.DELIMITER_TYPE",
369    identifier: Some("RunMat:strsplit:DelimiterType"),
370    when: "Delimiter input is not a supported text scalar/array/cell.",
371    message:
372        "strsplit: delimiter must be a character vector, string scalar, string array, or cell array of character vectors",
373};
374
375const STRSPLIT_ERROR_NAME_VALUE_PAIR: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
376    code: "RM.STRSPLIT.NAME_VALUE_PAIR",
377    identifier: Some("RunMat:strsplit:NameValuePair"),
378    when: "Name-value options are not supplied in complete pairs.",
379    message: "strsplit: name-value arguments must be supplied in pairs",
380};
381
382const STRSPLIT_ERROR_UNKNOWN_NAME: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
383    code: "RM.STRSPLIT.UNKNOWN_NAME",
384    identifier: Some("RunMat:strsplit:UnknownName"),
385    when: "An option name is not recognized.",
386    message:
387        "strsplit: unrecognized name-value argument; supported names are 'CollapseDelimiters' and 'DelimiterType'",
388};
389
390const STRSPLIT_ERROR_EMPTY_DELIMITER: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
391    code: "RM.STRSPLIT.EMPTY_DELIMITER",
392    identifier: Some("RunMat:strsplit:EmptyDelimiter"),
393    when: "Delimiter list is empty or contains empty delimiter entries.",
394    message: "strsplit: delimiters must contain at least one character",
395};
396
397const STRSPLIT_ERROR_DELIMITER_MODE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
398    code: "RM.STRSPLIT.DELIMITER_MODE",
399    identifier: Some("RunMat:strsplit:DelimiterMode"),
400    when: "DelimiterType option is not `Simple` or `RegularExpression`.",
401    message: "strsplit: value for 'DelimiterType' must be 'Simple' or 'RegularExpression'",
402};
403
404const STRSPLIT_ERROR_OPTION_VALUE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
405    code: "RM.STRSPLIT.OPTION_VALUE",
406    identifier: Some("RunMat:strsplit:OptionValue"),
407    when: "Option values are not logical true/false values.",
408    message: "strsplit: option values must be logical true or false",
409};
410
411const STRSPLIT_ERROR_REGEX_INVALID: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
412    code: "RM.STRSPLIT.REGEX_INVALID",
413    identifier: Some("RunMat:strsplit:RegexInvalid"),
414    when: "Regular expression delimiter pattern fails to compile.",
415    message: "strsplit: invalid delimiter regular expression",
416};
417
418const STRSPLIT_ERROR_INTERNAL: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
419    code: "RM.STRSPLIT.INTERNAL",
420    identifier: Some("RunMat:strsplit:InternalError"),
421    when: "Internal output container construction failed.",
422    message: "strsplit: internal error",
423};
424
425const STRSPLIT_ERRORS: [BuiltinErrorDescriptor; 9] = [
426    STRSPLIT_ERROR_INVALID_INPUT,
427    STRSPLIT_ERROR_DELIMITER_TYPE,
428    STRSPLIT_ERROR_NAME_VALUE_PAIR,
429    STRSPLIT_ERROR_UNKNOWN_NAME,
430    STRSPLIT_ERROR_EMPTY_DELIMITER,
431    STRSPLIT_ERROR_DELIMITER_MODE,
432    STRSPLIT_ERROR_OPTION_VALUE,
433    STRSPLIT_ERROR_REGEX_INVALID,
434    STRSPLIT_ERROR_INTERNAL,
435];
436
437pub const STRSPLIT_DESCRIPTOR: BuiltinDescriptor = BuiltinDescriptor {
438    signatures: &STRSPLIT_SIGNATURES,
439    output_mode: BuiltinOutputMode::ByRequestedOutputCount,
440    completion_policy: BuiltinCompletionPolicy::Public,
441    errors: &STRSPLIT_ERRORS,
442};
443
444fn map_flow(err: RuntimeError) -> RuntimeError {
445    map_control_flow_with_builtin(err, BUILTIN_NAME)
446}
447
448fn split_error_with_message(
449    message: impl Into<String>,
450    error: &'static BuiltinErrorDescriptor,
451) -> RuntimeError {
452    let mut builder = build_runtime_error(message).with_builtin(BUILTIN_NAME);
453    if let Some(identifier) = error.identifier {
454        builder = builder.with_identifier(identifier);
455    }
456    builder.build()
457}
458
459fn split_error(error: &'static BuiltinErrorDescriptor) -> RuntimeError {
460    split_error_with_message(error.message, error)
461}
462
463fn strsplit_error_with_message(
464    message: impl Into<String>,
465    error: &'static BuiltinErrorDescriptor,
466) -> RuntimeError {
467    let mut builder = build_runtime_error(message).with_builtin(STRSPLIT_BUILTIN_NAME);
468    if let Some(identifier) = error.identifier {
469        builder = builder.with_identifier(identifier);
470    }
471    builder.build()
472}
473
474fn strsplit_error(error: &'static BuiltinErrorDescriptor) -> RuntimeError {
475    strsplit_error_with_message(error.message, error)
476}
477
478#[runtime_builtin(
479    name = "split",
480    category = "strings/transform",
481    summary = "Split text inputs into substrings using delimiter rules.",
482    keywords = "split,strsplit,delimiter,CollapseDelimiters,IncludeDelimiters",
483    accel = "sink",
484    type_resolver(string_array_type),
485    descriptor(crate::builtins::strings::transform::split::SPLIT_DESCRIPTOR),
486    builtin_path = "crate::builtins::strings::transform::split"
487)]
488async fn split_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
489    let text = gather_if_needed_async(&text).await.map_err(map_flow)?;
490    let mut args: Vec<Value> = Vec::with_capacity(rest.len());
491    for arg in rest {
492        args.push(gather_if_needed_async(&arg).await.map_err(map_flow)?);
493    }
494
495    let options = SplitOptions::parse(&args)?;
496    let matrix = TextMatrix::from_value(text)?;
497    matrix.into_split_result(&options)
498}
499
500#[runtime_builtin(
501    name = "strsplit",
502    category = "strings/transform",
503    summary = "Split scalar text into substrings using simple or regex delimiters.",
504    keywords = "strsplit,split,delimiter,CollapseDelimiters,DelimiterType,matches",
505    accel = "sink",
506    type_resolver(unknown_type),
507    descriptor(crate::builtins::strings::transform::split::STRSPLIT_DESCRIPTOR),
508    builtin_path = "crate::builtins::strings::transform::split"
509)]
510async fn strsplit_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
511    let text = gather_if_needed_async(&text)
512        .await
513        .map_err(|err| map_control_flow_with_builtin(err, STRSPLIT_BUILTIN_NAME))?;
514    let mut args = Vec::with_capacity(rest.len());
515    for arg in rest {
516        args.push(
517            gather_if_needed_async(&arg)
518                .await
519                .map_err(|err| map_control_flow_with_builtin(err, STRSPLIT_BUILTIN_NAME))?,
520        );
521    }
522
523    let (input_kind, subject) = extract_strsplit_subject(text)?;
524    let options = StrsplitOptions::parse(&args)?;
525    let (parts, matches) = strsplit_text(&subject, &options)?;
526    let parts_value = make_strsplit_output(parts, input_kind)?;
527
528    if let Some(out_count) = crate::output_count::current_output_count() {
529        if out_count == 0 {
530            return Ok(Value::OutputList(Vec::new()));
531        }
532        let matches_value = make_strsplit_output(matches, input_kind)?;
533        return Ok(crate::output_count::output_list_with_padding(
534            out_count,
535            vec![parts_value, matches_value],
536        ));
537    }
538
539    Ok(parts_value)
540}
541
/// How `split` locates the boundaries between tokens.
#[derive(Clone)]
enum DelimiterSpec {
    /// Split on whitespace characters (used when no delimiter argument is given).
    Whitespace,
    /// Split on any of the listed literal patterns, tried in list order.
    Patterns(Vec<String>),
}
547
548#[derive(Clone)]
549struct SplitOptions {
550    delimiters: DelimiterSpec,
551    collapse_delimiters: bool,
552    include_delimiters: bool,
553}
554
555impl SplitOptions {
556    fn parse(args: &[Value]) -> BuiltinResult<Self> {
557        let mut index = 0usize;
558        let mut delimiters = DelimiterSpec::Whitespace;
559
560        if index < args.len() && !is_name_key(&args[index]) {
561            let list = extract_delimiters(&args[index])?;
562            if list.is_empty() {
563                return Err(split_error(&SPLIT_ERROR_EMPTY_DELIMITER));
564            }
565            let mut seen = HashSet::new();
566            let mut patterns: Vec<String> = Vec::new();
567            for pattern in list {
568                if pattern.is_empty() {
569                    return Err(split_error(&SPLIT_ERROR_EMPTY_DELIMITER));
570                }
571                if seen.insert(pattern.clone()) {
572                    patterns.push(pattern);
573                }
574            }
575            patterns.sort_by_key(|pat| std::cmp::Reverse(pat.len()));
576            delimiters = DelimiterSpec::Patterns(patterns);
577            index += 1;
578        }
579
580        let mut collapse = match delimiters {
581            DelimiterSpec::Whitespace => true,
582            DelimiterSpec::Patterns(_) => false,
583        };
584        let mut include = false;
585
586        while index < args.len() {
587            let name = match name_key(&args[index]) {
588                Some(NameKey::CollapseDelimiters) => NameKey::CollapseDelimiters,
589                Some(NameKey::IncludeDelimiters) => NameKey::IncludeDelimiters,
590                None => return Err(split_error(&SPLIT_ERROR_UNKNOWN_NAME)),
591            };
592            index += 1;
593            if index >= args.len() {
594                return Err(split_error(&SPLIT_ERROR_NAME_VALUE_PAIR));
595            }
596            let value = &args[index];
597            index += 1;
598
599            match name {
600                NameKey::CollapseDelimiters => {
601                    collapse = parse_bool(value, "CollapseDelimiters")?;
602                }
603                NameKey::IncludeDelimiters => {
604                    include = parse_bool(value, "IncludeDelimiters")?;
605                }
606            }
607        }
608
609        Ok(Self {
610            delimiters,
611            collapse_delimiters: collapse,
612            include_delimiters: include,
613        })
614    }
615}
616
617struct TextMatrix {
618    data: Vec<String>,
619    rows: usize,
620    cols: usize,
621}
622
623impl TextMatrix {
624    fn from_value(value: Value) -> BuiltinResult<Self> {
625        match value {
626            Value::String(text) => Ok(Self {
627                data: vec![text],
628                rows: 1,
629                cols: 1,
630            }),
631            Value::StringArray(array) => Ok(Self {
632                data: array.data,
633                rows: array.rows,
634                cols: array.cols,
635            }),
636            Value::CharArray(array) => Self::from_char_array(array),
637            Value::Cell(cell) => Self::from_cell_array(cell),
638            _ => Err(split_error(&SPLIT_ERROR_INVALID_INPUT)),
639        }
640    }
641
642    fn from_char_array(array: CharArray) -> BuiltinResult<Self> {
643        let CharArray { data, rows, cols } = array;
644        if rows == 0 {
645            return Ok(Self {
646                data: Vec::new(),
647                rows: 0,
648                cols: 1,
649            });
650        }
651        let mut strings = Vec::with_capacity(rows);
652        for row in 0..rows {
653            strings.push(char_row_to_string_slice(&data, cols, row));
654        }
655        Ok(Self {
656            data: strings,
657            rows,
658            cols: 1,
659        })
660    }
661
662    fn from_cell_array(cell: CellArray) -> BuiltinResult<Self> {
663        let CellArray {
664            data, rows, cols, ..
665        } = cell;
666        let mut strings = Vec::with_capacity(data.len());
667        for col in 0..cols {
668            for row in 0..rows {
669                let idx = row * cols + col;
670                let value_ref: &Value = &data[idx];
671                strings.push(
672                    cell_element_to_string(value_ref)
673                        .ok_or_else(|| split_error(&SPLIT_ERROR_CELL_ELEMENT))?,
674                );
675            }
676        }
677        Ok(Self {
678            data: strings,
679            rows,
680            cols,
681        })
682    }
683
684    fn into_split_result(self, options: &SplitOptions) -> BuiltinResult<Value> {
685        let TextMatrix { data, rows, cols } = self;
686
687        if data.is_empty() {
688            let block_cols = if cols == 0 { 0 } else { 1 };
689            let shape = if cols == 0 {
690                vec![rows, 0]
691            } else {
692                vec![rows, cols * block_cols]
693            };
694            let array = StringArray::new(Vec::new(), shape).map_err(|e| {
695                split_error_with_message(format!("{BUILTIN_NAME}: {e}"), &SPLIT_ERROR_INTERNAL)
696            })?;
697            return Ok(Value::StringArray(array));
698        }
699
700        let mut per_element: Vec<Vec<String>> = Vec::with_capacity(data.len());
701        let mut max_tokens = 0usize;
702        for text in &data {
703            let tokens = split_text(text, options);
704            max_tokens = max_tokens.max(tokens.len());
705            per_element.push(tokens);
706        }
707        if max_tokens == 0 {
708            max_tokens = 1;
709        }
710        let block_cols = max_tokens;
711        let result_cols = block_cols * cols.max(1);
712        let total = rows * result_cols;
713        let missing = "<missing>".to_string();
714        let mut output = vec![missing.clone(); total];
715
716        for col in 0..cols.max(1) {
717            for row in 0..rows {
718                let element_index = if cols == 0 { row } else { row + col * rows };
719                if element_index >= per_element.len() {
720                    continue;
721                }
722                let tokens = &per_element[element_index];
723                for t in 0..block_cols {
724                    let out_col = if cols == 0 { t } else { col * block_cols + t };
725                    let out_index = row + out_col * rows;
726                    if out_index >= output.len() {
727                        continue;
728                    }
729                    if t < tokens.len() {
730                        output[out_index] = tokens[t].clone();
731                    } else {
732                        output[out_index] = missing.clone();
733                    }
734                }
735            }
736        }
737
738        let shape = vec![rows, result_cols];
739        let array = StringArray::new(output, shape).map_err(|e| {
740            split_error_with_message(format!("{BUILTIN_NAME}: {e}"), &SPLIT_ERROR_INTERNAL)
741        })?;
742        Ok(Value::StringArray(array))
743    }
744}
745
746fn split_text(text: &str, options: &SplitOptions) -> Vec<String> {
747    if is_missing_string(text) {
748        return vec![text.to_string()];
749    }
750    match &options.delimiters {
751        DelimiterSpec::Whitespace => split_whitespace(text, options),
752        DelimiterSpec::Patterns(patterns) => split_by_patterns(text, patterns, options),
753    }
754}
755
756fn split_whitespace(text: &str, options: &SplitOptions) -> Vec<String> {
757    if text.is_empty() {
758        return vec![String::new()];
759    }
760
761    let mut parts: Vec<String> = Vec::new();
762    let mut idx = 0usize;
763    let mut last = 0usize;
764    let len = text.len();
765
766    while idx < len {
767        let ch = text[idx..].chars().next().unwrap();
768        let width = ch.len_utf8();
769        if !ch.is_whitespace() {
770            idx += width;
771            continue;
772        }
773
774        let token = &text[last..idx];
775        if !token.is_empty() || !options.collapse_delimiters {
776            parts.push(token.to_string());
777        }
778
779        let run_end = advance_whitespace(text, idx);
780        if options.include_delimiters {
781            if options.collapse_delimiters {
782                parts.push(text[idx..run_end].to_string());
783            } else {
784                parts.push(text[idx..idx + width].to_string());
785            }
786        }
787
788        if options.collapse_delimiters {
789            idx = run_end;
790            last = run_end;
791        } else {
792            idx += width;
793            last = idx;
794        }
795    }
796
797    let tail = &text[last..];
798    if !tail.is_empty() || !options.collapse_delimiters {
799        parts.push(tail.to_string());
800    }
801    if parts.is_empty() {
802        parts.push(String::new());
803    }
804    parts
805}
806
807fn split_by_patterns(text: &str, patterns: &[String], options: &SplitOptions) -> Vec<String> {
808    if patterns.is_empty() {
809        return vec![text.to_string()];
810    }
811
812    let mut parts: Vec<String> = Vec::new();
813    let mut idx = 0usize;
814    let mut last = 0usize;
815    while idx < text.len() {
816        if let Some(pattern) = patterns
817            .iter()
818            .find(|candidate| text[idx..].starts_with(candidate.as_str()))
819        {
820            let token = &text[last..idx];
821            if !token.is_empty() || !options.collapse_delimiters {
822                parts.push(token.to_string());
823            }
824
825            let pat_len = pattern.len();
826            if options.collapse_delimiters {
827                let mut run_end = idx + pat_len;
828                while run_end < text.len() {
829                    if let Some(next) = patterns
830                        .iter()
831                        .find(|candidate| text[run_end..].starts_with(candidate.as_str()))
832                    {
833                        let len = next.len();
834                        if len == 0 {
835                            break;
836                        }
837                        run_end += len;
838                    } else {
839                        break;
840                    }
841                }
842                if options.include_delimiters {
843                    parts.push(text[idx..run_end].to_string());
844                }
845                idx = run_end;
846                last = run_end;
847            } else {
848                if options.include_delimiters {
849                    parts.push(text[idx..idx + pat_len].to_string());
850                }
851                idx += pat_len;
852                last = idx;
853            }
854
855            continue;
856        }
857        let ch = text[idx..].chars().next().unwrap();
858        idx += ch.len_utf8();
859    }
860    let tail = &text[last..];
861    if !tail.is_empty() || !options.collapse_delimiters {
862        parts.push(tail.to_string());
863    }
864    if parts.is_empty() {
865        parts.push(String::new());
866    }
867    parts
868}
869
/// Returns the byte offset of the first non-whitespace character at or after
/// `start`, or `text.len()` when only whitespace remains.
///
/// An offset at or beyond the end of `text` is returned unchanged.
fn advance_whitespace(text: &str, start: usize) -> usize {
    if start >= text.len() {
        return start;
    }
    let skipped: usize = text[start..]
        .chars()
        .take_while(|ch| ch.is_whitespace())
        .map(char::len_utf8)
        .sum();
    start + skipped
}
880
881fn extract_delimiters(value: &Value) -> BuiltinResult<Vec<String>> {
882    match value {
883        Value::String(text) => Ok(vec![text.clone()]),
884        Value::StringArray(array) => Ok(array.data.clone()),
885        Value::CharArray(array) => {
886            if array.rows == 0 {
887                return Ok(Vec::new());
888            }
889            let mut entries = Vec::with_capacity(array.rows);
890            for row in 0..array.rows {
891                entries.push(char_row_to_string_slice(&array.data, array.cols, row));
892            }
893            Ok(entries)
894        }
895        Value::Cell(cell) => {
896            let mut entries = Vec::with_capacity(cell.data.len());
897            for element in &cell.data {
898                entries.push(
899                    cell_element_to_string(element)
900                        .ok_or_else(|| split_error(&SPLIT_ERROR_CELL_ELEMENT))?,
901                );
902            }
903            Ok(entries)
904        }
905        _ => Err(split_error(&SPLIT_ERROR_DELIMITER_TYPE)),
906    }
907}
908
909fn cell_element_to_string(value: &Value) -> Option<String> {
910    match value {
911        Value::String(text) => Some(text.clone()),
912        Value::StringArray(array) if array.data.len() == 1 => Some(array.data[0].clone()),
913        Value::CharArray(array) if array.rows <= 1 => {
914            if array.rows == 0 {
915                Some(String::new())
916            } else {
917                Some(char_row_to_string_slice(&array.data, array.cols, 0))
918            }
919        }
920        _ => None,
921    }
922}
923
924fn value_to_scalar_string(value: &Value) -> Option<String> {
925    match value {
926        Value::String(text) => Some(text.clone()),
927        Value::StringArray(array) if array.data.len() == 1 => Some(array.data[0].clone()),
928        Value::CharArray(array) if array.rows <= 1 => {
929            if array.rows == 0 {
930                Some(String::new())
931            } else {
932                Some(char_row_to_string_slice(&array.data, array.cols, 0))
933            }
934        }
935        Value::Cell(cell) if cell.data.len() == 1 => cell_element_to_string(&cell.data[0]),
936        _ => None,
937    }
938}
939
/// Parses the value of the name-value option `name` as a boolean, delegating
/// to `parse_bool_for_builtin` with `BUILTIN_NAME` and
/// `SPLIT_ERROR_OPTION_VALUE` as the error context.
fn parse_bool(value: &Value, name: &str) -> BuiltinResult<bool> {
    parse_bool_for_builtin(value, name, BUILTIN_NAME, &SPLIT_ERROR_OPTION_VALUE)
}
943
944fn parse_bool_for_builtin(
945    value: &Value,
946    name: &str,
947    builtin_name: &'static str,
948    error: &'static BuiltinErrorDescriptor,
949) -> BuiltinResult<bool> {
950    match value {
951        Value::Bool(b) => Ok(*b),
952        Value::Int(i) => Ok(i.to_i64() != 0),
953        Value::Num(n) => Ok(*n != 0.0),
954        Value::LogicalArray(array) => {
955            if array.data.len() == 1 {
956                Ok(array.data[0] != 0)
957            } else {
958                Err(builtin_error_with_descriptor(
959                    builtin_name,
960                    format!(
961                        "{builtin_name}: value for '{}' must be logical true or false",
962                        name
963                    ),
964                    error,
965                ))
966            }
967        }
968        Value::Tensor(tensor) => {
969            if tensor.data.len() == 1 {
970                Ok(tensor.data[0] != 0.0)
971            } else {
972                Err(builtin_error_with_descriptor(
973                    builtin_name,
974                    format!(
975                        "{builtin_name}: value for '{}' must be logical true or false",
976                        name
977                    ),
978                    error,
979                ))
980            }
981        }
982        _ => {
983            if let Some(text) = value_to_scalar_string(value) {
984                let lowered = text.trim().to_ascii_lowercase();
985                match lowered.as_str() {
986                    "true" | "on" | "yes" => Ok(true),
987                    "false" | "off" | "no" => Ok(false),
988                    _ => Err(builtin_error_with_descriptor(
989                        builtin_name,
990                        format!(
991                            "{builtin_name}: value for '{}' must be logical true or false",
992                            name
993                        ),
994                        error,
995                    )),
996                }
997            } else {
998                Err(builtin_error_with_descriptor(
999                    builtin_name,
1000                    format!(
1001                        "{builtin_name}: value for '{}' must be logical true or false",
1002                        name
1003                    ),
1004                    error,
1005                ))
1006            }
1007        }
1008    }
1009}
1010
1011fn builtin_error_with_descriptor(
1012    builtin_name: &'static str,
1013    message: impl Into<String>,
1014    error: &'static BuiltinErrorDescriptor,
1015) -> RuntimeError {
1016    let mut builder = build_runtime_error(message).with_builtin(builtin_name);
1017    if let Some(identifier) = error.identifier {
1018        builder = builder.with_identifier(identifier);
1019    }
1020    builder.build()
1021}
1022
1023fn extract_strsplit_subject(value: Value) -> BuiltinResult<(StrsplitInputKind, String)> {
1024    match value {
1025        Value::String(text) => Ok((StrsplitInputKind::String, text)),
1026        Value::StringArray(array) if array.data.len() == 1 => {
1027            Ok((StrsplitInputKind::String, array.data[0].clone()))
1028        }
1029        Value::CharArray(array) if array.rows <= 1 => {
1030            if array.rows == 0 {
1031                Ok((StrsplitInputKind::Char, String::new()))
1032            } else {
1033                Ok((
1034                    StrsplitInputKind::Char,
1035                    char_row_to_string_slice(&array.data, array.cols, 0),
1036                ))
1037            }
1038        }
1039        _ => Err(strsplit_error(&STRSPLIT_ERROR_INVALID_INPUT)),
1040    }
1041}
1042
1043fn strsplit_text(
1044    text: &str,
1045    options: &StrsplitOptions,
1046) -> BuiltinResult<(Vec<String>, Vec<String>)> {
1047    let regex = compile_strsplit_regex(options)?;
1048    let mut parts = Vec::new();
1049    let mut matches = Vec::new();
1050    let mut last = 0usize;
1051
1052    for found in regex.find_iter(text) {
1053        parts.push(text[last..found.start()].to_string());
1054        matches.push(found.as_str().to_string());
1055        last = found.end();
1056    }
1057
1058    parts.push(text[last..].to_string());
1059    Ok((parts, matches))
1060}
1061
1062fn compile_strsplit_regex(options: &StrsplitOptions) -> BuiltinResult<regex::Regex> {
1063    let pattern = match (&options.delimiters, options.delimiter_type) {
1064        (None, _) => {
1065            if options.collapse_delimiters {
1066                "[\\x20\\x0C\\n\\r\\t\\x0B]+".to_string()
1067            } else {
1068                "[\\x20\\x0C\\n\\r\\t\\x0B]".to_string()
1069            }
1070        }
1071        (Some(delimiters), StrsplitDelimiterType::Simple) => {
1072            let alternation = delimiters
1073                .iter()
1074                .map(|pattern| regex::escape(pattern))
1075                .collect::<Vec<_>>()
1076                .join("|");
1077            if options.collapse_delimiters {
1078                format!("(?:{alternation})+")
1079            } else {
1080                format!("(?:{alternation})")
1081            }
1082        }
1083        (Some(delimiters), StrsplitDelimiterType::RegularExpression) => {
1084            let alternation = delimiters.join("|");
1085            if options.collapse_delimiters {
1086                format!("(?:{alternation})+")
1087            } else {
1088                format!("(?:{alternation})")
1089            }
1090        }
1091    };
1092
1093    RegexBuilder::new(&pattern).build().map_err(|err| {
1094        strsplit_error_with_message(format!("strsplit: {err}"), &STRSPLIT_ERROR_REGEX_INVALID)
1095    })
1096}
1097
1098fn make_strsplit_output(tokens: Vec<String>, kind: StrsplitInputKind) -> BuiltinResult<Value> {
1099    match kind {
1100        StrsplitInputKind::String => {
1101            let len = tokens.len();
1102            let array = StringArray::new(tokens, vec![1, len]).map_err(|err| {
1103                strsplit_error_with_message(format!("strsplit: {err}"), &STRSPLIT_ERROR_INTERNAL)
1104            })?;
1105            Ok(Value::StringArray(array))
1106        }
1107        StrsplitInputKind::Char => {
1108            let values: Vec<Value> = tokens.into_iter().map(Value::String).collect();
1109            let len = values.len();
1110            make_cell(values, 1, len).map_err(|err| {
1111                strsplit_error_with_message(format!("strsplit: {err}"), &STRSPLIT_ERROR_INTERNAL)
1112            })
1113        }
1114    }
1115}
1116
/// Name-value option names recognised by `name_key`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NameKey {
    /// Matched from the text `collapsedelimiters` (case-insensitive).
    CollapseDelimiters,
    /// Matched from the text `includedelimiters` (case-insensitive).
    IncludeDelimiters,
}
1122
/// Kind of text input handed to `strsplit`; selects the output container in
/// `make_strsplit_output`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StrsplitInputKind {
    /// Character-vector input; tokens are returned in a cell array.
    Char,
    /// String input; tokens are returned in a string array.
    String,
}
1128
/// How `strsplit` delimiters are interpreted when the delimiter regex is
/// built.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StrsplitDelimiterType {
    /// Delimiters are literal text and are regex-escaped before use.
    Simple,
    /// Delimiters are regular expressions and are used verbatim.
    RegularExpression,
}
1134
/// Parsed arguments of a `strsplit` call (everything after the text input).
#[derive(Clone)]
struct StrsplitOptions {
    /// Explicit delimiters; `None` selects the default whitespace class when
    /// the delimiter regex is compiled.
    delimiters: Option<Vec<String>>,
    /// Whether a run of consecutive delimiters counts as one (`parse`
    /// defaults this to `true`).
    collapse_delimiters: bool,
    /// Whether delimiters are literal text or regular expressions (`parse`
    /// defaults this to `Simple`).
    delimiter_type: StrsplitDelimiterType,
}
1141
1142impl StrsplitOptions {
1143    fn parse(args: &[Value]) -> BuiltinResult<Self> {
1144        let mut index = 0usize;
1145        let mut delimiters = None;
1146
1147        if index < args.len() && !is_strsplit_name_key(&args[index]) {
1148            let list = extract_delimiters(&args[index])
1149                .map_err(|_| strsplit_error(&STRSPLIT_ERROR_DELIMITER_TYPE))?;
1150            delimiters = Some(list);
1151            index += 1;
1152        }
1153
1154        let mut collapse_delimiters = true;
1155        let mut delimiter_type = StrsplitDelimiterType::Simple;
1156
1157        while index < args.len() {
1158            let name = match strsplit_name_key(&args[index]) {
1159                Some(name) => name,
1160                None => return Err(strsplit_error(&STRSPLIT_ERROR_UNKNOWN_NAME)),
1161            };
1162            index += 1;
1163            if index >= args.len() {
1164                return Err(strsplit_error(&STRSPLIT_ERROR_NAME_VALUE_PAIR));
1165            }
1166            let value = &args[index];
1167            index += 1;
1168
1169            match name {
1170                StrsplitNameKey::CollapseDelimiters => {
1171                    collapse_delimiters = parse_bool_for_builtin(
1172                        value,
1173                        "CollapseDelimiters",
1174                        STRSPLIT_BUILTIN_NAME,
1175                        &STRSPLIT_ERROR_OPTION_VALUE,
1176                    )?;
1177                }
1178                StrsplitNameKey::DelimiterType => {
1179                    let text = value_to_scalar_string(value)
1180                        .ok_or_else(|| strsplit_error(&STRSPLIT_ERROR_DELIMITER_MODE))?;
1181                    delimiter_type = match text.trim().to_ascii_lowercase().as_str() {
1182                        "simple" => StrsplitDelimiterType::Simple,
1183                        "regularexpression" => StrsplitDelimiterType::RegularExpression,
1184                        _ => return Err(strsplit_error(&STRSPLIT_ERROR_DELIMITER_MODE)),
1185                    };
1186                }
1187            }
1188        }
1189
1190        if let Some(patterns) = &delimiters {
1191            if patterns.is_empty() {
1192                return Err(strsplit_error(&STRSPLIT_ERROR_EMPTY_DELIMITER));
1193            }
1194            if matches!(delimiter_type, StrsplitDelimiterType::Simple)
1195                && patterns.iter().any(|pattern| pattern.is_empty())
1196            {
1197                return Err(strsplit_error(&STRSPLIT_ERROR_EMPTY_DELIMITER));
1198            }
1199        }
1200
1201        Ok(Self {
1202            delimiters,
1203            collapse_delimiters,
1204            delimiter_type,
1205        })
1206    }
1207}
1208
/// Name-value option names recognised by `strsplit_name_key`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum StrsplitNameKey {
    /// Matched from the text `collapsedelimiters` (case-insensitive).
    CollapseDelimiters,
    /// Matched from the text `delimitertype` (case-insensitive).
    DelimiterType,
}
1214
/// Returns `true` when `value` is text that `name_key` recognises as an
/// option name.
fn is_name_key(value: &Value) -> bool {
    name_key(value).is_some()
}
1218
/// Returns `true` when `value` is text that `strsplit_name_key` recognises as
/// an option name.
fn is_strsplit_name_key(value: &Value) -> bool {
    strsplit_name_key(value).is_some()
}
1222
1223fn name_key(value: &Value) -> Option<NameKey> {
1224    value_to_scalar_string(value).and_then(|text| {
1225        let lowered = text.trim().to_ascii_lowercase();
1226        match lowered.as_str() {
1227            "collapsedelimiters" => Some(NameKey::CollapseDelimiters),
1228            "includedelimiters" => Some(NameKey::IncludeDelimiters),
1229            _ => None,
1230        }
1231    })
1232}
1233
1234fn strsplit_name_key(value: &Value) -> Option<StrsplitNameKey> {
1235    value_to_scalar_string(value).and_then(|text| {
1236        let lowered = text.trim().to_ascii_lowercase();
1237        match lowered.as_str() {
1238            "collapsedelimiters" => Some(StrsplitNameKey::CollapseDelimiters),
1239            "delimitertype" => Some(StrsplitNameKey::DelimiterType),
1240            _ => None,
1241        }
1242    })
1243}
1244
1245#[cfg(test)]
1246pub(crate) mod tests {
1247    use super::*;
1248    use runmat_builtins::{CellArray, LogicalArray, ResolveContext, Tensor, Type};
1249
1250    fn split_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
1251        futures::executor::block_on(super::split_builtin(text, rest))
1252    }
1253
1254    fn strsplit_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
1255        futures::executor::block_on(super::strsplit_builtin(text, rest))
1256    }
1257
1258    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1259    #[test]
1260    fn split_string_whitespace_default() {
1261        let input = Value::String("RunMat Accelerate Planner".to_string());
1262        let result = split_builtin(input, Vec::new()).expect("split");
1263        match result {
1264            Value::StringArray(array) => {
1265                assert_eq!(array.shape, vec![1, 3]);
1266                assert_eq!(
1267                    array.data,
1268                    vec![
1269                        "RunMat".to_string(),
1270                        "Accelerate".to_string(),
1271                        "Planner".to_string()
1272                    ]
1273                );
1274            }
1275            other => panic!("expected string array, got {other:?}"),
1276        }
1277    }
1278
1279    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1280    #[test]
1281    fn split_string_custom_delimiter() {
1282        let input = Value::String("alpha,beta,gamma".to_string());
1283        let args = vec![Value::String(",".to_string())];
1284        let result = split_builtin(input, args).expect("split");
1285        match result {
1286            Value::StringArray(array) => {
1287                assert_eq!(array.shape, vec![1, 3]);
1288                assert_eq!(
1289                    array.data,
1290                    vec!["alpha".to_string(), "beta".to_string(), "gamma".to_string()]
1291                );
1292            }
1293            other => panic!("expected string array, got {other:?}"),
1294        }
1295    }
1296
1297    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1298    #[test]
1299    fn split_include_delimiters_true() {
1300        let input = Value::String("A+B-C".to_string());
1301        let args = vec![
1302            Value::StringArray(
1303                StringArray::new(vec!["+".to_string(), "-".to_string()], vec![1, 2]).unwrap(),
1304            ),
1305            Value::String("IncludeDelimiters".to_string()),
1306            Value::Bool(true),
1307        ];
1308        let result = split_builtin(input, args).expect("split");
1309        match result {
1310            Value::StringArray(array) => {
1311                assert_eq!(array.shape, vec![1, 5]);
1312                assert_eq!(
1313                    array.data,
1314                    vec![
1315                        "A".to_string(),
1316                        "+".to_string(),
1317                        "B".to_string(),
1318                        "-".to_string(),
1319                        "C".to_string()
1320                    ]
1321                );
1322            }
1323            other => panic!("expected string array, got {other:?}"),
1324        }
1325    }
1326
1327    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1328    #[test]
1329    fn split_include_delimiters_whitespace_collapse_default() {
1330        let input = Value::String("A  B".to_string());
1331        let args = vec![
1332            Value::String("IncludeDelimiters".to_string()),
1333            Value::Bool(true),
1334        ];
1335        let result = split_builtin(input, args).expect("split");
1336        match result {
1337            Value::StringArray(array) => {
1338                assert_eq!(array.shape, vec![1, 3]);
1339                assert_eq!(
1340                    array.data,
1341                    vec!["A".to_string(), "  ".to_string(), "B".to_string()]
1342                );
1343            }
1344            other => panic!("expected string array, got {other:?}"),
1345        }
1346    }
1347
1348    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1349    #[test]
1350    fn split_patterns_include_delimiters_collapse_true() {
1351        let input = Value::String("a,,b".to_string());
1352        let args = vec![
1353            Value::String(",".to_string()),
1354            Value::String("IncludeDelimiters".to_string()),
1355            Value::Bool(true),
1356            Value::String("CollapseDelimiters".to_string()),
1357            Value::Bool(true),
1358        ];
1359        let result = split_builtin(input, args).expect("split");
1360        match result {
1361            Value::StringArray(array) => {
1362                assert_eq!(array.shape, vec![1, 3]);
1363                assert_eq!(
1364                    array.data,
1365                    vec!["a".to_string(), ",,".to_string(), "b".to_string()]
1366                );
1367            }
1368            other => panic!("expected string array, got {other:?}"),
1369        }
1370    }
1371
1372    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1373    #[test]
1374    fn split_collapse_false_preserves_empty_segments() {
1375        let input = Value::String("one,,three,".to_string());
1376        let args = vec![
1377            Value::String(",".to_string()),
1378            Value::String("CollapseDelimiters".to_string()),
1379            Value::Bool(false),
1380        ];
1381        let result = split_builtin(input, args).expect("split");
1382        match result {
1383            Value::StringArray(array) => {
1384                assert_eq!(array.shape, vec![1, 4]);
1385                assert_eq!(
1386                    array.data,
1387                    vec![
1388                        "one".to_string(),
1389                        "".to_string(),
1390                        "three".to_string(),
1391                        "".to_string()
1392                    ]
1393                );
1394            }
1395            other => panic!("expected string array, got {other:?}"),
1396        }
1397    }
1398
1399    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1400    #[test]
1401    fn split_character_array_rows() {
1402        let mut row1: Vec<char> = "GPU Accelerate".chars().collect();
1403        let mut row2: Vec<char> = "VM Engine".chars().collect();
1404        let width = row1.len().max(row2.len());
1405        row1.resize(width, ' ');
1406        row2.resize(width, ' ');
1407        let mut data = row1;
1408        data.extend(row2);
1409        let char_array = CharArray::new(data, 2, width).unwrap();
1410        let input = Value::CharArray(char_array);
1411        let result = split_builtin(input, Vec::new()).expect("split");
1412        match result {
1413            Value::StringArray(array) => {
1414                assert_eq!(array.shape, vec![2, 2]);
1415                assert_eq!(
1416                    array.data,
1417                    vec![
1418                        "GPU".to_string(),
1419                        "VM".to_string(),
1420                        "Accelerate".to_string(),
1421                        "Engine".to_string()
1422                    ]
1423                );
1424            }
1425            other => panic!("expected string array, got {other:?}"),
1426        }
1427    }
1428
1429    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1430    #[test]
1431    fn split_string_array_multiple_columns() {
1432        let data = vec![
1433            "RunMat Core".to_string(),
1434            "VM Interpreter".to_string(),
1435            "Accelerate Engine".to_string(),
1436            "<missing>".to_string(),
1437        ];
1438        let array = StringArray::new(data, vec![2, 2]).unwrap();
1439        let input = Value::StringArray(array);
1440        let result = split_builtin(input, Vec::new()).expect("split");
1441        match result {
1442            Value::StringArray(array) => {
1443                assert_eq!(array.shape, vec![2, 4]);
1444                assert_eq!(
1445                    array.data,
1446                    vec![
1447                        "RunMat".to_string(),
1448                        "VM".to_string(),
1449                        "Core".to_string(),
1450                        "Interpreter".to_string(),
1451                        "Accelerate".to_string(),
1452                        "<missing>".to_string(),
1453                        "Engine".to_string(),
1454                        "<missing>".to_string()
1455                    ]
1456                );
1457            }
1458            other => panic!("expected string array, got {other:?}"),
1459        }
1460    }
1461
1462    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1463    #[test]
1464    fn split_cell_array_outputs_string_array() {
1465        let values = vec![
1466            Value::String("RunMat Snapshot".to_string()),
1467            Value::String("Fusion Planner".to_string()),
1468        ];
1469        let cell = crate::make_cell(values, 2, 1).expect("cell");
1470        let result = split_builtin(cell, vec![Value::String(" ".to_string())]).expect("split");
1471        match result {
1472            Value::StringArray(array) => {
1473                assert_eq!(array.shape, vec![2, 2]);
1474                assert_eq!(
1475                    array.data,
1476                    vec![
1477                        "RunMat".to_string(),
1478                        "Fusion".to_string(),
1479                        "Snapshot".to_string(),
1480                        "Planner".to_string()
1481                    ]
1482                );
1483            }
1484            other => panic!("expected string array, got {other:?}"),
1485        }
1486    }
1487
1488    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1489    #[test]
1490    fn split_cell_array_multiple_columns() {
1491        let values = vec![
1492            Value::String("alpha beta".to_string()),
1493            Value::String("gamma".to_string()),
1494            Value::String("delta epsilon".to_string()),
1495            Value::String("<missing>".to_string()),
1496        ];
1497        let cell = crate::make_cell(values, 2, 2).expect("cell");
1498        let result = split_builtin(cell, Vec::new()).expect("split");
1499        match result {
1500            Value::StringArray(array) => {
1501                assert_eq!(array.shape, vec![2, 4]);
1502                assert_eq!(
1503                    array.data,
1504                    vec![
1505                        "alpha".to_string(),
1506                        "delta".to_string(),
1507                        "beta".to_string(),
1508                        "epsilon".to_string(),
1509                        "gamma".to_string(),
1510                        "<missing>".to_string(),
1511                        "<missing>".to_string(),
1512                        "<missing>".to_string()
1513                    ]
1514                );
1515            }
1516            other => panic!("expected string array, got {other:?}"),
1517        }
1518    }
1519
1520    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1521    #[test]
1522    fn split_missing_string_propagates() {
1523        let input = Value::String("<missing>".to_string());
1524        let result = split_builtin(input, Vec::new()).expect("split");
1525        match result {
1526            Value::StringArray(array) => {
1527                assert_eq!(array.shape, vec![1, 1]);
1528                assert_eq!(array.data, vec!["<missing>".to_string()]);
1529            }
1530            other => panic!("expected string array, got {other:?}"),
1531        }
1532    }
1533
1534    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1535    #[test]
1536    fn split_invalid_name_value_pair_errors() {
1537        let input = Value::String("abc".to_string());
1538        let args = vec![Value::String("CollapseDelimiters".to_string())];
1539        let err = split_builtin(input, args).unwrap_err();
1540        assert!(err.to_string().contains("name-value"));
1541    }
1542
1543    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1544    #[test]
1545    fn split_invalid_text_argument_errors() {
1546        let err = split_builtin(Value::Num(1.0), Vec::new()).unwrap_err();
1547        assert!(err.to_string().contains("first argument"));
1548    }
1549
1550    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1551    #[test]
1552    fn split_invalid_delimiter_type_errors() {
1553        let err =
1554            split_builtin(Value::String("abc".to_string()), vec![Value::Num(1.0)]).unwrap_err();
1555        assert!(err.to_string().contains("delimiter input"));
1556    }
1557
1558    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1559    #[test]
1560    fn split_empty_delimiter_errors() {
1561        let err = split_builtin(
1562            Value::String("abc".to_string()),
1563            vec![Value::String(String::new())],
1564        )
1565        .unwrap_err();
1566        assert!(err.to_string().contains("at least one character"));
1567    }
1568
1569    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1570    #[test]
1571    fn split_unknown_name_argument_errors() {
1572        let err = split_builtin(
1573            Value::String("abc".to_string()),
1574            vec![
1575                Value::String("UnknownOption".to_string()),
1576                Value::Bool(true),
1577            ],
1578        )
1579        .unwrap_err();
1580        assert!(err.to_string().contains("unrecognized"));
1581    }
1582
1583    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1584    #[test]
1585    fn split_collapse_delimiters_accepts_logical_array() {
1586        let logical = LogicalArray::new(vec![1u8], vec![1]).unwrap();
1587        let args = vec![
1588            Value::String(",".to_string()),
1589            Value::String("CollapseDelimiters".to_string()),
1590            Value::LogicalArray(logical),
1591        ];
1592        let result = split_builtin(Value::String("a,,b".to_string()), args).expect("split");
1593        match result {
1594            Value::StringArray(array) => {
1595                assert_eq!(array.shape, vec![1, 2]);
1596                assert_eq!(array.data, vec!["a".to_string(), "b".to_string()]);
1597            }
1598            other => panic!("expected string array, got {other:?}"),
1599        }
1600    }
1601
1602    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1603    #[test]
1604    fn split_include_delimiters_accepts_tensor_scalar() {
1605        let tensor = Tensor::new(vec![1.0], vec![1, 1]).unwrap();
1606        let args = vec![
1607            Value::String(",".to_string()),
1608            Value::String("IncludeDelimiters".to_string()),
1609            Value::Tensor(tensor),
1610        ];
1611        let result = split_builtin(Value::String("a,b".to_string()), args).expect("split");
1612        match result {
1613            Value::StringArray(array) => {
1614                assert_eq!(array.shape, vec![1, 3]);
1615                assert_eq!(
1616                    array.data,
1617                    vec!["a".to_string(), ",".to_string(), "b".to_string()]
1618                );
1619            }
1620            other => panic!("expected string array, got {other:?}"),
1621        }
1622    }
1623
1624    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1625    #[test]
1626    fn split_cell_array_mixed_inputs() {
1627        let handles: Vec<_> = vec![
1628            runmat_gc::gc_allocate(Value::String("alpha beta".to_string())).unwrap(),
1629            runmat_gc::gc_allocate(Value::CharArray(
1630                CharArray::new("gamma".chars().collect(), 1, 5).unwrap(),
1631            ))
1632            .unwrap(),
1633        ];
1634        let cell =
1635            Value::Cell(CellArray::new_handles(handles, 1, 2).expect("cell array construction"));
1636        let result = split_builtin(cell, Vec::new()).expect("split");
1637        match result {
1638            Value::StringArray(array) => {
1639                assert_eq!(array.shape, vec![1, 4]);
1640                assert_eq!(
1641                    array.data,
1642                    vec![
1643                        "alpha".to_string(),
1644                        "beta".to_string(),
1645                        "gamma".to_string(),
1646                        "<missing>".to_string()
1647                    ]
1648                );
1649            }
1650            other => panic!("expected string array, got {other:?}"),
1651        }
1652    }
1653
1654    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1655    #[test]
1656    fn strsplit_string_scalar_returns_string_array() {
1657        let result =
1658            strsplit_builtin(Value::String("one two  three".into()), Vec::new()).expect("strsplit");
1659        match result {
1660            Value::StringArray(array) => {
1661                assert_eq!(array.shape, vec![1, 3]);
1662                assert_eq!(
1663                    array.data,
1664                    vec!["one".to_string(), "two".to_string(), "three".to_string()]
1665                );
1666            }
1667            other => panic!("expected string array, got {other:?}"),
1668        }
1669    }
1670
1671    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1672    #[test]
1673    fn strsplit_char_vector_returns_cell() {
1674        let input = Value::CharArray(CharArray::new("a,b".chars().collect(), 1, 3).unwrap());
1675        let result = strsplit_builtin(input, vec![Value::String(",".into())]).expect("strsplit");
1676        match result {
1677            Value::Cell(cell) => {
1678                assert_eq!(cell.rows, 1);
1679                assert_eq!(cell.cols, 2);
1680                assert_eq!(
1681                    unsafe { &*cell.data[0].as_raw() },
1682                    &Value::String("a".into())
1683                );
1684                assert_eq!(
1685                    unsafe { &*cell.data[1].as_raw() },
1686                    &Value::String("b".into())
1687                );
1688            }
1689            other => panic!("expected cell output, got {other:?}"),
1690        }
1691    }
1692
1693    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1694    #[test]
1695    fn strsplit_multi_output_returns_matches() {
1696        let _guard = crate::output_count::push_output_count(Some(2));
1697        let result = strsplit_builtin(
1698            Value::String("a,,b,".into()),
1699            vec![Value::String(",".into())],
1700        )
1701        .expect("strsplit");
1702        match result {
1703            Value::OutputList(values) => {
1704                assert_eq!(values.len(), 2);
1705                match &values[0] {
1706                    Value::StringArray(array) => {
1707                        assert_eq!(
1708                            array.data,
1709                            vec!["a".to_string(), "b".to_string(), "".to_string()]
1710                        );
1711                    }
1712                    other => panic!("expected first output string array, got {other:?}"),
1713                }
1714                match &values[1] {
1715                    Value::StringArray(array) => {
1716                        assert_eq!(array.data, vec![",,".to_string(), ",".to_string()]);
1717                    }
1718                    other => panic!("expected second output string array, got {other:?}"),
1719                }
1720            }
1721            other => panic!("expected output list, got {other:?}"),
1722        }
1723    }
1724
1725    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1726    #[test]
1727    fn strsplit_regular_expression_mode() {
1728        let _guard = crate::output_count::push_output_count(Some(2));
1729        let result = strsplit_builtin(
1730            Value::String("1.21m/s 1.985 m/s".into()),
1731            vec![
1732                Value::String("\\s*m/s\\s*".into()),
1733                Value::String("DelimiterType".into()),
1734                Value::String("RegularExpression".into()),
1735            ],
1736        )
1737        .expect("strsplit");
1738        match result {
1739            Value::OutputList(values) => {
1740                match &values[0] {
1741                    Value::StringArray(array) => {
1742                        assert_eq!(
1743                            array.data,
1744                            vec!["1.21".to_string(), "1.985".to_string(), "".to_string()]
1745                        );
1746                    }
1747                    other => panic!("expected split output string array, got {other:?}"),
1748                }
1749                match &values[1] {
1750                    Value::StringArray(array) => {
1751                        assert_eq!(array.data, vec!["m/s ".to_string(), " m/s".to_string()]);
1752                    }
1753                    other => panic!("expected matches output string array, got {other:?}"),
1754                }
1755            }
1756            other => panic!("expected output list, got {other:?}"),
1757        }
1758    }
1759
1760    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1761    #[test]
1762    fn strsplit_collapse_false_preserves_empty_segments() {
1763        let result = strsplit_builtin(
1764            Value::String("a,,b".into()),
1765            vec![
1766                Value::String(",".into()),
1767                Value::String("CollapseDelimiters".into()),
1768                Value::Bool(false),
1769            ],
1770        )
1771        .expect("strsplit");
1772        match result {
1773            Value::StringArray(array) => {
1774                assert_eq!(
1775                    array.data,
1776                    vec!["a".to_string(), "".to_string(), "b".to_string()]
1777                );
1778            }
1779            other => panic!("expected string array, got {other:?}"),
1780        }
1781    }
1782
1783    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1784    #[test]
1785    fn strsplit_rejects_nonscalar_text_inputs() {
1786        let input = Value::StringArray(
1787            StringArray::new(vec!["a b".into(), "c d".into()], vec![2, 1]).unwrap(),
1788        );
1789        let err = strsplit_builtin(input, Vec::new()).unwrap_err();
1790        assert!(err.to_string().contains("first argument"));
1791    }
1792
1793    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
1794    #[test]
1795    fn strsplit_invalid_delimiter_type_option_errors() {
1796        let err = strsplit_builtin(
1797            Value::String("a,b".into()),
1798            vec![
1799                Value::String(",".into()),
1800                Value::String("DelimiterType".into()),
1801                Value::String("BadMode".into()),
1802            ],
1803        )
1804        .unwrap_err();
1805        assert!(err.to_string().contains("DelimiterType"));
1806    }
1807
1808    #[test]
1809    fn split_type_is_string_array() {
1810        assert_eq!(
1811            string_array_type(&[Type::String], &ResolveContext::new(Vec::new())),
1812            Type::cell_of(Type::String)
1813        );
1814    }
1815}