1use std::collections::HashSet;
4
5use regex::RegexBuilder;
6use runmat_builtins::{
7 BuiltinCompletionPolicy, BuiltinDescriptor, BuiltinErrorDescriptor, BuiltinOutputMode,
8 BuiltinParamArity, BuiltinParamDescriptor, BuiltinParamType, BuiltinSignatureDescriptor,
9 CellArray, CharArray, StringArray, Value,
10};
11use runmat_macros::runtime_builtin;
12
13use crate::builtins::common::map_control_flow_with_builtin;
14use crate::builtins::common::spec::{
15 BroadcastSemantics, BuiltinFusionSpec, BuiltinGpuSpec, ConstantStrategy, GpuOpKind,
16 ReductionNaN, ResidencyPolicy, ShapeRequirements,
17};
18use crate::builtins::strings::common::{char_row_to_string_slice, is_missing_string};
19use crate::builtins::strings::type_resolvers::{string_array_type, unknown_type};
20use crate::{build_runtime_error, gather_if_needed_async, make_cell, BuiltinResult, RuntimeError};
21
/// GPU execution metadata registered for the `split` builtin.
///
/// The spec lists no supported precisions and no provider hooks and uses
/// `ResidencyPolicy::GatherImmediately`; as `notes` states, the builtin runs
/// on the CPU after GPU-resident inputs are gathered to host memory.
#[runmat_macros::register_gpu_spec(builtin_path = "crate::builtins::strings::transform::split")]
pub const GPU_SPEC: BuiltinGpuSpec = BuiltinGpuSpec {
    name: "split",
    op_kind: GpuOpKind::Custom("string-transform"),
    supported_precisions: &[],
    broadcast: BroadcastSemantics::None,
    provider_hooks: &[],
    constant_strategy: ConstantStrategy::InlineLiteral,
    residency: ResidencyPolicy::GatherImmediately,
    nan_mode: ReductionNaN::Include,
    two_pass_threshold: None,
    workgroup_size: None,
    accepts_nan_mode: false,
    notes: "Executes on the CPU; GPU-resident inputs are gathered to host memory before splitting.",
};
37
/// Fusion-planner metadata registered for the `split` builtin.
///
/// Neither an elementwise nor a reduction kernel is declared; as `notes`
/// states, the builtin is not eligible for fusion planning.
#[runmat_macros::register_fusion_spec(builtin_path = "crate::builtins::strings::transform::split")]
pub const FUSION_SPEC: BuiltinFusionSpec = BuiltinFusionSpec {
    name: "split",
    shape: ShapeRequirements::Any,
    constant_strategy: ConstantStrategy::InlineLiteral,
    elementwise: None,
    reduction: None,
    emits_nan: false,
    notes: "String transformation builtin; not eligible for fusion planning and always gathers GPU inputs.",
};
48
/// Builtin name attached to runtime errors raised on behalf of `split`.
const BUILTIN_NAME: &str = "split";
/// Builtin name attached to runtime errors raised on behalf of `strsplit`.
const STRSPLIT_BUILTIN_NAME: &str = "strsplit";
51
/// Output list shared by every `split` signature: the single result `newStr`.
const SPLIT_OUTPUT: [BuiltinParamDescriptor; 1] = [BuiltinParamDescriptor {
    name: "newStr",
    ty: BuiltinParamType::Any,
    arity: BuiltinParamArity::Required,
    default: None,
    description: "String array containing split tokens.",
}];
59
/// Inputs for `split(str)`: only the text to split.
const SPLIT_INPUTS_BASE: [BuiltinParamDescriptor; 1] = [BuiltinParamDescriptor {
    name: "str",
    ty: BuiltinParamType::Any,
    arity: BuiltinParamArity::Required,
    default: None,
    description: "Input text scalar/array/cell to split.",
}];
67
/// Inputs for `split(str, delimiter)`: the text plus an explicit delimiter.
const SPLIT_INPUTS_DELIMITER: [BuiltinParamDescriptor; 2] = [
    BuiltinParamDescriptor {
        name: "str",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Input text scalar/array/cell to split.",
    },
    BuiltinParamDescriptor {
        name: "delimiter",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Delimiter scalar/array/cell.",
    },
];
84
/// Inputs for `split(str, delimiter, Name, Value, ...)`: text, delimiter, and
/// one or more name/value option pairs (the trailing `Value` entry is variadic).
const SPLIT_INPUTS_DELIMITER_NAMEVALUE: [BuiltinParamDescriptor; 4] = [
    BuiltinParamDescriptor {
        name: "str",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Input text scalar/array/cell to split.",
    },
    BuiltinParamDescriptor {
        name: "delimiter",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Delimiter scalar/array/cell.",
    },
    BuiltinParamDescriptor {
        name: "Name",
        ty: BuiltinParamType::StringScalar,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Option name (`CollapseDelimiters` or `IncludeDelimiters`).",
    },
    BuiltinParamDescriptor {
        name: "Value",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Variadic,
        default: None,
        description: "Option values and additional Name/Value pairs.",
    },
];
115
/// Inputs for `split(str, Name, Value, ...)`: text followed directly by
/// name/value option pairs, with no delimiter argument.
const SPLIT_INPUTS_NAMEVALUE: [BuiltinParamDescriptor; 3] = [
    BuiltinParamDescriptor {
        name: "str",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Input text scalar/array/cell to split.",
    },
    BuiltinParamDescriptor {
        name: "Name",
        ty: BuiltinParamType::StringScalar,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Option name (`CollapseDelimiters` or `IncludeDelimiters`).",
    },
    BuiltinParamDescriptor {
        name: "Value",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Variadic,
        default: None,
        description: "Option values and additional Name/Value pairs.",
    },
];
139
/// The four call forms advertised for `split`; all share `SPLIT_OUTPUT`.
const SPLIT_SIGNATURES: [BuiltinSignatureDescriptor; 4] = [
    BuiltinSignatureDescriptor {
        label: "newStr = split(str)",
        inputs: &SPLIT_INPUTS_BASE,
        outputs: &SPLIT_OUTPUT,
    },
    BuiltinSignatureDescriptor {
        label: "newStr = split(str, delimiter)",
        inputs: &SPLIT_INPUTS_DELIMITER,
        outputs: &SPLIT_OUTPUT,
    },
    BuiltinSignatureDescriptor {
        label: "newStr = split(str, delimiter, Name, Value, ...)",
        inputs: &SPLIT_INPUTS_DELIMITER_NAMEVALUE,
        outputs: &SPLIT_OUTPUT,
    },
    BuiltinSignatureDescriptor {
        label: "newStr = split(str, Name, Value, ...)",
        inputs: &SPLIT_INPUTS_NAMEVALUE,
        outputs: &SPLIT_OUTPUT,
    },
];
162
/// `split` error: the first argument is not a supported text container.
const SPLIT_ERROR_INVALID_INPUT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.INVALID_INPUT",
    identifier: Some("RunMat:split:InvalidInput"),
    when: "First argument is not a string scalar/array, char array, or cell array of text scalars.",
    message:
        "split: first argument must be a string scalar, string array, character array, or cell array of character vectors",
};
170
/// `split` error: the delimiter argument has an unsupported type.
const SPLIT_ERROR_DELIMITER_TYPE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.DELIMITER_TYPE",
    identifier: Some("RunMat:split:DelimiterType"),
    when: "Delimiter input is not a supported text scalar/array/cell.",
    message:
        "split: delimiter input must be a string scalar, string array, character array, or cell array of character vectors",
};
178
/// `split` error: an option name was supplied without a matching value.
const SPLIT_ERROR_NAME_VALUE_PAIR: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.NAME_VALUE_PAIR",
    identifier: Some("RunMat:split:NameValuePair"),
    when: "Name-value options are not supplied in complete pairs.",
    message: "split: name-value arguments must be supplied in pairs",
};
185
/// `split` error: an option name is not one of the supported names.
const SPLIT_ERROR_UNKNOWN_NAME: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.UNKNOWN_NAME",
    identifier: Some("RunMat:split:UnknownName"),
    when: "An option name is not recognized.",
    message:
        "split: unrecognized name-value argument; supported names are 'CollapseDelimiters' and 'IncludeDelimiters'",
};
193
/// `split` error: the delimiter list is empty or contains an empty entry.
const SPLIT_ERROR_EMPTY_DELIMITER: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.EMPTY_DELIMITER",
    identifier: Some("RunMat:split:EmptyDelimiter"),
    when: "Delimiter list is empty or contains empty delimiter entries.",
    message: "split: delimiters must contain at least one character",
};
200
/// `split` error: a cell array element is not a text scalar.
const SPLIT_ERROR_CELL_ELEMENT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.CELL_ELEMENT",
    identifier: Some("RunMat:split:CellElement"),
    when: "Cell arrays contain non-text elements or non-row char arrays.",
    message: "split: cell array elements must be string scalars or character vectors",
};
207
/// `split` error: an option value cannot be interpreted as a logical flag.
const SPLIT_ERROR_OPTION_VALUE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.OPTION_VALUE",
    identifier: Some("RunMat:split:OptionValue"),
    when: "Option values are not logical true/false values.",
    message: "split: option values must be logical true or false",
};
214
/// `split` error: building the output string array failed.
const SPLIT_ERROR_INTERNAL: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.SPLIT.INTERNAL",
    identifier: Some("RunMat:split:InternalError"),
    when: "Internal output container construction failed.",
    message: "split: internal error",
};
221
/// Every error descriptor advertised by the `split` builtin descriptor.
const SPLIT_ERRORS: [BuiltinErrorDescriptor; 8] = [
    SPLIT_ERROR_INVALID_INPUT,
    SPLIT_ERROR_DELIMITER_TYPE,
    SPLIT_ERROR_NAME_VALUE_PAIR,
    SPLIT_ERROR_UNKNOWN_NAME,
    SPLIT_ERROR_EMPTY_DELIMITER,
    SPLIT_ERROR_CELL_ELEMENT,
    SPLIT_ERROR_OPTION_VALUE,
    SPLIT_ERROR_INTERNAL,
];
232
/// Public descriptor for `split`: its signatures, a fixed output mode, and
/// the errors it can report. Referenced by the `runtime_builtin` attribute on
/// `split_builtin`.
pub const SPLIT_DESCRIPTOR: BuiltinDescriptor = BuiltinDescriptor {
    signatures: &SPLIT_SIGNATURES,
    output_mode: BuiltinOutputMode::Fixed,
    completion_policy: BuiltinCompletionPolicy::Public,
    errors: &SPLIT_ERRORS,
};
239
/// Output list shared by every `strsplit` signature: the required `parts`
/// and the optional `matches`.
const STRSPLIT_OUTPUT: [BuiltinParamDescriptor; 2] = [
    BuiltinParamDescriptor {
        name: "parts",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Split tokens.",
    },
    BuiltinParamDescriptor {
        name: "matches",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Optional,
        default: None,
        description: "Matched delimiters when requested as second output.",
    },
];
256
/// Inputs for `strsplit(str)`: only the scalar text to split.
const STRSPLIT_INPUTS_BASE: [BuiltinParamDescriptor; 1] = [BuiltinParamDescriptor {
    name: "str",
    ty: BuiltinParamType::Any,
    arity: BuiltinParamArity::Required,
    default: None,
    description: "String scalar or character vector input.",
}];
264
/// Inputs for `strsplit(str, delimiter)`: the text plus an explicit delimiter.
const STRSPLIT_INPUTS_DELIMITER: [BuiltinParamDescriptor; 2] = [
    BuiltinParamDescriptor {
        name: "str",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "String scalar or character vector input.",
    },
    BuiltinParamDescriptor {
        name: "delimiter",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Delimiter scalar/array/cell.",
    },
];
281
/// Inputs for `strsplit(str, delimiter, Name, Value, ...)`: text, delimiter,
/// and one or more name/value option pairs (the trailing `Value` entry is
/// variadic).
const STRSPLIT_INPUTS_DELIMITER_NAMEVALUE: [BuiltinParamDescriptor; 4] = [
    BuiltinParamDescriptor {
        name: "str",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "String scalar or character vector input.",
    },
    BuiltinParamDescriptor {
        name: "delimiter",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Delimiter scalar/array/cell.",
    },
    BuiltinParamDescriptor {
        name: "Name",
        ty: BuiltinParamType::StringScalar,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Option name (`CollapseDelimiters` or `DelimiterType`).",
    },
    BuiltinParamDescriptor {
        name: "Value",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Variadic,
        default: None,
        description: "Option values and additional Name/Value pairs.",
    },
];
312
/// Inputs for `strsplit(str, Name, Value, ...)`: text followed directly by
/// name/value option pairs, with no delimiter argument.
const STRSPLIT_INPUTS_NAMEVALUE: [BuiltinParamDescriptor; 3] = [
    BuiltinParamDescriptor {
        name: "str",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "String scalar or character vector input.",
    },
    BuiltinParamDescriptor {
        name: "Name",
        ty: BuiltinParamType::StringScalar,
        arity: BuiltinParamArity::Required,
        default: None,
        description: "Option name (`CollapseDelimiters` or `DelimiterType`).",
    },
    BuiltinParamDescriptor {
        name: "Value",
        ty: BuiltinParamType::Any,
        arity: BuiltinParamArity::Variadic,
        default: None,
        description: "Option values and additional Name/Value pairs.",
    },
];
336
/// The four call forms advertised for `strsplit`; all share `STRSPLIT_OUTPUT`.
const STRSPLIT_SIGNATURES: [BuiltinSignatureDescriptor; 4] = [
    BuiltinSignatureDescriptor {
        label: "[parts, matches] = strsplit(str)",
        inputs: &STRSPLIT_INPUTS_BASE,
        outputs: &STRSPLIT_OUTPUT,
    },
    BuiltinSignatureDescriptor {
        label: "[parts, matches] = strsplit(str, delimiter)",
        inputs: &STRSPLIT_INPUTS_DELIMITER,
        outputs: &STRSPLIT_OUTPUT,
    },
    BuiltinSignatureDescriptor {
        label: "[parts, matches] = strsplit(str, delimiter, Name, Value, ...)",
        inputs: &STRSPLIT_INPUTS_DELIMITER_NAMEVALUE,
        outputs: &STRSPLIT_OUTPUT,
    },
    BuiltinSignatureDescriptor {
        label: "[parts, matches] = strsplit(str, Name, Value, ...)",
        inputs: &STRSPLIT_INPUTS_NAMEVALUE,
        outputs: &STRSPLIT_OUTPUT,
    },
];
359
/// `strsplit` error: the first argument is not scalar text.
const STRSPLIT_ERROR_INVALID_INPUT: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.INVALID_INPUT",
    identifier: Some("RunMat:strsplit:InvalidInput"),
    when: "First argument is not a string scalar or character vector.",
    message: "strsplit: first argument must be a string scalar or character vector",
};
366
/// `strsplit` error: the delimiter argument has an unsupported type.
const STRSPLIT_ERROR_DELIMITER_TYPE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.DELIMITER_TYPE",
    identifier: Some("RunMat:strsplit:DelimiterType"),
    when: "Delimiter input is not a supported text scalar/array/cell.",
    message:
        "strsplit: delimiter must be a character vector, string scalar, string array, or cell array of character vectors",
};
374
/// `strsplit` error: an option name was supplied without a matching value.
const STRSPLIT_ERROR_NAME_VALUE_PAIR: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.NAME_VALUE_PAIR",
    identifier: Some("RunMat:strsplit:NameValuePair"),
    when: "Name-value options are not supplied in complete pairs.",
    message: "strsplit: name-value arguments must be supplied in pairs",
};
381
/// `strsplit` error: an option name is not one of the supported names.
const STRSPLIT_ERROR_UNKNOWN_NAME: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.UNKNOWN_NAME",
    identifier: Some("RunMat:strsplit:UnknownName"),
    when: "An option name is not recognized.",
    message:
        "strsplit: unrecognized name-value argument; supported names are 'CollapseDelimiters' and 'DelimiterType'",
};
389
/// `strsplit` error: the delimiter list is empty or contains an empty entry.
const STRSPLIT_ERROR_EMPTY_DELIMITER: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.EMPTY_DELIMITER",
    identifier: Some("RunMat:strsplit:EmptyDelimiter"),
    when: "Delimiter list is empty or contains empty delimiter entries.",
    message: "strsplit: delimiters must contain at least one character",
};
396
/// `strsplit` error: the `DelimiterType` option value is not a known mode.
const STRSPLIT_ERROR_DELIMITER_MODE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.DELIMITER_MODE",
    identifier: Some("RunMat:strsplit:DelimiterMode"),
    when: "DelimiterType option is not `Simple` or `RegularExpression`.",
    message: "strsplit: value for 'DelimiterType' must be 'Simple' or 'RegularExpression'",
};
403
/// `strsplit` error: an option value cannot be interpreted as a logical flag.
const STRSPLIT_ERROR_OPTION_VALUE: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.OPTION_VALUE",
    identifier: Some("RunMat:strsplit:OptionValue"),
    when: "Option values are not logical true/false values.",
    message: "strsplit: option values must be logical true or false",
};
410
/// `strsplit` error: the delimiter regular expression failed to compile.
const STRSPLIT_ERROR_REGEX_INVALID: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.REGEX_INVALID",
    identifier: Some("RunMat:strsplit:RegexInvalid"),
    when: "Regular expression delimiter pattern fails to compile.",
    message: "strsplit: invalid delimiter regular expression",
};
417
/// `strsplit` error: building the output container failed.
const STRSPLIT_ERROR_INTERNAL: BuiltinErrorDescriptor = BuiltinErrorDescriptor {
    code: "RM.STRSPLIT.INTERNAL",
    identifier: Some("RunMat:strsplit:InternalError"),
    when: "Internal output container construction failed.",
    message: "strsplit: internal error",
};
424
/// Every error descriptor advertised by the `strsplit` builtin descriptor.
const STRSPLIT_ERRORS: [BuiltinErrorDescriptor; 9] = [
    STRSPLIT_ERROR_INVALID_INPUT,
    STRSPLIT_ERROR_DELIMITER_TYPE,
    STRSPLIT_ERROR_NAME_VALUE_PAIR,
    STRSPLIT_ERROR_UNKNOWN_NAME,
    STRSPLIT_ERROR_EMPTY_DELIMITER,
    STRSPLIT_ERROR_DELIMITER_MODE,
    STRSPLIT_ERROR_OPTION_VALUE,
    STRSPLIT_ERROR_REGEX_INVALID,
    STRSPLIT_ERROR_INTERNAL,
];
436
/// Public descriptor for `strsplit`: its signatures, an output mode that
/// depends on the requested output count, and the errors it can report.
/// Referenced by the `runtime_builtin` attribute on `strsplit_builtin`.
pub const STRSPLIT_DESCRIPTOR: BuiltinDescriptor = BuiltinDescriptor {
    signatures: &STRSPLIT_SIGNATURES,
    output_mode: BuiltinOutputMode::ByRequestedOutputCount,
    completion_policy: BuiltinCompletionPolicy::Public,
    errors: &STRSPLIT_ERRORS,
};
443
444fn map_flow(err: RuntimeError) -> RuntimeError {
445 map_control_flow_with_builtin(err, BUILTIN_NAME)
446}
447
448fn split_error_with_message(
449 message: impl Into<String>,
450 error: &'static BuiltinErrorDescriptor,
451) -> RuntimeError {
452 let mut builder = build_runtime_error(message).with_builtin(BUILTIN_NAME);
453 if let Some(identifier) = error.identifier {
454 builder = builder.with_identifier(identifier);
455 }
456 builder.build()
457}
458
459fn split_error(error: &'static BuiltinErrorDescriptor) -> RuntimeError {
460 split_error_with_message(error.message, error)
461}
462
463fn strsplit_error_with_message(
464 message: impl Into<String>,
465 error: &'static BuiltinErrorDescriptor,
466) -> RuntimeError {
467 let mut builder = build_runtime_error(message).with_builtin(STRSPLIT_BUILTIN_NAME);
468 if let Some(identifier) = error.identifier {
469 builder = builder.with_identifier(identifier);
470 }
471 builder.build()
472}
473
474fn strsplit_error(error: &'static BuiltinErrorDescriptor) -> RuntimeError {
475 strsplit_error_with_message(error.message, error)
476}
477
478#[runtime_builtin(
479 name = "split",
480 category = "strings/transform",
481 summary = "Split text inputs into substrings using delimiter rules.",
482 keywords = "split,strsplit,delimiter,CollapseDelimiters,IncludeDelimiters",
483 accel = "sink",
484 type_resolver(string_array_type),
485 descriptor(crate::builtins::strings::transform::split::SPLIT_DESCRIPTOR),
486 builtin_path = "crate::builtins::strings::transform::split"
487)]
488async fn split_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
489 let text = gather_if_needed_async(&text).await.map_err(map_flow)?;
490 let mut args: Vec<Value> = Vec::with_capacity(rest.len());
491 for arg in rest {
492 args.push(gather_if_needed_async(&arg).await.map_err(map_flow)?);
493 }
494
495 let options = SplitOptions::parse(&args)?;
496 let matrix = TextMatrix::from_value(text)?;
497 matrix.into_split_result(&options)
498}
499
500#[runtime_builtin(
501 name = "strsplit",
502 category = "strings/transform",
503 summary = "Split scalar text into substrings using simple or regex delimiters.",
504 keywords = "strsplit,split,delimiter,CollapseDelimiters,DelimiterType,matches",
505 accel = "sink",
506 type_resolver(unknown_type),
507 descriptor(crate::builtins::strings::transform::split::STRSPLIT_DESCRIPTOR),
508 builtin_path = "crate::builtins::strings::transform::split"
509)]
510async fn strsplit_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
511 let text = gather_if_needed_async(&text)
512 .await
513 .map_err(|err| map_control_flow_with_builtin(err, STRSPLIT_BUILTIN_NAME))?;
514 let mut args = Vec::with_capacity(rest.len());
515 for arg in rest {
516 args.push(
517 gather_if_needed_async(&arg)
518 .await
519 .map_err(|err| map_control_flow_with_builtin(err, STRSPLIT_BUILTIN_NAME))?,
520 );
521 }
522
523 let (input_kind, subject) = extract_strsplit_subject(text)?;
524 let options = StrsplitOptions::parse(&args)?;
525 let (parts, matches) = strsplit_text(&subject, &options)?;
526 let parts_value = make_strsplit_output(parts, input_kind)?;
527
528 if let Some(out_count) = crate::output_count::current_output_count() {
529 if out_count == 0 {
530 return Ok(Value::OutputList(Vec::new()));
531 }
532 let matches_value = make_strsplit_output(matches, input_kind)?;
533 return Ok(crate::output_count::output_list_with_padding(
534 out_count,
535 vec![parts_value, matches_value],
536 ));
537 }
538
539 Ok(parts_value)
540}
541
/// How `split` locates delimiters in the input text.
#[derive(Clone)]
enum DelimiterSpec {
    /// Split on whitespace; the initial value in `SplitOptions::parse` when no
    /// delimiter argument is supplied.
    Whitespace,
    /// Split on explicit literal patterns. `SplitOptions::parse` stores them
    /// de-duplicated and ordered longest first.
    Patterns(Vec<String>),
}
547
548#[derive(Clone)]
549struct SplitOptions {
550 delimiters: DelimiterSpec,
551 collapse_delimiters: bool,
552 include_delimiters: bool,
553}
554
555impl SplitOptions {
556 fn parse(args: &[Value]) -> BuiltinResult<Self> {
557 let mut index = 0usize;
558 let mut delimiters = DelimiterSpec::Whitespace;
559
560 if index < args.len() && !is_name_key(&args[index]) {
561 let list = extract_delimiters(&args[index])?;
562 if list.is_empty() {
563 return Err(split_error(&SPLIT_ERROR_EMPTY_DELIMITER));
564 }
565 let mut seen = HashSet::new();
566 let mut patterns: Vec<String> = Vec::new();
567 for pattern in list {
568 if pattern.is_empty() {
569 return Err(split_error(&SPLIT_ERROR_EMPTY_DELIMITER));
570 }
571 if seen.insert(pattern.clone()) {
572 patterns.push(pattern);
573 }
574 }
575 patterns.sort_by_key(|pat| std::cmp::Reverse(pat.len()));
576 delimiters = DelimiterSpec::Patterns(patterns);
577 index += 1;
578 }
579
580 let mut collapse = match delimiters {
581 DelimiterSpec::Whitespace => true,
582 DelimiterSpec::Patterns(_) => false,
583 };
584 let mut include = false;
585
586 while index < args.len() {
587 let name = match name_key(&args[index]) {
588 Some(NameKey::CollapseDelimiters) => NameKey::CollapseDelimiters,
589 Some(NameKey::IncludeDelimiters) => NameKey::IncludeDelimiters,
590 None => return Err(split_error(&SPLIT_ERROR_UNKNOWN_NAME)),
591 };
592 index += 1;
593 if index >= args.len() {
594 return Err(split_error(&SPLIT_ERROR_NAME_VALUE_PAIR));
595 }
596 let value = &args[index];
597 index += 1;
598
599 match name {
600 NameKey::CollapseDelimiters => {
601 collapse = parse_bool(value, "CollapseDelimiters")?;
602 }
603 NameKey::IncludeDelimiters => {
604 include = parse_bool(value, "IncludeDelimiters")?;
605 }
606 }
607 }
608
609 Ok(Self {
610 delimiters,
611 collapse_delimiters: collapse,
612 include_delimiters: include,
613 })
614 }
615}
616
617struct TextMatrix {
618 data: Vec<String>,
619 rows: usize,
620 cols: usize,
621}
622
623impl TextMatrix {
624 fn from_value(value: Value) -> BuiltinResult<Self> {
625 match value {
626 Value::String(text) => Ok(Self {
627 data: vec![text],
628 rows: 1,
629 cols: 1,
630 }),
631 Value::StringArray(array) => Ok(Self {
632 data: array.data,
633 rows: array.rows,
634 cols: array.cols,
635 }),
636 Value::CharArray(array) => Self::from_char_array(array),
637 Value::Cell(cell) => Self::from_cell_array(cell),
638 _ => Err(split_error(&SPLIT_ERROR_INVALID_INPUT)),
639 }
640 }
641
642 fn from_char_array(array: CharArray) -> BuiltinResult<Self> {
643 let CharArray { data, rows, cols } = array;
644 if rows == 0 {
645 return Ok(Self {
646 data: Vec::new(),
647 rows: 0,
648 cols: 1,
649 });
650 }
651 let mut strings = Vec::with_capacity(rows);
652 for row in 0..rows {
653 strings.push(char_row_to_string_slice(&data, cols, row));
654 }
655 Ok(Self {
656 data: strings,
657 rows,
658 cols: 1,
659 })
660 }
661
662 fn from_cell_array(cell: CellArray) -> BuiltinResult<Self> {
663 let CellArray {
664 data, rows, cols, ..
665 } = cell;
666 let mut strings = Vec::with_capacity(data.len());
667 for col in 0..cols {
668 for row in 0..rows {
669 let idx = row * cols + col;
670 let value_ref: &Value = &data[idx];
671 strings.push(
672 cell_element_to_string(value_ref)
673 .ok_or_else(|| split_error(&SPLIT_ERROR_CELL_ELEMENT))?,
674 );
675 }
676 }
677 Ok(Self {
678 data: strings,
679 rows,
680 cols,
681 })
682 }
683
684 fn into_split_result(self, options: &SplitOptions) -> BuiltinResult<Value> {
685 let TextMatrix { data, rows, cols } = self;
686
687 if data.is_empty() {
688 let block_cols = if cols == 0 { 0 } else { 1 };
689 let shape = if cols == 0 {
690 vec![rows, 0]
691 } else {
692 vec![rows, cols * block_cols]
693 };
694 let array = StringArray::new(Vec::new(), shape).map_err(|e| {
695 split_error_with_message(format!("{BUILTIN_NAME}: {e}"), &SPLIT_ERROR_INTERNAL)
696 })?;
697 return Ok(Value::StringArray(array));
698 }
699
700 let mut per_element: Vec<Vec<String>> = Vec::with_capacity(data.len());
701 let mut max_tokens = 0usize;
702 for text in &data {
703 let tokens = split_text(text, options);
704 max_tokens = max_tokens.max(tokens.len());
705 per_element.push(tokens);
706 }
707 if max_tokens == 0 {
708 max_tokens = 1;
709 }
710 let block_cols = max_tokens;
711 let result_cols = block_cols * cols.max(1);
712 let total = rows * result_cols;
713 let missing = "<missing>".to_string();
714 let mut output = vec![missing.clone(); total];
715
716 for col in 0..cols.max(1) {
717 for row in 0..rows {
718 let element_index = if cols == 0 { row } else { row + col * rows };
719 if element_index >= per_element.len() {
720 continue;
721 }
722 let tokens = &per_element[element_index];
723 for t in 0..block_cols {
724 let out_col = if cols == 0 { t } else { col * block_cols + t };
725 let out_index = row + out_col * rows;
726 if out_index >= output.len() {
727 continue;
728 }
729 if t < tokens.len() {
730 output[out_index] = tokens[t].clone();
731 } else {
732 output[out_index] = missing.clone();
733 }
734 }
735 }
736 }
737
738 let shape = vec![rows, result_cols];
739 let array = StringArray::new(output, shape).map_err(|e| {
740 split_error_with_message(format!("{BUILTIN_NAME}: {e}"), &SPLIT_ERROR_INTERNAL)
741 })?;
742 Ok(Value::StringArray(array))
743 }
744}
745
746fn split_text(text: &str, options: &SplitOptions) -> Vec<String> {
747 if is_missing_string(text) {
748 return vec![text.to_string()];
749 }
750 match &options.delimiters {
751 DelimiterSpec::Whitespace => split_whitespace(text, options),
752 DelimiterSpec::Patterns(patterns) => split_by_patterns(text, patterns, options),
753 }
754}
755
756fn split_whitespace(text: &str, options: &SplitOptions) -> Vec<String> {
757 if text.is_empty() {
758 return vec![String::new()];
759 }
760
761 let mut parts: Vec<String> = Vec::new();
762 let mut idx = 0usize;
763 let mut last = 0usize;
764 let len = text.len();
765
766 while idx < len {
767 let ch = text[idx..].chars().next().unwrap();
768 let width = ch.len_utf8();
769 if !ch.is_whitespace() {
770 idx += width;
771 continue;
772 }
773
774 let token = &text[last..idx];
775 if !token.is_empty() || !options.collapse_delimiters {
776 parts.push(token.to_string());
777 }
778
779 let run_end = advance_whitespace(text, idx);
780 if options.include_delimiters {
781 if options.collapse_delimiters {
782 parts.push(text[idx..run_end].to_string());
783 } else {
784 parts.push(text[idx..idx + width].to_string());
785 }
786 }
787
788 if options.collapse_delimiters {
789 idx = run_end;
790 last = run_end;
791 } else {
792 idx += width;
793 last = idx;
794 }
795 }
796
797 let tail = &text[last..];
798 if !tail.is_empty() || !options.collapse_delimiters {
799 parts.push(tail.to_string());
800 }
801 if parts.is_empty() {
802 parts.push(String::new());
803 }
804 parts
805}
806
807fn split_by_patterns(text: &str, patterns: &[String], options: &SplitOptions) -> Vec<String> {
808 if patterns.is_empty() {
809 return vec![text.to_string()];
810 }
811
812 let mut parts: Vec<String> = Vec::new();
813 let mut idx = 0usize;
814 let mut last = 0usize;
815 while idx < text.len() {
816 if let Some(pattern) = patterns
817 .iter()
818 .find(|candidate| text[idx..].starts_with(candidate.as_str()))
819 {
820 let token = &text[last..idx];
821 if !token.is_empty() || !options.collapse_delimiters {
822 parts.push(token.to_string());
823 }
824
825 let pat_len = pattern.len();
826 if options.collapse_delimiters {
827 let mut run_end = idx + pat_len;
828 while run_end < text.len() {
829 if let Some(next) = patterns
830 .iter()
831 .find(|candidate| text[run_end..].starts_with(candidate.as_str()))
832 {
833 let len = next.len();
834 if len == 0 {
835 break;
836 }
837 run_end += len;
838 } else {
839 break;
840 }
841 }
842 if options.include_delimiters {
843 parts.push(text[idx..run_end].to_string());
844 }
845 idx = run_end;
846 last = run_end;
847 } else {
848 if options.include_delimiters {
849 parts.push(text[idx..idx + pat_len].to_string());
850 }
851 idx += pat_len;
852 last = idx;
853 }
854
855 continue;
856 }
857 let ch = text[idx..].chars().next().unwrap();
858 idx += ch.len_utf8();
859 }
860 let tail = &text[last..];
861 if !tail.is_empty() || !options.collapse_delimiters {
862 parts.push(tail.to_string());
863 }
864 if parts.is_empty() {
865 parts.push(String::new());
866 }
867 parts
868}
869
/// Returns the byte offset of the first non-whitespace character at or after
/// `start`, or `text.len()` if only whitespace remains.
///
/// A `start` at or beyond the end of `text` is returned unchanged.
fn advance_whitespace(text: &str, start: usize) -> usize {
    if start >= text.len() {
        return start;
    }
    let rest = &text[start..];
    // `trim_start` strips the same character class as `char::is_whitespace`.
    let run_len = rest.len() - rest.trim_start().len();
    start + run_len
}
880
881fn extract_delimiters(value: &Value) -> BuiltinResult<Vec<String>> {
882 match value {
883 Value::String(text) => Ok(vec![text.clone()]),
884 Value::StringArray(array) => Ok(array.data.clone()),
885 Value::CharArray(array) => {
886 if array.rows == 0 {
887 return Ok(Vec::new());
888 }
889 let mut entries = Vec::with_capacity(array.rows);
890 for row in 0..array.rows {
891 entries.push(char_row_to_string_slice(&array.data, array.cols, row));
892 }
893 Ok(entries)
894 }
895 Value::Cell(cell) => {
896 let mut entries = Vec::with_capacity(cell.data.len());
897 for element in &cell.data {
898 entries.push(
899 cell_element_to_string(element)
900 .ok_or_else(|| split_error(&SPLIT_ERROR_CELL_ELEMENT))?,
901 );
902 }
903 Ok(entries)
904 }
905 _ => Err(split_error(&SPLIT_ERROR_DELIMITER_TYPE)),
906 }
907}
908
909fn cell_element_to_string(value: &Value) -> Option<String> {
910 match value {
911 Value::String(text) => Some(text.clone()),
912 Value::StringArray(array) if array.data.len() == 1 => Some(array.data[0].clone()),
913 Value::CharArray(array) if array.rows <= 1 => {
914 if array.rows == 0 {
915 Some(String::new())
916 } else {
917 Some(char_row_to_string_slice(&array.data, array.cols, 0))
918 }
919 }
920 _ => None,
921 }
922}
923
924fn value_to_scalar_string(value: &Value) -> Option<String> {
925 match value {
926 Value::String(text) => Some(text.clone()),
927 Value::StringArray(array) if array.data.len() == 1 => Some(array.data[0].clone()),
928 Value::CharArray(array) if array.rows <= 1 => {
929 if array.rows == 0 {
930 Some(String::new())
931 } else {
932 Some(char_row_to_string_slice(&array.data, array.cols, 0))
933 }
934 }
935 Value::Cell(cell) if cell.data.len() == 1 => cell_element_to_string(&cell.data[0]),
936 _ => None,
937 }
938}
939
940fn parse_bool(value: &Value, name: &str) -> BuiltinResult<bool> {
941 parse_bool_for_builtin(value, name, BUILTIN_NAME, &SPLIT_ERROR_OPTION_VALUE)
942}
943
944fn parse_bool_for_builtin(
945 value: &Value,
946 name: &str,
947 builtin_name: &'static str,
948 error: &'static BuiltinErrorDescriptor,
949) -> BuiltinResult<bool> {
950 match value {
951 Value::Bool(b) => Ok(*b),
952 Value::Int(i) => Ok(i.to_i64() != 0),
953 Value::Num(n) => Ok(*n != 0.0),
954 Value::LogicalArray(array) => {
955 if array.data.len() == 1 {
956 Ok(array.data[0] != 0)
957 } else {
958 Err(builtin_error_with_descriptor(
959 builtin_name,
960 format!(
961 "{builtin_name}: value for '{}' must be logical true or false",
962 name
963 ),
964 error,
965 ))
966 }
967 }
968 Value::Tensor(tensor) => {
969 if tensor.data.len() == 1 {
970 Ok(tensor.data[0] != 0.0)
971 } else {
972 Err(builtin_error_with_descriptor(
973 builtin_name,
974 format!(
975 "{builtin_name}: value for '{}' must be logical true or false",
976 name
977 ),
978 error,
979 ))
980 }
981 }
982 _ => {
983 if let Some(text) = value_to_scalar_string(value) {
984 let lowered = text.trim().to_ascii_lowercase();
985 match lowered.as_str() {
986 "true" | "on" | "yes" => Ok(true),
987 "false" | "off" | "no" => Ok(false),
988 _ => Err(builtin_error_with_descriptor(
989 builtin_name,
990 format!(
991 "{builtin_name}: value for '{}' must be logical true or false",
992 name
993 ),
994 error,
995 )),
996 }
997 } else {
998 Err(builtin_error_with_descriptor(
999 builtin_name,
1000 format!(
1001 "{builtin_name}: value for '{}' must be logical true or false",
1002 name
1003 ),
1004 error,
1005 ))
1006 }
1007 }
1008 }
1009}
1010
1011fn builtin_error_with_descriptor(
1012 builtin_name: &'static str,
1013 message: impl Into<String>,
1014 error: &'static BuiltinErrorDescriptor,
1015) -> RuntimeError {
1016 let mut builder = build_runtime_error(message).with_builtin(builtin_name);
1017 if let Some(identifier) = error.identifier {
1018 builder = builder.with_identifier(identifier);
1019 }
1020 builder.build()
1021}
1022
1023fn extract_strsplit_subject(value: Value) -> BuiltinResult<(StrsplitInputKind, String)> {
1024 match value {
1025 Value::String(text) => Ok((StrsplitInputKind::String, text)),
1026 Value::StringArray(array) if array.data.len() == 1 => {
1027 Ok((StrsplitInputKind::String, array.data[0].clone()))
1028 }
1029 Value::CharArray(array) if array.rows <= 1 => {
1030 if array.rows == 0 {
1031 Ok((StrsplitInputKind::Char, String::new()))
1032 } else {
1033 Ok((
1034 StrsplitInputKind::Char,
1035 char_row_to_string_slice(&array.data, array.cols, 0),
1036 ))
1037 }
1038 }
1039 _ => Err(strsplit_error(&STRSPLIT_ERROR_INVALID_INPUT)),
1040 }
1041}
1042
1043fn strsplit_text(
1044 text: &str,
1045 options: &StrsplitOptions,
1046) -> BuiltinResult<(Vec<String>, Vec<String>)> {
1047 let regex = compile_strsplit_regex(options)?;
1048 let mut parts = Vec::new();
1049 let mut matches = Vec::new();
1050 let mut last = 0usize;
1051
1052 for found in regex.find_iter(text) {
1053 parts.push(text[last..found.start()].to_string());
1054 matches.push(found.as_str().to_string());
1055 last = found.end();
1056 }
1057
1058 parts.push(text[last..].to_string());
1059 Ok((parts, matches))
1060}
1061
1062fn compile_strsplit_regex(options: &StrsplitOptions) -> BuiltinResult<regex::Regex> {
1063 let pattern = match (&options.delimiters, options.delimiter_type) {
1064 (None, _) => {
1065 if options.collapse_delimiters {
1066 "[\\x20\\x0C\\n\\r\\t\\x0B]+".to_string()
1067 } else {
1068 "[\\x20\\x0C\\n\\r\\t\\x0B]".to_string()
1069 }
1070 }
1071 (Some(delimiters), StrsplitDelimiterType::Simple) => {
1072 let alternation = delimiters
1073 .iter()
1074 .map(|pattern| regex::escape(pattern))
1075 .collect::<Vec<_>>()
1076 .join("|");
1077 if options.collapse_delimiters {
1078 format!("(?:{alternation})+")
1079 } else {
1080 format!("(?:{alternation})")
1081 }
1082 }
1083 (Some(delimiters), StrsplitDelimiterType::RegularExpression) => {
1084 let alternation = delimiters.join("|");
1085 if options.collapse_delimiters {
1086 format!("(?:{alternation})+")
1087 } else {
1088 format!("(?:{alternation})")
1089 }
1090 }
1091 };
1092
1093 RegexBuilder::new(&pattern).build().map_err(|err| {
1094 strsplit_error_with_message(format!("strsplit: {err}"), &STRSPLIT_ERROR_REGEX_INVALID)
1095 })
1096}
1097
1098fn make_strsplit_output(tokens: Vec<String>, kind: StrsplitInputKind) -> BuiltinResult<Value> {
1099 match kind {
1100 StrsplitInputKind::String => {
1101 let len = tokens.len();
1102 let array = StringArray::new(tokens, vec![1, len]).map_err(|err| {
1103 strsplit_error_with_message(format!("strsplit: {err}"), &STRSPLIT_ERROR_INTERNAL)
1104 })?;
1105 Ok(Value::StringArray(array))
1106 }
1107 StrsplitInputKind::Char => {
1108 let values: Vec<Value> = tokens.into_iter().map(Value::String).collect();
1109 let len = values.len();
1110 make_cell(values, 1, len).map_err(|err| {
1111 strsplit_error_with_message(format!("strsplit: {err}"), &STRSPLIT_ERROR_INTERNAL)
1112 })
1113 }
1114 }
1115}
1116
/// Option names recognised by [`name_key`].
#[derive(PartialEq, Eq)]
enum NameKey {
    /// Matched from the text `collapsedelimiters` (case-insensitive).
    CollapseDelimiters,
    /// Matched from the text `includedelimiters` (case-insensitive).
    IncludeDelimiters,
}
1122
/// Records which flavour of text `strsplit` received, so that
/// `make_strsplit_output` can pick the matching output container.
#[derive(Clone, Copy)]
enum StrsplitInputKind {
    /// Input was a `Value::CharArray`; output is a cell row.
    Char,
    /// Input was a `Value::String` or a one-element `Value::StringArray`;
    /// output is a string array.
    String,
}
1128
/// How `compile_strsplit_regex` interprets the delimiter patterns.
#[derive(Clone, Copy)]
enum StrsplitDelimiterType {
    /// Delimiters are literal text; each one is passed through `regex::escape`.
    Simple,
    /// Delimiters are inserted into the pattern verbatim as regex fragments.
    RegularExpression,
}
1134
/// Parsed arguments controlling how `strsplit` locates delimiters.
#[derive(Clone)]
struct StrsplitOptions {
    /// Explicit delimiter patterns; `None` makes `compile_strsplit_regex` fall
    /// back to its built-in whitespace character class.
    delimiters: Option<Vec<String>>,
    /// When true, the compiled pattern gets a trailing `+`, so a run of
    /// adjacent delimiters is matched as a single separator.
    collapse_delimiters: bool,
    /// Whether `delimiters` are escaped literals or raw regex fragments.
    delimiter_type: StrsplitDelimiterType,
}
1141
1142impl StrsplitOptions {
1143 fn parse(args: &[Value]) -> BuiltinResult<Self> {
1144 let mut index = 0usize;
1145 let mut delimiters = None;
1146
1147 if index < args.len() && !is_strsplit_name_key(&args[index]) {
1148 let list = extract_delimiters(&args[index])
1149 .map_err(|_| strsplit_error(&STRSPLIT_ERROR_DELIMITER_TYPE))?;
1150 delimiters = Some(list);
1151 index += 1;
1152 }
1153
1154 let mut collapse_delimiters = true;
1155 let mut delimiter_type = StrsplitDelimiterType::Simple;
1156
1157 while index < args.len() {
1158 let name = match strsplit_name_key(&args[index]) {
1159 Some(name) => name,
1160 None => return Err(strsplit_error(&STRSPLIT_ERROR_UNKNOWN_NAME)),
1161 };
1162 index += 1;
1163 if index >= args.len() {
1164 return Err(strsplit_error(&STRSPLIT_ERROR_NAME_VALUE_PAIR));
1165 }
1166 let value = &args[index];
1167 index += 1;
1168
1169 match name {
1170 StrsplitNameKey::CollapseDelimiters => {
1171 collapse_delimiters = parse_bool_for_builtin(
1172 value,
1173 "CollapseDelimiters",
1174 STRSPLIT_BUILTIN_NAME,
1175 &STRSPLIT_ERROR_OPTION_VALUE,
1176 )?;
1177 }
1178 StrsplitNameKey::DelimiterType => {
1179 let text = value_to_scalar_string(value)
1180 .ok_or_else(|| strsplit_error(&STRSPLIT_ERROR_DELIMITER_MODE))?;
1181 delimiter_type = match text.trim().to_ascii_lowercase().as_str() {
1182 "simple" => StrsplitDelimiterType::Simple,
1183 "regularexpression" => StrsplitDelimiterType::RegularExpression,
1184 _ => return Err(strsplit_error(&STRSPLIT_ERROR_DELIMITER_MODE)),
1185 };
1186 }
1187 }
1188 }
1189
1190 if let Some(patterns) = &delimiters {
1191 if patterns.is_empty() {
1192 return Err(strsplit_error(&STRSPLIT_ERROR_EMPTY_DELIMITER));
1193 }
1194 if matches!(delimiter_type, StrsplitDelimiterType::Simple)
1195 && patterns.iter().any(|pattern| pattern.is_empty())
1196 {
1197 return Err(strsplit_error(&STRSPLIT_ERROR_EMPTY_DELIMITER));
1198 }
1199 }
1200
1201 Ok(Self {
1202 delimiters,
1203 collapse_delimiters,
1204 delimiter_type,
1205 })
1206 }
1207}
1208
/// Option names recognised by [`strsplit_name_key`].
#[derive(PartialEq, Eq)]
enum StrsplitNameKey {
    /// Matched from the text `collapsedelimiters` (case-insensitive).
    CollapseDelimiters,
    /// Matched from the text `delimitertype` (case-insensitive).
    DelimiterType,
}
1214
/// Returns `true` when [`name_key`] recognises `value` as an option name.
fn is_name_key(value: &Value) -> bool {
    name_key(value).is_some()
}
1218
/// Returns `true` when [`strsplit_name_key`] recognises `value` as an option
/// name.
fn is_strsplit_name_key(value: &Value) -> bool {
    strsplit_name_key(value).is_some()
}
1222
1223fn name_key(value: &Value) -> Option<NameKey> {
1224 value_to_scalar_string(value).and_then(|text| {
1225 let lowered = text.trim().to_ascii_lowercase();
1226 match lowered.as_str() {
1227 "collapsedelimiters" => Some(NameKey::CollapseDelimiters),
1228 "includedelimiters" => Some(NameKey::IncludeDelimiters),
1229 _ => None,
1230 }
1231 })
1232}
1233
1234fn strsplit_name_key(value: &Value) -> Option<StrsplitNameKey> {
1235 value_to_scalar_string(value).and_then(|text| {
1236 let lowered = text.trim().to_ascii_lowercase();
1237 match lowered.as_str() {
1238 "collapsedelimiters" => Some(StrsplitNameKey::CollapseDelimiters),
1239 "delimitertype" => Some(StrsplitNameKey::DelimiterType),
1240 _ => None,
1241 }
1242 })
1243}
1244
#[cfg(test)]
pub(crate) mod tests {
    use super::*;
    use runmat_builtins::{CellArray, LogicalArray, ResolveContext, Tensor, Type};

    /// Synchronous wrapper around the async `split` builtin.
    fn split_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
        futures::executor::block_on(super::split_builtin(text, rest))
    }

    /// Synchronous wrapper around the async `strsplit` builtin.
    fn strsplit_builtin(text: Value, rest: Vec<Value>) -> BuiltinResult<Value> {
        futures::executor::block_on(super::strsplit_builtin(text, rest))
    }

    /// Shorthand for a string-scalar `Value`.
    fn str_value(text: &str) -> Value {
        Value::String(text.to_string())
    }

    /// Owned copies of `items`, for comparison against `StringArray::data`.
    fn owned_strings(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    /// Borrows the string array inside `value`, or panics naming `label`.
    fn expect_string_array<'a>(value: &'a Value, label: &str) -> &'a StringArray {
        match value {
            Value::StringArray(array) => array,
            other => panic!("expected {label}, got {other:?}"),
        }
    }

    /// Asserts that `result` is a string array with the given shape and data.
    fn assert_string_array(result: &Value, shape: &[usize], data: &[&str]) {
        let array = expect_string_array(result, "string array");
        assert_eq!(array.shape, shape.to_vec());
        assert_eq!(array.data, owned_strings(data));
    }

    /// Asserts that `result` is an error whose text contains `needle`.
    fn assert_error_mentions(result: BuiltinResult<Value>, needle: &str) {
        let err = result.unwrap_err();
        assert!(err.to_string().contains(needle));
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_string_whitespace_default() {
        let result =
            split_builtin(str_value("RunMat Accelerate Planner"), Vec::new()).expect("split");
        assert_string_array(&result, &[1, 3], &["RunMat", "Accelerate", "Planner"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_string_custom_delimiter() {
        let result =
            split_builtin(str_value("alpha,beta,gamma"), vec![str_value(",")]).expect("split");
        assert_string_array(&result, &[1, 3], &["alpha", "beta", "gamma"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_include_delimiters_true() {
        let delimiters = StringArray::new(owned_strings(&["+", "-"]), vec![1, 2]).unwrap();
        let args = vec![
            Value::StringArray(delimiters),
            str_value("IncludeDelimiters"),
            Value::Bool(true),
        ];
        let result = split_builtin(str_value("A+B-C"), args).expect("split");
        assert_string_array(&result, &[1, 5], &["A", "+", "B", "-", "C"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_include_delimiters_whitespace_collapse_default() {
        let args = vec![str_value("IncludeDelimiters"), Value::Bool(true)];
        let result = split_builtin(str_value("A B"), args).expect("split");
        assert_string_array(&result, &[1, 3], &["A", " ", "B"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_patterns_include_delimiters_collapse_true() {
        let args = vec![
            str_value(","),
            str_value("IncludeDelimiters"),
            Value::Bool(true),
            str_value("CollapseDelimiters"),
            Value::Bool(true),
        ];
        let result = split_builtin(str_value("a,,b"), args).expect("split");
        assert_string_array(&result, &[1, 3], &["a", ",,", "b"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_collapse_false_preserves_empty_segments() {
        let args = vec![
            str_value(","),
            str_value("CollapseDelimiters"),
            Value::Bool(false),
        ];
        let result = split_builtin(str_value("one,,three,"), args).expect("split");
        assert_string_array(&result, &[1, 4], &["one", "", "three", ""]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_character_array_rows() {
        // Rows are space-padded to a common width and stored one after another.
        let rows = ["GPU Accelerate", "VM Engine"];
        let width = rows
            .iter()
            .map(|row| row.chars().count())
            .max()
            .unwrap();
        let data: Vec<char> = rows
            .iter()
            .flat_map(|row| {
                let padding = width - row.chars().count();
                row.chars().chain(std::iter::repeat(' ').take(padding))
            })
            .collect();
        let input = Value::CharArray(CharArray::new(data, 2, width).unwrap());
        let result = split_builtin(input, Vec::new()).expect("split");
        assert_string_array(&result, &[2, 2], &["GPU", "VM", "Accelerate", "Engine"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_string_array_multiple_columns() {
        let data = owned_strings(&[
            "RunMat Core",
            "VM Interpreter",
            "Accelerate Engine",
            "<missing>",
        ]);
        let input = Value::StringArray(StringArray::new(data, vec![2, 2]).unwrap());
        let result = split_builtin(input, Vec::new()).expect("split");
        assert_string_array(
            &result,
            &[2, 4],
            &[
                "RunMat",
                "VM",
                "Core",
                "Interpreter",
                "Accelerate",
                "<missing>",
                "Engine",
                "<missing>",
            ],
        );
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_cell_array_outputs_string_array() {
        let values = vec![str_value("RunMat Snapshot"), str_value("Fusion Planner")];
        let cell = crate::make_cell(values, 2, 1).expect("cell");
        let result = split_builtin(cell, vec![str_value(" ")]).expect("split");
        assert_string_array(&result, &[2, 2], &["RunMat", "Fusion", "Snapshot", "Planner"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_cell_array_multiple_columns() {
        let values = vec![
            str_value("alpha beta"),
            str_value("gamma"),
            str_value("delta epsilon"),
            str_value("<missing>"),
        ];
        let cell = crate::make_cell(values, 2, 2).expect("cell");
        let result = split_builtin(cell, Vec::new()).expect("split");
        assert_string_array(
            &result,
            &[2, 4],
            &[
                "alpha",
                "delta",
                "beta",
                "epsilon",
                "gamma",
                "<missing>",
                "<missing>",
                "<missing>",
            ],
        );
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_missing_string_propagates() {
        let result = split_builtin(str_value("<missing>"), Vec::new()).expect("split");
        assert_string_array(&result, &[1, 1], &["<missing>"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_invalid_name_value_pair_errors() {
        let args = vec![str_value("CollapseDelimiters")];
        assert_error_mentions(split_builtin(str_value("abc"), args), "name-value");
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_invalid_text_argument_errors() {
        assert_error_mentions(split_builtin(Value::Num(1.0), Vec::new()), "first argument");
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_invalid_delimiter_type_errors() {
        assert_error_mentions(
            split_builtin(str_value("abc"), vec![Value::Num(1.0)]),
            "delimiter input",
        );
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_empty_delimiter_errors() {
        assert_error_mentions(
            split_builtin(str_value("abc"), vec![Value::String(String::new())]),
            "at least one character",
        );
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_unknown_name_argument_errors() {
        let args = vec![str_value("UnknownOption"), Value::Bool(true)];
        assert_error_mentions(split_builtin(str_value("abc"), args), "unrecognized");
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_collapse_delimiters_accepts_logical_array() {
        let logical = LogicalArray::new(vec![1u8], vec![1]).unwrap();
        let args = vec![
            str_value(","),
            str_value("CollapseDelimiters"),
            Value::LogicalArray(logical),
        ];
        let result = split_builtin(str_value("a,,b"), args).expect("split");
        assert_string_array(&result, &[1, 2], &["a", "b"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_include_delimiters_accepts_tensor_scalar() {
        let tensor = Tensor::new(vec![1.0], vec![1, 1]).unwrap();
        let args = vec![
            str_value(","),
            str_value("IncludeDelimiters"),
            Value::Tensor(tensor),
        ];
        let result = split_builtin(str_value("a,b"), args).expect("split");
        assert_string_array(&result, &[1, 3], &["a", ",", "b"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn split_cell_array_mixed_inputs() {
        let char_element =
            Value::CharArray(CharArray::new("gamma".chars().collect(), 1, 5).unwrap());
        let handles: Vec<_> = vec![
            runmat_gc::gc_allocate(str_value("alpha beta")).unwrap(),
            runmat_gc::gc_allocate(char_element).unwrap(),
        ];
        let cell =
            Value::Cell(CellArray::new_handles(handles, 1, 2).expect("cell array construction"));
        let result = split_builtin(cell, Vec::new()).expect("split");
        assert_string_array(&result, &[1, 4], &["alpha", "beta", "gamma", "<missing>"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn strsplit_string_scalar_returns_string_array() {
        let result = strsplit_builtin(str_value("one two three"), Vec::new()).expect("strsplit");
        assert_string_array(&result, &[1, 3], &["one", "two", "three"]);
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn strsplit_char_vector_returns_cell() {
        let input = Value::CharArray(CharArray::new("a,b".chars().collect(), 1, 3).unwrap());
        let result = strsplit_builtin(input, vec![str_value(",")]).expect("strsplit");
        let cell = match result {
            Value::Cell(cell) => cell,
            other => panic!("expected cell output, got {other:?}"),
        };
        assert_eq!(cell.rows, 1);
        assert_eq!(cell.cols, 2);
        assert_eq!(unsafe { &*cell.data[0].as_raw() }, &str_value("a"));
        assert_eq!(unsafe { &*cell.data[1].as_raw() }, &str_value("b"));
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn strsplit_multi_output_returns_matches() {
        let _guard = crate::output_count::push_output_count(Some(2));
        let result =
            strsplit_builtin(str_value("a,,b,"), vec![str_value(",")]).expect("strsplit");
        let values = match &result {
            Value::OutputList(values) => values,
            other => panic!("expected output list, got {other:?}"),
        };
        assert_eq!(values.len(), 2);

        let parts = expect_string_array(&values[0], "first output string array");
        assert_eq!(parts.data, owned_strings(&["a", "b", ""]));

        let matches = expect_string_array(&values[1], "second output string array");
        assert_eq!(matches.data, owned_strings(&[",,", ","]));
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn strsplit_regular_expression_mode() {
        let _guard = crate::output_count::push_output_count(Some(2));
        let args = vec![
            str_value("\\s*m/s\\s*"),
            str_value("DelimiterType"),
            str_value("RegularExpression"),
        ];
        let result = strsplit_builtin(str_value("1.21m/s 1.985 m/s"), args).expect("strsplit");
        let values = match &result {
            Value::OutputList(values) => values,
            other => panic!("expected output list, got {other:?}"),
        };

        let parts = expect_string_array(&values[0], "split output string array");
        assert_eq!(parts.data, owned_strings(&["1.21", "1.985", ""]));

        let matches = expect_string_array(&values[1], "matches output string array");
        assert_eq!(matches.data, owned_strings(&["m/s ", " m/s"]));
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn strsplit_collapse_false_preserves_empty_segments() {
        let args = vec![
            str_value(","),
            str_value("CollapseDelimiters"),
            Value::Bool(false),
        ];
        let result = strsplit_builtin(str_value("a,,b"), args).expect("strsplit");
        let array = expect_string_array(&result, "string array");
        assert_eq!(array.data, owned_strings(&["a", "", "b"]));
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn strsplit_rejects_nonscalar_text_inputs() {
        let input = Value::StringArray(
            StringArray::new(owned_strings(&["a b", "c d"]), vec![2, 1]).unwrap(),
        );
        assert_error_mentions(strsplit_builtin(input, Vec::new()), "first argument");
    }

    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    #[test]
    fn strsplit_invalid_delimiter_type_option_errors() {
        let args = vec![
            str_value(","),
            str_value("DelimiterType"),
            str_value("BadMode"),
        ];
        assert_error_mentions(strsplit_builtin(str_value("a,b"), args), "DelimiterType");
    }

    #[test]
    fn split_type_is_string_array() {
        let resolved = string_array_type(&[Type::String], &ResolveContext::new(Vec::new()));
        assert_eq!(resolved, Type::cell_of(Type::String));
    }
}