nu_command/strings/split/words.rs
1use crate::{grapheme_flags, grapheme_flags_const};
2use fancy_regex::Regex;
3use nu_engine::command_prelude::*;
4
5use unicode_segmentation::UnicodeSegmentation;
6
7#[derive(Clone)]
8pub struct SplitWords;
9
10impl Command for SplitWords {
11 fn name(&self) -> &str {
12 "split words"
13 }
14
15 fn signature(&self) -> Signature {
16 Signature::build("split words")
17 .input_output_types(vec![
18 (Type::String, Type::List(Box::new(Type::String))),
19 (
20 Type::List(Box::new(Type::String)),
21 Type::List(Box::new(Type::List(Box::new(Type::String))))
22 ),
23 ])
24 .allow_variants_without_examples(true)
25 .category(Category::Strings)
26 // .switch(
27 // "ignore-hyphenated",
28 // "ignore hyphenated words, splitting at the hyphen",
29 // Some('i'),
30 // )
31 // .switch(
32 // "ignore-apostrophes",
33 // "ignore apostrophes in words by removing them",
34 // Some('a'),
35 // )
36 // .switch(
37 // "ignore-punctuation",
38 // "ignore punctuation around words by removing them",
39 // Some('p'),
40 // )
41 .named(
42 "min-word-length",
43 SyntaxShape::Int,
44 "The minimum word length",
45 Some('l'),
46 )
47 .switch(
48 "grapheme-clusters",
49 "measure word length in grapheme clusters (requires -l)",
50 Some('g'),
51 )
52 .switch(
53 "utf-8-bytes",
54 "measure word length in UTF-8 bytes (default; requires -l; non-ASCII chars are length 2+)",
55 Some('b'),
56 )
57 }
58
59 fn description(&self) -> &str {
60 "Split a string's words into separate rows."
61 }
62
63 fn search_terms(&self) -> Vec<&str> {
64 vec!["separate", "divide"]
65 }
66
67 fn examples(&self) -> Vec<Example<'_>> {
68 vec![
69 Example {
70 description: "Split the string's words into separate rows",
71 example: "'hello world' | split words",
72 result: Some(Value::list(
73 vec![Value::test_string("hello"), Value::test_string("world")],
74 Span::test_data(),
75 )),
76 },
77 Example {
78 description: "Split the string's words, of at least 3 characters, into separate rows",
79 example: "'hello to the world' | split words --min-word-length 3",
80 result: Some(Value::list(
81 vec![
82 Value::test_string("hello"),
83 Value::test_string("the"),
84 Value::test_string("world"),
85 ],
86 Span::test_data(),
87 )),
88 },
89 Example {
90 description: "A real-world example of splitting words",
91 example: "http get https://www.gutenberg.org/files/11/11-0.txt | str downcase | split words --min-word-length 2 | uniq --count | sort-by count --reverse | first 10",
92 result: None,
93 },
94 ]
95 }
96
97 fn is_const(&self) -> bool {
98 true
99 }
100
101 fn run(
102 &self,
103 engine_state: &EngineState,
104 stack: &mut Stack,
105 call: &Call,
106 input: PipelineData,
107 ) -> Result<PipelineData, ShellError> {
108 let word_length: Option<usize> = call.get_flag(engine_state, stack, "min-word-length")?;
109 let has_grapheme = call.has_flag(engine_state, stack, "grapheme-clusters")?;
110 let has_utf8 = call.has_flag(engine_state, stack, "utf-8-bytes")?;
111 let graphemes = grapheme_flags(engine_state, stack, call)?;
112
113 let args = Arguments {
114 word_length,
115 has_grapheme,
116 has_utf8,
117 graphemes,
118 };
119 split_words(engine_state, call, input, args)
120 }
121
122 fn run_const(
123 &self,
124 working_set: &StateWorkingSet,
125 call: &Call,
126 input: PipelineData,
127 ) -> Result<PipelineData, ShellError> {
128 let word_length: Option<usize> = call.get_flag_const(working_set, "min-word-length")?;
129 let has_grapheme = call.has_flag_const(working_set, "grapheme-clusters")?;
130 let has_utf8 = call.has_flag_const(working_set, "utf-8-bytes")?;
131 let graphemes = grapheme_flags_const(working_set, call)?;
132
133 let args = Arguments {
134 word_length,
135 has_grapheme,
136 has_utf8,
137 graphemes,
138 };
139 split_words(working_set.permanent(), call, input, args)
140 }
141}
142
/// Flag values collected by `run` / `run_const` and handed to `split_words`.
struct Arguments {
    /// Value of `--min-word-length`, if it was given.
    word_length: Option<usize>,
    /// Whether `--grapheme-clusters` was present on the call.
    has_grapheme: bool,
    /// Whether `--utf-8-bytes` was present on the call.
    has_utf8: bool,
    /// Outcome of the `grapheme_flags` helper; when `true`, word length is
    /// counted in grapheme clusters instead of bytes.
    graphemes: bool,
}
149
150fn split_words(
151 engine_state: &EngineState,
152 call: &Call,
153 input: PipelineData,
154 args: Arguments,
155) -> Result<PipelineData, ShellError> {
156 let span = call.head;
157 // let ignore_hyphenated = call.has_flag(engine_state, stack, "ignore-hyphenated")?;
158 // let ignore_apostrophes = call.has_flag(engine_state, stack, "ignore-apostrophes")?;
159 // let ignore_punctuation = call.has_flag(engine_state, stack, "ignore-punctuation")?;
160
161 if args.word_length.is_none() {
162 if args.has_grapheme {
163 return Err(ShellError::IncompatibleParametersSingle {
164 msg: "--grapheme-clusters (-g) requires --min-word-length (-l)".to_string(),
165 span,
166 });
167 }
168 if args.has_utf8 {
169 return Err(ShellError::IncompatibleParametersSingle {
170 msg: "--utf-8-bytes (-b) requires --min-word-length (-l)".to_string(),
171 span,
172 });
173 }
174 }
175
176 input.map(
177 move |x| split_words_helper(&x, args.word_length, span, args.graphemes),
178 engine_state.signals(),
179 )
180}
181
182fn split_words_helper(v: &Value, word_length: Option<usize>, span: Span, graphemes: bool) -> Value {
183 // There are some options here with this regex.
184 // [^A-Za-z\'] = do not match uppercase or lowercase letters or apostrophes
185 // [^[:alpha:]\'] = do not match any uppercase or lowercase letters or apostrophes
186 // [^\p{L}\'] = do not match any unicode uppercase or lowercase letters or apostrophes
187 // Let's go with the unicode one in hopes that it works on more than just ascii characters
188 let regex_replace = Regex::new(r"[^\p{L}\p{N}\']").expect("regular expression error");
189 let v_span = v.span();
190
191 match v {
192 Value::Error { error, .. } => Value::error(*error.clone(), v_span),
193 v => {
194 let v_span = v.span();
195 if let Ok(s) = v.as_str() {
196 // let splits = s.unicode_words();
197 // let words = trim_to_words(s);
198 // let words: Vec<&str> = s.split_whitespace().collect();
199
200 let replaced_string = regex_replace.replace_all(s, " ").to_string();
201 let words = replaced_string
202 .split(' ')
203 .filter_map(|s| {
204 if s.trim() != "" {
205 if let Some(len) = word_length {
206 if if graphemes {
207 s.graphemes(true).count()
208 } else {
209 s.len()
210 } >= len
211 {
212 Some(Value::string(s, v_span))
213 } else {
214 None
215 }
216 } else {
217 Some(Value::string(s, v_span))
218 }
219 } else {
220 None
221 }
222 })
223 .collect::<Vec<Value>>();
224 Value::list(words, v_span)
225 } else {
226 Value::error(
227 ShellError::OnlySupportsThisInputType {
228 exp_input_type: "string".into(),
229 wrong_type: v.get_type().to_string(),
230 dst_span: span,
231 src_span: v_span,
232 },
233 v_span,
234 )
235 }
236 }
237 }
238}
239
240// original at least 1 char long
241// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{1,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
242// benchmark INCLUDING DOWNLOAD: 1sec 253ms 91µs 511ns
243// 1839 the
244// 942 and
245// 811 to
246// 695 a
247// 638 of
248// 610 it
249// 553 she
250// 546 i
251// 486 you
252// 462 said
253
254// original at least 2 chars long
255// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{2,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
256// 1839 the
257// 942 and
258// 811 to
259// 638 of
260// 610 it
261// 553 she
262// 486 you
263// 462 said
264// 435 in
265// 403 alice
266
// The regex replaces every character that is not A-Z, a-z, or ' with a space.
268// ❯ $contents | str replace "[^A-Za-z\']" " " -a | split row ' ' | where ($it | str length) > 1 | uniq -i -c | sort-by count --reverse | first 10
269// benchmark: 1sec 775ms 471µs 600ns
270// ╭───┬───────┬───────╮
271// │ # │ value │ count │
272// ├───┼───────┼───────┤
273// │ 0 │ the │ 1839 │
274// │ 1 │ and │ 942 │
275// │ 2 │ to │ 811 │
276// │ 3 │ of │ 638 │
277// │ 4 │ it │ 610 │
278// │ 5 │ she │ 553 │
279// │ 6 │ you │ 486 │
280// │ 7 │ said │ 462 │
281// │ 8 │ in │ 435 │
282// │ 9 │ alice │ 403 │
283// ╰───┴───────┴───────╯
284
285// $alice |str replace "[^A-Za-z\']" " " -a | split row ' ' | uniq -i -c | sort-by count --reverse | first 10
286// benchmark: 1sec 518ms 701µs 200ns
287// ╭───┬───────┬───────╮
288// │ # │ value │ count │
289// ├───┼───────┼───────┤
290// │ 0 │ the │ 1839 │
291// │ 1 │ and │ 942 │
292// │ 2 │ to │ 811 │
293// │ 3 │ a │ 695 │
294// │ 4 │ of │ 638 │
295// │ 5 │ it │ 610 │
296// │ 6 │ she │ 553 │
297// │ 7 │ i │ 546 │
298// │ 8 │ you │ 486 │
299// │ 9 │ said │ 462 │
300// ├───┼───────┼───────┤
301// │ # │ value │ count │
302// ╰───┴───────┴───────╯
303
304// s.unicode_words()
305// $alice | str downcase | split words | sort | uniq -c | sort-by count | reverse | first 10
306// benchmark: 4sec 965ms 285µs 800ns
307// ╭───┬───────┬───────╮
308// │ # │ value │ count │
309// ├───┼───────┼───────┤
310// │ 0 │ the │ 1839 │
311// │ 1 │ and │ 941 │
312// │ 2 │ to │ 811 │
313// │ 3 │ a │ 695 │
314// │ 4 │ of │ 638 │
315// │ 5 │ it │ 542 │
316// │ 6 │ she │ 538 │
317// │ 7 │ said │ 460 │
318// │ 8 │ in │ 434 │
319// │ 9 │ you │ 426 │
320// ├───┼───────┼───────┤
321// │ # │ value │ count │
322// ╰───┴───────┴───────╯
323
324// trim_to_words
325// benchmark: 5sec 992ms 76µs 200ns
326// ╭───┬───────┬───────╮
327// │ # │ value │ count │
328// ├───┼───────┼───────┤
329// │ 0 │ the │ 1829 │
330// │ 1 │ and │ 918 │
331// │ 2 │ to │ 801 │
332// │ 3 │ a │ 689 │
333// │ 4 │ of │ 632 │
334// │ 5 │ she │ 537 │
335// │ 6 │ it │ 493 │
336// │ 7 │ said │ 457 │
337// │ 8 │ in │ 430 │
338// │ 9 │ you │ 413 │
339// ├───┼───────┼───────┤
340// │ # │ value │ count │
341// ╰───┴───────┴───────╯
342
343// fn trim_to_words(content: String) -> std::vec::Vec<std::string::String> {
344// let content: Vec<String> = content
345// .to_lowercase()
346// .replace(&['-'][..], " ")
347// //should 's be replaced?
348// .replace("'s", "")
349// .replace(
350// &[
351// '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '-', '_', '/', '\'',
352// '’', '?', '!', '“', '‘',
353// ][..],
354// "",
355// )
356// .split_whitespace()
357// .map(String::from)
358// .collect::<Vec<String>>();
359// content
360// }
361
362// split_whitespace()
363// benchmark: 9sec 379ms 790µs 900ns
364// ╭───┬───────┬───────╮
365// │ # │ value │ count │
366// ├───┼───────┼───────┤
367// │ 0 │ the │ 1683 │
368// │ 1 │ and │ 783 │
369// │ 2 │ to │ 778 │
370// │ 3 │ a │ 667 │
371// │ 4 │ of │ 605 │
372// │ 5 │ she │ 485 │
373// │ 6 │ said │ 416 │
374// │ 7 │ in │ 406 │
375// │ 8 │ it │ 357 │
376// │ 9 │ was │ 329 │
377// ├───┼───────┼───────┤
378// │ # │ value │ count │
379// ╰───┴───────┴───────╯
380
381// current
382// $alice | str downcase | split words | uniq -c | sort-by count --reverse | first 10
383// benchmark: 1sec 481ms 604µs 700ns
384// ╭───┬───────┬───────╮
385// │ # │ value │ count │
386// ├───┼───────┼───────┤
387// │ 0 │ the │ 1839 │
388// │ 1 │ and │ 942 │
389// │ 2 │ to │ 811 │
390// │ 3 │ a │ 695 │
391// │ 4 │ of │ 638 │
392// │ 5 │ it │ 610 │
393// │ 6 │ she │ 553 │
394// │ 7 │ i │ 546 │
395// │ 8 │ you │ 486 │
396// │ 9 │ said │ 462 │
397// ├───┼───────┼───────┤
398// │ # │ value │ count │
399// ╰───┴───────┴───────╯
400
#[cfg(test)]
mod test {
    use super::*;
    use nu_test_support::nu;

    /// Passing `-b` and `-g` together is expected to be rejected, even with `-l`.
    #[test]
    fn test_incompat_flags() {
        let stderr = nu!("'a' | split words -bg -l 2").err;
        assert!(stderr.contains("incompatible_parameters"));
    }

    /// `-g` without `-l` is expected to be rejected.
    #[test]
    fn test_incompat_flags_2() {
        let stderr = nu!("'a' | split words -g").err;
        assert!(stderr.contains("incompatible_parameters"));
    }

    /// Runs the examples declared by the command.
    #[test]
    fn test_examples() {
        crate::test_examples(SplitWords)
    }

    /// Words mixing letters and digits must stay in one piece.
    #[test]
    fn mixed_letter_number() {
        let stdout = nu!(r#"echo "a1 b2 c3" | split words | str join ','"#).out;
        assert_eq!(stdout, "a1,b2,c3");
    }
}
429}