nu_command/strings/split/words.rs
1use crate::{grapheme_flags, grapheme_flags_const};
2use fancy_regex::Regex;
3use nu_engine::command_prelude::*;
4
5use unicode_segmentation::UnicodeSegmentation;
6
/// The `split words` command.
///
/// A field-less marker type: all behaviour lives in its `Command`
/// implementation, and the flags of a given invocation are read from the call.
#[derive(Clone)]
pub struct SplitWords;
9
10impl Command for SplitWords {
11 fn name(&self) -> &str {
12 "split words"
13 }
14
15 fn signature(&self) -> Signature {
16 Signature::build("split words")
17 .input_output_types(vec![
18 (Type::String, Type::List(Box::new(Type::String))),
19 (
20 Type::List(Box::new(Type::String)),
21 Type::List(Box::new(Type::List(Box::new(Type::String))))
22 ),
23 ])
24 .allow_variants_without_examples(true)
25 .category(Category::Strings)
26 // .switch(
27 // "ignore-hyphenated",
28 // "ignore hyphenated words, splitting at the hyphen",
29 // Some('i'),
30 // )
31 // .switch(
32 // "ignore-apostrophes",
33 // "ignore apostrophes in words by removing them",
34 // Some('a'),
35 // )
36 // .switch(
37 // "ignore-punctuation",
38 // "ignore punctuation around words by removing them",
39 // Some('p'),
40 // )
41 .named(
42 "min-word-length",
43 SyntaxShape::Int,
44 "The minimum word length",
45 Some('l'),
46 )
47 .switch(
48 "grapheme-clusters",
49 "measure word length in grapheme clusters (requires -l)",
50 Some('g'),
51 )
52 .switch(
53 "utf-8-bytes",
54 "measure word length in UTF-8 bytes (default; requires -l; non-ASCII chars are length 2+)",
55 Some('b'),
56 )
57 }
58
59 fn description(&self) -> &str {
60 "Split a string's words into separate rows."
61 }
62
63 fn search_terms(&self) -> Vec<&str> {
64 vec!["separate", "divide"]
65 }
66
67 fn examples(&self) -> Vec<Example> {
68 vec![
69 Example {
70 description: "Split the string's words into separate rows",
71 example: "'hello world' | split words",
72 result: Some(Value::list(
73 vec![Value::test_string("hello"), Value::test_string("world")],
74 Span::test_data(),
75 )),
76 },
77 Example {
78 description:
79 "Split the string's words, of at least 3 characters, into separate rows",
80 example: "'hello to the world' | split words --min-word-length 3",
81 result: Some(Value::list(
82 vec![
83 Value::test_string("hello"),
84 Value::test_string("the"),
85 Value::test_string("world"),
86 ],
87 Span::test_data(),
88 )),
89 },
90 Example {
91 description:
92 "A real-world example of splitting words",
93 example: "http get https://www.gutenberg.org/files/11/11-0.txt | str downcase | split words --min-word-length 2 | uniq --count | sort-by count --reverse | first 10",
94 result: None,
95 },
96 ]
97 }
98
99 fn is_const(&self) -> bool {
100 true
101 }
102
103 fn run(
104 &self,
105 engine_state: &EngineState,
106 stack: &mut Stack,
107 call: &Call,
108 input: PipelineData,
109 ) -> Result<PipelineData, ShellError> {
110 let word_length: Option<usize> = call.get_flag(engine_state, stack, "min-word-length")?;
111 let has_grapheme = call.has_flag(engine_state, stack, "grapheme-clusters")?;
112 let has_utf8 = call.has_flag(engine_state, stack, "utf-8-bytes")?;
113 let graphemes = grapheme_flags(engine_state, stack, call)?;
114
115 let args = Arguments {
116 word_length,
117 has_grapheme,
118 has_utf8,
119 graphemes,
120 };
121 split_words(engine_state, call, input, args)
122 }
123
124 fn run_const(
125 &self,
126 working_set: &StateWorkingSet,
127 call: &Call,
128 input: PipelineData,
129 ) -> Result<PipelineData, ShellError> {
130 let word_length: Option<usize> = call.get_flag_const(working_set, "min-word-length")?;
131 let has_grapheme = call.has_flag_const(working_set, "grapheme-clusters")?;
132 let has_utf8 = call.has_flag_const(working_set, "utf-8-bytes")?;
133 let graphemes = grapheme_flags_const(working_set, call)?;
134
135 let args = Arguments {
136 word_length,
137 has_grapheme,
138 has_utf8,
139 graphemes,
140 };
141 split_words(working_set.permanent(), call, input, args)
142 }
143}
144
/// Flag values collected by `run` / `run_const` and handed to `split_words`.
struct Arguments {
    /// Value of `--min-word-length`; `None` when the flag was not given, in
    /// which case no length filtering takes place.
    word_length: Option<usize>,
    /// Whether `--grapheme-clusters` was passed. Only used to reject the
    /// switch when `--min-word-length` is missing.
    has_grapheme: bool,
    /// Whether `--utf-8-bytes` was passed. Only used to reject the switch
    /// when `--min-word-length` is missing.
    has_utf8: bool,
    /// Result of `grapheme_flags`: when `true`, word length is counted in
    /// grapheme clusters, otherwise in bytes.
    graphemes: bool,
}
151
152fn split_words(
153 engine_state: &EngineState,
154 call: &Call,
155 input: PipelineData,
156 args: Arguments,
157) -> Result<PipelineData, ShellError> {
158 let span = call.head;
159 // let ignore_hyphenated = call.has_flag(engine_state, stack, "ignore-hyphenated")?;
160 // let ignore_apostrophes = call.has_flag(engine_state, stack, "ignore-apostrophes")?;
161 // let ignore_punctuation = call.has_flag(engine_state, stack, "ignore-punctuation")?;
162
163 if args.word_length.is_none() {
164 if args.has_grapheme {
165 return Err(ShellError::IncompatibleParametersSingle {
166 msg: "--grapheme-clusters (-g) requires --min-word-length (-l)".to_string(),
167 span,
168 });
169 }
170 if args.has_utf8 {
171 return Err(ShellError::IncompatibleParametersSingle {
172 msg: "--utf-8-bytes (-b) requires --min-word-length (-l)".to_string(),
173 span,
174 });
175 }
176 }
177
178 input.map(
179 move |x| split_words_helper(&x, args.word_length, span, args.graphemes),
180 engine_state.signals(),
181 )
182}
183
184fn split_words_helper(v: &Value, word_length: Option<usize>, span: Span, graphemes: bool) -> Value {
185 // There are some options here with this regex.
186 // [^A-Za-z\'] = do not match uppercase or lowercase letters or apostrophes
187 // [^[:alpha:]\'] = do not match any uppercase or lowercase letters or apostrophes
188 // [^\p{L}\'] = do not match any unicode uppercase or lowercase letters or apostrophes
189 // Let's go with the unicode one in hopes that it works on more than just ascii characters
190 let regex_replace = Regex::new(r"[^\p{L}\p{N}\']").expect("regular expression error");
191 let v_span = v.span();
192
193 match v {
194 Value::Error { error, .. } => Value::error(*error.clone(), v_span),
195 v => {
196 let v_span = v.span();
197 if let Ok(s) = v.coerce_str() {
198 // let splits = s.unicode_words();
199 // let words = trim_to_words(s);
200 // let words: Vec<&str> = s.split_whitespace().collect();
201
202 let replaced_string = regex_replace.replace_all(&s, " ").to_string();
203 let words = replaced_string
204 .split(' ')
205 .filter_map(|s| {
206 if s.trim() != "" {
207 if let Some(len) = word_length {
208 if if graphemes {
209 s.graphemes(true).count()
210 } else {
211 s.len()
212 } >= len
213 {
214 Some(Value::string(s, v_span))
215 } else {
216 None
217 }
218 } else {
219 Some(Value::string(s, v_span))
220 }
221 } else {
222 None
223 }
224 })
225 .collect::<Vec<Value>>();
226 Value::list(words, v_span)
227 } else {
228 Value::error(
229 ShellError::OnlySupportsThisInputType {
230 exp_input_type: "string".into(),
231 wrong_type: v.get_type().to_string(),
232 dst_span: span,
233 src_span: v_span,
234 },
235 v_span,
236 )
237 }
238 }
239 }
240}
241
242// original at least 1 char long
243// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{1,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
244// benchmark INCLUDING DOWNLOAD: 1sec 253ms 91µs 511ns
245// 1839 the
246// 942 and
247// 811 to
248// 695 a
249// 638 of
250// 610 it
251// 553 she
252// 546 i
253// 486 you
254// 462 said
255
256// original at least 2 chars long
257// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{2,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
258// 1839 the
259// 942 and
260// 811 to
261// 638 of
262// 610 it
263// 553 she
264// 486 you
265// 462 said
266// 435 in
267// 403 alice
268
269// regex means, replace everything that is not A-Z or a-z or ' with a space
270// ❯ $contents | str replace "[^A-Za-z\']" " " -a | split row ' ' | where ($it | str length) > 1 | uniq -i -c | sort-by count --reverse | first 10
271// benchmark: 1sec 775ms 471µs 600ns
272// ╭───┬───────┬───────╮
273// │ # │ value │ count │
274// ├───┼───────┼───────┤
275// │ 0 │ the │ 1839 │
276// │ 1 │ and │ 942 │
277// │ 2 │ to │ 811 │
278// │ 3 │ of │ 638 │
279// │ 4 │ it │ 610 │
280// │ 5 │ she │ 553 │
281// │ 6 │ you │ 486 │
282// │ 7 │ said │ 462 │
283// │ 8 │ in │ 435 │
284// │ 9 │ alice │ 403 │
285// ╰───┴───────┴───────╯
286
287// $alice |str replace "[^A-Za-z\']" " " -a | split row ' ' | uniq -i -c | sort-by count --reverse | first 10
288// benchmark: 1sec 518ms 701µs 200ns
289// ╭───┬───────┬───────╮
290// │ # │ value │ count │
291// ├───┼───────┼───────┤
292// │ 0 │ the │ 1839 │
293// │ 1 │ and │ 942 │
294// │ 2 │ to │ 811 │
295// │ 3 │ a │ 695 │
296// │ 4 │ of │ 638 │
297// │ 5 │ it │ 610 │
298// │ 6 │ she │ 553 │
299// │ 7 │ i │ 546 │
300// │ 8 │ you │ 486 │
301// │ 9 │ said │ 462 │
302// ├───┼───────┼───────┤
303// │ # │ value │ count │
304// ╰───┴───────┴───────╯
305
306// s.unicode_words()
307// $alice | str downcase | split words | sort | uniq -c | sort-by count | reverse | first 10
308// benchmark: 4sec 965ms 285µs 800ns
309// ╭───┬───────┬───────╮
310// │ # │ value │ count │
311// ├───┼───────┼───────┤
312// │ 0 │ the │ 1839 │
313// │ 1 │ and │ 941 │
314// │ 2 │ to │ 811 │
315// │ 3 │ a │ 695 │
316// │ 4 │ of │ 638 │
317// │ 5 │ it │ 542 │
318// │ 6 │ she │ 538 │
319// │ 7 │ said │ 460 │
320// │ 8 │ in │ 434 │
321// │ 9 │ you │ 426 │
322// ├───┼───────┼───────┤
323// │ # │ value │ count │
324// ╰───┴───────┴───────╯
325
326// trim_to_words
327// benchmark: 5sec 992ms 76µs 200ns
328// ╭───┬───────┬───────╮
329// │ # │ value │ count │
330// ├───┼───────┼───────┤
331// │ 0 │ the │ 1829 │
332// │ 1 │ and │ 918 │
333// │ 2 │ to │ 801 │
334// │ 3 │ a │ 689 │
335// │ 4 │ of │ 632 │
336// │ 5 │ she │ 537 │
337// │ 6 │ it │ 493 │
338// │ 7 │ said │ 457 │
339// │ 8 │ in │ 430 │
340// │ 9 │ you │ 413 │
341// ├───┼───────┼───────┤
342// │ # │ value │ count │
343// ╰───┴───────┴───────╯
344
345// fn trim_to_words(content: String) -> std::vec::Vec<std::string::String> {
346// let content: Vec<String> = content
347// .to_lowercase()
348// .replace(&['-'][..], " ")
349// //should 's be replaced?
350// .replace("'s", "")
351// .replace(
352// &[
353// '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '-', '_', '/', '\'',
354// '’', '?', '!', '“', '‘',
355// ][..],
356// "",
357// )
358// .split_whitespace()
359// .map(String::from)
360// .collect::<Vec<String>>();
361// content
362// }
363
364// split_whitespace()
365// benchmark: 9sec 379ms 790µs 900ns
366// ╭───┬───────┬───────╮
367// │ # │ value │ count │
368// ├───┼───────┼───────┤
369// │ 0 │ the │ 1683 │
370// │ 1 │ and │ 783 │
371// │ 2 │ to │ 778 │
372// │ 3 │ a │ 667 │
373// │ 4 │ of │ 605 │
374// │ 5 │ she │ 485 │
375// │ 6 │ said │ 416 │
376// │ 7 │ in │ 406 │
377// │ 8 │ it │ 357 │
378// │ 9 │ was │ 329 │
379// ├───┼───────┼───────┤
380// │ # │ value │ count │
381// ╰───┴───────┴───────╯
382
383// current
384// $alice | str downcase | split words | uniq -c | sort-by count --reverse | first 10
385// benchmark: 1sec 481ms 604µs 700ns
386// ╭───┬───────┬───────╮
387// │ # │ value │ count │
388// ├───┼───────┼───────┤
389// │ 0 │ the │ 1839 │
390// │ 1 │ and │ 942 │
391// │ 2 │ to │ 811 │
392// │ 3 │ a │ 695 │
393// │ 4 │ of │ 638 │
394// │ 5 │ it │ 610 │
395// │ 6 │ she │ 553 │
396// │ 7 │ i │ 546 │
397// │ 8 │ you │ 486 │
398// │ 9 │ said │ 462 │
399// ├───┼───────┼───────┤
400// │ # │ value │ count │
401// ╰───┴───────┴───────╯
402
#[cfg(test)]
mod test {
    use super::*;
    use nu_test_support::nu;

    /// Passing `-b` and `-g` together must be reported as incompatible.
    #[test]
    fn test_incompat_flags() {
        let outcome = nu!("'a' | split words -bg -l 2");
        assert!(
            outcome.err.contains("incompatible_parameters"),
            "expected an incompatible_parameters error"
        );
    }

    /// Passing `-g` without `-l` must be reported as incompatible.
    #[test]
    fn test_incompat_flags_2() {
        let outcome = nu!("'a' | split words -g");
        assert!(
            outcome.err.contains("incompatible_parameters"),
            "expected an incompatible_parameters error"
        );
    }

    /// Runs the examples declared by the command.
    #[test]
    fn test_examples() {
        crate::test_examples(SplitWords)
    }

    /// Words mixing letters and digits must stay in one piece.
    #[test]
    fn mixed_letter_number() {
        let outcome = nu!(r#"echo "a1 b2 c3" | split words | str join ','"#);
        assert_eq!(outcome.out, "a1,b2,c3");
    }
}
431}