nu_command/strings/split/
words.rs

1use crate::{grapheme_flags, grapheme_flags_const};
2use fancy_regex::Regex;
3use nu_engine::command_prelude::*;
4
5use unicode_segmentation::UnicodeSegmentation;
6
/// The `split words` command.
///
/// A stateless marker type: all behavior lives in its `Command` implementation.
#[derive(Clone)]
pub struct SplitWords;
9
10impl Command for SplitWords {
11    fn name(&self) -> &str {
12        "split words"
13    }
14
15    fn signature(&self) -> Signature {
16        Signature::build("split words")
17            .input_output_types(vec![
18                (Type::String, Type::List(Box::new(Type::String))),
19                (
20                    Type::List(Box::new(Type::String)),
21                    Type::List(Box::new(Type::List(Box::new(Type::String))))
22                ),
23            ])
24            .allow_variants_without_examples(true)
25            .category(Category::Strings)
26            // .switch(
27            //     "ignore-hyphenated",
28            //     "ignore hyphenated words, splitting at the hyphen",
29            //     Some('i'),
30            // )
31            // .switch(
32            //     "ignore-apostrophes",
33            //     "ignore apostrophes in words by removing them",
34            //     Some('a'),
35            // )
36            // .switch(
37            //     "ignore-punctuation",
38            //     "ignore punctuation around words by removing them",
39            //     Some('p'),
40            // )
41            .named(
42                "min-word-length",
43                SyntaxShape::Int,
44                "The minimum word length",
45                Some('l'),
46            )
47            .switch(
48                "grapheme-clusters",
49                "measure word length in grapheme clusters (requires -l)",
50                Some('g'),
51            )
52            .switch(
53                "utf-8-bytes",
54                "measure word length in UTF-8 bytes (default; requires -l; non-ASCII chars are length 2+)",
55                Some('b'),
56            )
57    }
58
59    fn description(&self) -> &str {
60        "Split a string's words into separate rows."
61    }
62
63    fn search_terms(&self) -> Vec<&str> {
64        vec!["separate", "divide"]
65    }
66
67    fn examples(&self) -> Vec<Example> {
68        vec![
69            Example {
70                description: "Split the string's words into separate rows",
71                example: "'hello world' | split words",
72                result: Some(Value::list(
73                    vec![Value::test_string("hello"), Value::test_string("world")],
74                    Span::test_data(),
75                )),
76            },
77            Example {
78                description:
79                    "Split the string's words, of at least 3 characters, into separate rows",
80                example: "'hello to the world' | split words --min-word-length 3",
81                result: Some(Value::list(
82                    vec![
83                        Value::test_string("hello"),
84                        Value::test_string("the"),
85                        Value::test_string("world"),
86                    ],
87                    Span::test_data(),
88                )),
89            },
90            Example {
91                description:
92                    "A real-world example of splitting words",
93                example: "http get https://www.gutenberg.org/files/11/11-0.txt | str downcase | split words --min-word-length 2 | uniq --count | sort-by count --reverse | first 10",
94                result: None,
95            },
96        ]
97    }
98
99    fn is_const(&self) -> bool {
100        true
101    }
102
103    fn run(
104        &self,
105        engine_state: &EngineState,
106        stack: &mut Stack,
107        call: &Call,
108        input: PipelineData,
109    ) -> Result<PipelineData, ShellError> {
110        let word_length: Option<usize> = call.get_flag(engine_state, stack, "min-word-length")?;
111        let has_grapheme = call.has_flag(engine_state, stack, "grapheme-clusters")?;
112        let has_utf8 = call.has_flag(engine_state, stack, "utf-8-bytes")?;
113        let graphemes = grapheme_flags(engine_state, stack, call)?;
114
115        let args = Arguments {
116            word_length,
117            has_grapheme,
118            has_utf8,
119            graphemes,
120        };
121        split_words(engine_state, call, input, args)
122    }
123
124    fn run_const(
125        &self,
126        working_set: &StateWorkingSet,
127        call: &Call,
128        input: PipelineData,
129    ) -> Result<PipelineData, ShellError> {
130        let word_length: Option<usize> = call.get_flag_const(working_set, "min-word-length")?;
131        let has_grapheme = call.has_flag_const(working_set, "grapheme-clusters")?;
132        let has_utf8 = call.has_flag_const(working_set, "utf-8-bytes")?;
133        let graphemes = grapheme_flags_const(working_set, call)?;
134
135        let args = Arguments {
136            word_length,
137            has_grapheme,
138            has_utf8,
139            graphemes,
140        };
141        split_words(working_set.permanent(), call, input, args)
142    }
143}
144
/// Flag values collected by `run` / `run_const` and handed to `split_words`.
struct Arguments {
    /// Value of `--min-word-length`, if it was given.
    word_length: Option<usize>,
    /// Whether `--grapheme-clusters` was passed explicitly.
    has_grapheme: bool,
    /// Whether `--utf-8-bytes` was passed explicitly.
    has_utf8: bool,
    /// Result of `grapheme_flags`: when `true`, word length is counted in grapheme clusters,
    /// otherwise in bytes.
    graphemes: bool,
}
151
152fn split_words(
153    engine_state: &EngineState,
154    call: &Call,
155    input: PipelineData,
156    args: Arguments,
157) -> Result<PipelineData, ShellError> {
158    let span = call.head;
159    // let ignore_hyphenated = call.has_flag(engine_state, stack, "ignore-hyphenated")?;
160    // let ignore_apostrophes = call.has_flag(engine_state, stack, "ignore-apostrophes")?;
161    // let ignore_punctuation = call.has_flag(engine_state, stack, "ignore-punctuation")?;
162
163    if args.word_length.is_none() {
164        if args.has_grapheme {
165            return Err(ShellError::IncompatibleParametersSingle {
166                msg: "--grapheme-clusters (-g) requires --min-word-length (-l)".to_string(),
167                span,
168            });
169        }
170        if args.has_utf8 {
171            return Err(ShellError::IncompatibleParametersSingle {
172                msg: "--utf-8-bytes (-b) requires --min-word-length (-l)".to_string(),
173                span,
174            });
175        }
176    }
177
178    input.map(
179        move |x| split_words_helper(&x, args.word_length, span, args.graphemes),
180        engine_state.signals(),
181    )
182}
183
184fn split_words_helper(v: &Value, word_length: Option<usize>, span: Span, graphemes: bool) -> Value {
185    // There are some options here with this regex.
186    // [^A-Za-z\'] = do not match uppercase or lowercase letters or apostrophes
187    // [^[:alpha:]\'] = do not match any uppercase or lowercase letters or apostrophes
188    // [^\p{L}\'] = do not match any unicode uppercase or lowercase letters or apostrophes
189    // Let's go with the unicode one in hopes that it works on more than just ascii characters
190    let regex_replace = Regex::new(r"[^\p{L}\p{N}\']").expect("regular expression error");
191    let v_span = v.span();
192
193    match v {
194        Value::Error { error, .. } => Value::error(*error.clone(), v_span),
195        v => {
196            let v_span = v.span();
197            if let Ok(s) = v.coerce_str() {
198                // let splits = s.unicode_words();
199                // let words = trim_to_words(s);
200                // let words: Vec<&str> = s.split_whitespace().collect();
201
202                let replaced_string = regex_replace.replace_all(&s, " ").to_string();
203                let words = replaced_string
204                    .split(' ')
205                    .filter_map(|s| {
206                        if s.trim() != "" {
207                            if let Some(len) = word_length {
208                                if if graphemes {
209                                    s.graphemes(true).count()
210                                } else {
211                                    s.len()
212                                } >= len
213                                {
214                                    Some(Value::string(s, v_span))
215                                } else {
216                                    None
217                                }
218                            } else {
219                                Some(Value::string(s, v_span))
220                            }
221                        } else {
222                            None
223                        }
224                    })
225                    .collect::<Vec<Value>>();
226                Value::list(words, v_span)
227            } else {
228                Value::error(
229                    ShellError::OnlySupportsThisInputType {
230                        exp_input_type: "string".into(),
231                        wrong_type: v.get_type().to_string(),
232                        dst_span: span,
233                        src_span: v_span,
234                    },
235                    v_span,
236                )
237            }
238        }
239    }
240}
241
242// original at least 1 char long
243// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{1,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
244// benchmark INCLUDING DOWNLOAD: 1sec 253ms 91µs 511ns
245//    1839 the
246//     942 and
247//     811 to
248//     695 a
249//     638 of
250//     610 it
251//     553 she
252//     546 i
253//     486 you
254//     462 said
255
256// original at least 2 chars long
257// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{2,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
258//    1839 the
259//     942 and
260//     811 to
261//     638 of
262//     610 it
263//     553 she
264//     486 you
265//     462 said
266//     435 in
267//     403 alice
268
269// regex means, replace everything that is not A-Z or a-z or ' with a space
270// ❯ $contents | str replace "[^A-Za-z\']" " " -a | split row ' ' | where ($it | str length) > 1 | uniq -i -c | sort-by count --reverse | first 10
271// benchmark: 1sec 775ms 471µs 600ns
272// ╭───┬───────┬───────╮
273// │ # │ value │ count │
274// ├───┼───────┼───────┤
275// │ 0 │ the   │  1839 │
276// │ 1 │ and   │   942 │
277// │ 2 │ to    │   811 │
278// │ 3 │ of    │   638 │
279// │ 4 │ it    │   610 │
280// │ 5 │ she   │   553 │
281// │ 6 │ you   │   486 │
282// │ 7 │ said  │   462 │
283// │ 8 │ in    │   435 │
284// │ 9 │ alice │   403 │
285// ╰───┴───────┴───────╯
286
287// $alice |str replace "[^A-Za-z\']" " " -a | split row ' ' | uniq -i -c | sort-by count --reverse | first 10
288// benchmark: 1sec 518ms 701µs 200ns
289// ╭───┬───────┬───────╮
290// │ # │ value │ count │
291// ├───┼───────┼───────┤
292// │ 0 │ the   │  1839 │
293// │ 1 │ and   │   942 │
294// │ 2 │ to    │   811 │
295// │ 3 │ a     │   695 │
296// │ 4 │ of    │   638 │
297// │ 5 │ it    │   610 │
298// │ 6 │ she   │   553 │
299// │ 7 │ i     │   546 │
300// │ 8 │ you   │   486 │
301// │ 9 │ said  │   462 │
302// ├───┼───────┼───────┤
303// │ # │ value │ count │
304// ╰───┴───────┴───────╯
305
306// s.unicode_words()
307// $alice | str downcase | split words | sort | uniq -c | sort-by count | reverse | first 10
308// benchmark: 4sec 965ms 285µs 800ns
309// ╭───┬───────┬───────╮
310// │ # │ value │ count │
311// ├───┼───────┼───────┤
312// │ 0 │ the   │  1839 │
313// │ 1 │ and   │   941 │
314// │ 2 │ to    │   811 │
315// │ 3 │ a     │   695 │
316// │ 4 │ of    │   638 │
317// │ 5 │ it    │   542 │
318// │ 6 │ she   │   538 │
319// │ 7 │ said  │   460 │
320// │ 8 │ in    │   434 │
321// │ 9 │ you   │   426 │
322// ├───┼───────┼───────┤
323// │ # │ value │ count │
324// ╰───┴───────┴───────╯
325
326// trim_to_words
327// benchmark: 5sec 992ms 76µs 200ns
328// ╭───┬───────┬───────╮
329// │ # │ value │ count │
330// ├───┼───────┼───────┤
331// │ 0 │ the   │  1829 │
332// │ 1 │ and   │   918 │
333// │ 2 │ to    │   801 │
334// │ 3 │ a     │   689 │
335// │ 4 │ of    │   632 │
336// │ 5 │ she   │   537 │
337// │ 6 │ it    │   493 │
338// │ 7 │ said  │   457 │
339// │ 8 │ in    │   430 │
340// │ 9 │ you   │   413 │
341// ├───┼───────┼───────┤
342// │ # │ value │ count │
343// ╰───┴───────┴───────╯
344
345// fn trim_to_words(content: String) -> std::vec::Vec<std::string::String> {
346//     let content: Vec<String> = content
347//         .to_lowercase()
348//         .replace(&['-'][..], " ")
349//         //should 's be replaced?
350//         .replace("'s", "")
351//         .replace(
352//             &[
353//                 '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '-', '_', '/', '\'',
354//                 '’', '?', '!', '“', '‘',
355//             ][..],
356//             "",
357//         )
358//         .split_whitespace()
359//         .map(String::from)
360//         .collect::<Vec<String>>();
361//     content
362// }
363
364// split_whitespace()
365// benchmark: 9sec 379ms 790µs 900ns
366// ╭───┬───────┬───────╮
367// │ # │ value │ count │
368// ├───┼───────┼───────┤
369// │ 0 │ the   │  1683 │
370// │ 1 │ and   │   783 │
371// │ 2 │ to    │   778 │
372// │ 3 │ a     │   667 │
373// │ 4 │ of    │   605 │
374// │ 5 │ she   │   485 │
375// │ 6 │ said  │   416 │
376// │ 7 │ in    │   406 │
377// │ 8 │ it    │   357 │
378// │ 9 │ was   │   329 │
379// ├───┼───────┼───────┤
380// │ # │ value │ count │
381// ╰───┴───────┴───────╯
382
383// current
384// $alice | str downcase | split words | uniq -c | sort-by count --reverse | first 10
385// benchmark: 1sec 481ms 604µs 700ns
386// ╭───┬───────┬───────╮
387// │ # │ value │ count │
388// ├───┼───────┼───────┤
389// │ 0 │ the   │  1839 │
390// │ 1 │ and   │   942 │
391// │ 2 │ to    │   811 │
392// │ 3 │ a     │   695 │
393// │ 4 │ of    │   638 │
394// │ 5 │ it    │   610 │
395// │ 6 │ she   │   553 │
396// │ 7 │ i     │   546 │
397// │ 8 │ you   │   486 │
398// │ 9 │ said  │   462 │
399// ├───┼───────┼───────┤
400// │ # │ value │ count │
401// ╰───┴───────┴───────╯
402
#[cfg(test)]
mod test {
    use super::*;
    use nu_test_support::nu;

    /// `-b` and `-g` together are rejected even when `-l` is present.
    #[test]
    fn test_incompat_flags() {
        let outcome = nu!("'a' | split words -bg -l 2");
        assert!(outcome.err.contains("incompatible_parameters"));
    }

    /// A length-unit flag without `-l` is rejected.
    #[test]
    fn test_incompat_flags_2() {
        let outcome = nu!("'a' | split words -g");
        assert!(outcome.err.contains("incompatible_parameters"));
    }

    /// Runs the examples declared by the command.
    #[test]
    fn test_examples() {
        use crate::test_examples;

        test_examples(SplitWords)
    }

    /// Digits stay attached to the letters of the same word.
    #[test]
    fn mixed_letter_number() {
        let outcome = nu!(r#"echo "a1 b2 c3" | split words | str join ','"#);
        assert_eq!(outcome.out, "a1,b2,c3");
    }
}