nu_command/strings/split/
words.rs

1use crate::{grapheme_flags, grapheme_flags_const};
2use fancy_regex::Regex;
3use nu_engine::command_prelude::*;
4
5use unicode_segmentation::UnicodeSegmentation;
6
7#[derive(Clone)]
8pub struct SplitWords;
9
10impl Command for SplitWords {
11    fn name(&self) -> &str {
12        "split words"
13    }
14
15    fn signature(&self) -> Signature {
16        Signature::build("split words")
17            .input_output_types(vec![
18                (Type::String, Type::List(Box::new(Type::String))),
19                (
20                    Type::List(Box::new(Type::String)),
21                    Type::List(Box::new(Type::List(Box::new(Type::String))))
22                ),
23            ])
24            .allow_variants_without_examples(true)
25            .category(Category::Strings)
26            // .switch(
27            //     "ignore-hyphenated",
28            //     "ignore hyphenated words, splitting at the hyphen",
29            //     Some('i'),
30            // )
31            // .switch(
32            //     "ignore-apostrophes",
33            //     "ignore apostrophes in words by removing them",
34            //     Some('a'),
35            // )
36            // .switch(
37            //     "ignore-punctuation",
38            //     "ignore punctuation around words by removing them",
39            //     Some('p'),
40            // )
41            .named(
42                "min-word-length",
43                SyntaxShape::Int,
44                "The minimum word length",
45                Some('l'),
46            )
47            .switch(
48                "grapheme-clusters",
49                "measure word length in grapheme clusters (requires -l)",
50                Some('g'),
51            )
52            .switch(
53                "utf-8-bytes",
54                "measure word length in UTF-8 bytes (default; requires -l; non-ASCII chars are length 2+)",
55                Some('b'),
56            )
57    }
58
59    fn description(&self) -> &str {
60        "Split a string's words into separate rows."
61    }
62
63    fn search_terms(&self) -> Vec<&str> {
64        vec!["separate", "divide"]
65    }
66
67    fn examples(&self) -> Vec<Example<'_>> {
68        vec![
69            Example {
70                description: "Split the string's words into separate rows",
71                example: "'hello world' | split words",
72                result: Some(Value::list(
73                    vec![Value::test_string("hello"), Value::test_string("world")],
74                    Span::test_data(),
75                )),
76            },
77            Example {
78                description: "Split the string's words, of at least 3 characters, into separate rows",
79                example: "'hello to the world' | split words --min-word-length 3",
80                result: Some(Value::list(
81                    vec![
82                        Value::test_string("hello"),
83                        Value::test_string("the"),
84                        Value::test_string("world"),
85                    ],
86                    Span::test_data(),
87                )),
88            },
89            Example {
90                description: "A real-world example of splitting words",
91                example: "http get https://www.gutenberg.org/files/11/11-0.txt | str downcase | split words --min-word-length 2 | uniq --count | sort-by count --reverse | first 10",
92                result: None,
93            },
94        ]
95    }
96
97    fn is_const(&self) -> bool {
98        true
99    }
100
101    fn run(
102        &self,
103        engine_state: &EngineState,
104        stack: &mut Stack,
105        call: &Call,
106        input: PipelineData,
107    ) -> Result<PipelineData, ShellError> {
108        let word_length: Option<usize> = call.get_flag(engine_state, stack, "min-word-length")?;
109        let has_grapheme = call.has_flag(engine_state, stack, "grapheme-clusters")?;
110        let has_utf8 = call.has_flag(engine_state, stack, "utf-8-bytes")?;
111        let graphemes = grapheme_flags(engine_state, stack, call)?;
112
113        let args = Arguments {
114            word_length,
115            has_grapheme,
116            has_utf8,
117            graphemes,
118        };
119        split_words(engine_state, call, input, args)
120    }
121
122    fn run_const(
123        &self,
124        working_set: &StateWorkingSet,
125        call: &Call,
126        input: PipelineData,
127    ) -> Result<PipelineData, ShellError> {
128        let word_length: Option<usize> = call.get_flag_const(working_set, "min-word-length")?;
129        let has_grapheme = call.has_flag_const(working_set, "grapheme-clusters")?;
130        let has_utf8 = call.has_flag_const(working_set, "utf-8-bytes")?;
131        let graphemes = grapheme_flags_const(working_set, call)?;
132
133        let args = Arguments {
134            word_length,
135            has_grapheme,
136            has_utf8,
137            graphemes,
138        };
139        split_words(working_set.permanent(), call, input, args)
140    }
141}
142
/// Flag values collected by `run` / `run_const` and passed on to `split_words`.
struct Arguments {
    /// Value of `--min-word-length` (`-l`), if given; shorter words are dropped.
    word_length: Option<usize>,
    /// Whether `--grapheme-clusters` (`-g`) was passed; only checked to reject
    /// its use without `-l`.
    has_grapheme: bool,
    /// Whether `--utf-8-bytes` (`-b`) was passed; only checked to reject its
    /// use without `-l`.
    has_utf8: bool,
    /// Result of `grapheme_flags`; when `true`, word length is counted in
    /// grapheme clusters instead of UTF-8 bytes.
    graphemes: bool,
}
149
150fn split_words(
151    engine_state: &EngineState,
152    call: &Call,
153    input: PipelineData,
154    args: Arguments,
155) -> Result<PipelineData, ShellError> {
156    let span = call.head;
157    // let ignore_hyphenated = call.has_flag(engine_state, stack, "ignore-hyphenated")?;
158    // let ignore_apostrophes = call.has_flag(engine_state, stack, "ignore-apostrophes")?;
159    // let ignore_punctuation = call.has_flag(engine_state, stack, "ignore-punctuation")?;
160
161    if args.word_length.is_none() {
162        if args.has_grapheme {
163            return Err(ShellError::IncompatibleParametersSingle {
164                msg: "--grapheme-clusters (-g) requires --min-word-length (-l)".to_string(),
165                span,
166            });
167        }
168        if args.has_utf8 {
169            return Err(ShellError::IncompatibleParametersSingle {
170                msg: "--utf-8-bytes (-b) requires --min-word-length (-l)".to_string(),
171                span,
172            });
173        }
174    }
175
176    input.map(
177        move |x| split_words_helper(&x, args.word_length, span, args.graphemes),
178        engine_state.signals(),
179    )
180}
181
182fn split_words_helper(v: &Value, word_length: Option<usize>, span: Span, graphemes: bool) -> Value {
183    // There are some options here with this regex.
184    // [^A-Za-z\'] = do not match uppercase or lowercase letters or apostrophes
185    // [^[:alpha:]\'] = do not match any uppercase or lowercase letters or apostrophes
186    // [^\p{L}\'] = do not match any unicode uppercase or lowercase letters or apostrophes
187    // Let's go with the unicode one in hopes that it works on more than just ascii characters
188    let regex_replace = Regex::new(r"[^\p{L}\p{N}\']").expect("regular expression error");
189    let v_span = v.span();
190
191    match v {
192        Value::Error { error, .. } => Value::error(*error.clone(), v_span),
193        v => {
194            let v_span = v.span();
195            if let Ok(s) = v.as_str() {
196                // let splits = s.unicode_words();
197                // let words = trim_to_words(s);
198                // let words: Vec<&str> = s.split_whitespace().collect();
199
200                let replaced_string = regex_replace.replace_all(s, " ").to_string();
201                let words = replaced_string
202                    .split(' ')
203                    .filter_map(|s| {
204                        if s.trim() != "" {
205                            if let Some(len) = word_length {
206                                if if graphemes {
207                                    s.graphemes(true).count()
208                                } else {
209                                    s.len()
210                                } >= len
211                                {
212                                    Some(Value::string(s, v_span))
213                                } else {
214                                    None
215                                }
216                            } else {
217                                Some(Value::string(s, v_span))
218                            }
219                        } else {
220                            None
221                        }
222                    })
223                    .collect::<Vec<Value>>();
224                Value::list(words, v_span)
225            } else {
226                Value::error(
227                    ShellError::OnlySupportsThisInputType {
228                        exp_input_type: "string".into(),
229                        wrong_type: v.get_type().to_string(),
230                        dst_span: span,
231                        src_span: v_span,
232                    },
233                    v_span,
234                )
235            }
236        }
237    }
238}
239
240// original at least 1 char long
241// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{1,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
242// benchmark INCLUDING DOWNLOAD: 1sec 253ms 91µs 511ns
243//    1839 the
244//     942 and
245//     811 to
246//     695 a
247//     638 of
248//     610 it
249//     553 she
250//     546 i
251//     486 you
252//     462 said
253
254// original at least 2 chars long
255// curl -sL "https://www.gutenberg.org/files/11/11-0.txt" | tr '[:upper:]' '[:lower:]' | grep -oE "[a-z\']{2,}" | ^sort | ^uniq -c | ^sort -nr | head -n 10
256//    1839 the
257//     942 and
258//     811 to
259//     638 of
260//     610 it
261//     553 she
262//     486 you
263//     462 said
264//     435 in
265//     403 alice
266
267// regex means, replace everything that is not A-Z or a-z or ' with a space
268// ❯ $contents | str replace "[^A-Za-z\']" " " -a | split row ' ' | where ($it | str length) > 1 | uniq -i -c | sort-by count --reverse | first 10
269// benchmark: 1sec 775ms 471µs 600ns
270// ╭───┬───────┬───────╮
271// │ # │ value │ count │
272// ├───┼───────┼───────┤
273// │ 0 │ the   │  1839 │
274// │ 1 │ and   │   942 │
275// │ 2 │ to    │   811 │
276// │ 3 │ of    │   638 │
277// │ 4 │ it    │   610 │
278// │ 5 │ she   │   553 │
279// │ 6 │ you   │   486 │
280// │ 7 │ said  │   462 │
281// │ 8 │ in    │   435 │
282// │ 9 │ alice │   403 │
283// ╰───┴───────┴───────╯
284
285// $alice |str replace "[^A-Za-z\']" " " -a | split row ' ' | uniq -i -c | sort-by count --reverse | first 10
286// benchmark: 1sec 518ms 701µs 200ns
287// ╭───┬───────┬───────╮
288// │ # │ value │ count │
289// ├───┼───────┼───────┤
290// │ 0 │ the   │  1839 │
291// │ 1 │ and   │   942 │
292// │ 2 │ to    │   811 │
293// │ 3 │ a     │   695 │
294// │ 4 │ of    │   638 │
295// │ 5 │ it    │   610 │
296// │ 6 │ she   │   553 │
297// │ 7 │ i     │   546 │
298// │ 8 │ you   │   486 │
299// │ 9 │ said  │   462 │
300// ├───┼───────┼───────┤
301// │ # │ value │ count │
302// ╰───┴───────┴───────╯
303
304// s.unicode_words()
305// $alice | str downcase | split words | sort | uniq -c | sort-by count | reverse | first 10
306// benchmark: 4sec 965ms 285µs 800ns
307// ╭───┬───────┬───────╮
308// │ # │ value │ count │
309// ├───┼───────┼───────┤
310// │ 0 │ the   │  1839 │
311// │ 1 │ and   │   941 │
312// │ 2 │ to    │   811 │
313// │ 3 │ a     │   695 │
314// │ 4 │ of    │   638 │
315// │ 5 │ it    │   542 │
316// │ 6 │ she   │   538 │
317// │ 7 │ said  │   460 │
318// │ 8 │ in    │   434 │
319// │ 9 │ you   │   426 │
320// ├───┼───────┼───────┤
321// │ # │ value │ count │
322// ╰───┴───────┴───────╯
323
324// trim_to_words
325// benchmark: 5sec 992ms 76µs 200ns
326// ╭───┬───────┬───────╮
327// │ # │ value │ count │
328// ├───┼───────┼───────┤
329// │ 0 │ the   │  1829 │
330// │ 1 │ and   │   918 │
331// │ 2 │ to    │   801 │
332// │ 3 │ a     │   689 │
333// │ 4 │ of    │   632 │
334// │ 5 │ she   │   537 │
335// │ 6 │ it    │   493 │
336// │ 7 │ said  │   457 │
337// │ 8 │ in    │   430 │
338// │ 9 │ you   │   413 │
339// ├───┼───────┼───────┤
340// │ # │ value │ count │
341// ╰───┴───────┴───────╯
342
343// fn trim_to_words(content: String) -> std::vec::Vec<std::string::String> {
344//     let content: Vec<String> = content
345//         .to_lowercase()
346//         .replace(&['-'][..], " ")
347//         //should 's be replaced?
348//         .replace("'s", "")
349//         .replace(
350//             &[
351//                 '(', ')', ',', '\"', '.', ';', ':', '=', '[', ']', '{', '}', '-', '_', '/', '\'',
352//                 '’', '?', '!', '“', '‘',
353//             ][..],
354//             "",
355//         )
356//         .split_whitespace()
357//         .map(String::from)
358//         .collect::<Vec<String>>();
359//     content
360// }
361
362// split_whitespace()
363// benchmark: 9sec 379ms 790µs 900ns
364// ╭───┬───────┬───────╮
365// │ # │ value │ count │
366// ├───┼───────┼───────┤
367// │ 0 │ the   │  1683 │
368// │ 1 │ and   │   783 │
369// │ 2 │ to    │   778 │
370// │ 3 │ a     │   667 │
371// │ 4 │ of    │   605 │
372// │ 5 │ she   │   485 │
373// │ 6 │ said  │   416 │
374// │ 7 │ in    │   406 │
375// │ 8 │ it    │   357 │
376// │ 9 │ was   │   329 │
377// ├───┼───────┼───────┤
378// │ # │ value │ count │
379// ╰───┴───────┴───────╯
380
381// current
382// $alice | str downcase | split words | uniq -c | sort-by count --reverse | first 10
383// benchmark: 1sec 481ms 604µs 700ns
384// ╭───┬───────┬───────╮
385// │ # │ value │ count │
386// ├───┼───────┼───────┤
387// │ 0 │ the   │  1839 │
388// │ 1 │ and   │   942 │
389// │ 2 │ to    │   811 │
390// │ 3 │ a     │   695 │
391// │ 4 │ of    │   638 │
392// │ 5 │ it    │   610 │
393// │ 6 │ she   │   553 │
394// │ 7 │ i     │   546 │
395// │ 8 │ you   │   486 │
396// │ 9 │ said  │   462 │
397// ├───┼───────┼───────┤
398// │ # │ value │ count │
399// ╰───┴───────┴───────╯
400
#[cfg(test)]
mod test {
    use super::*;
    use nu_test_support::nu;

    /// Passing both `-b` and `-g` (with `-l`) must report `incompatible_parameters`.
    #[test]
    fn test_incompat_flags() {
        let outcome = nu!("'a' | split words -bg -l 2");
        assert!(outcome.err.contains("incompatible_parameters"));
    }

    /// Passing `-g` without `-l` must report `incompatible_parameters`.
    #[test]
    fn test_incompat_flags_2() {
        let outcome = nu!("'a' | split words -g");
        assert!(outcome.err.contains("incompatible_parameters"));
    }

    /// Runs the examples declared by the command.
    #[test]
    fn test_examples() {
        crate::test_examples(SplitWords {})
    }

    /// Words mixing letters and digits must stay in one piece.
    #[test]
    fn mixed_letter_number() {
        let outcome = nu!(r#"echo "a1 b2 c3" | split words | str join ','"#);
        assert_eq!(outcome.out, "a1,b2,c3");
    }
}