nu_command/strings/str_/
stats.rs

1use fancy_regex::Regex;
2use nu_engine::command_prelude::*;
3
4use std::collections::BTreeMap;
5use std::{fmt, str};
6use unicode_segmentation::UnicodeSegmentation;
7
8// borrowed liberally from here https://github.com/dead10ck/uwc
9pub type Counted = BTreeMap<Counter, usize>;
10
/// The `str stats` command: reports line, word, byte, char, grapheme and
/// display-width counts for a string.
#[derive(Clone)]
pub struct StrStats;
13
14impl Command for StrStats {
15    fn name(&self) -> &str {
16        "str stats"
17    }
18
19    fn signature(&self) -> Signature {
20        Signature::build("str stats")
21            .category(Category::Strings)
22            .input_output_types(vec![(Type::String, Type::record())])
23    }
24
25    fn description(&self) -> &str {
26        "Gather word count statistics on the text."
27    }
28
29    fn search_terms(&self) -> Vec<&str> {
30        vec!["count", "word", "character", "unicode", "wc"]
31    }
32
33    fn is_const(&self) -> bool {
34        true
35    }
36
37    fn run(
38        &self,
39        engine_state: &EngineState,
40        _stack: &mut Stack,
41        call: &Call,
42        input: PipelineData,
43    ) -> Result<PipelineData, ShellError> {
44        stats(engine_state, call, input)
45    }
46
47    fn run_const(
48        &self,
49        working_set: &StateWorkingSet,
50        call: &Call,
51        input: PipelineData,
52    ) -> Result<PipelineData, ShellError> {
53        stats(working_set.permanent(), call, input)
54    }
55
56    fn examples(&self) -> Vec<Example<'_>> {
57        vec![
58            Example {
59                description: "Count the number of words in a string",
60                example: r#""There are seven words in this sentence" | str stats"#,
61                result: Some(Value::test_record(record! {
62                        "lines" =>     Value::test_int(1),
63                        "words" =>     Value::test_int(7),
64                        "bytes" =>     Value::test_int(38),
65                        "chars" =>     Value::test_int(38),
66                        "graphemes" => Value::test_int(38),
67                        "unicode-width" => Value::test_int(38),
68                })),
69            },
70            Example {
71                description: "Counts unicode characters",
72                example: r#"'今天天气真好' | str stats"#,
73                result: Some(Value::test_record(record! {
74                        "lines" =>     Value::test_int(1),
75                        "words" =>     Value::test_int(6),
76                        "bytes" =>     Value::test_int(18),
77                        "chars" =>     Value::test_int(6),
78                        "graphemes" => Value::test_int(6),
79                        "unicode-width" => Value::test_int(12),
80                })),
81            },
82            Example {
83                description: "Counts Unicode characters correctly in a string",
84                example: r#""Amélie Amelie" | str stats"#,
85                result: Some(Value::test_record(record! {
86                        "lines" =>     Value::test_int(1),
87                        "words" =>     Value::test_int(2),
88                        "bytes" =>     Value::test_int(15),
89                        "chars" =>     Value::test_int(14),
90                        "graphemes" => Value::test_int(13),
91                        "unicode-width" => Value::test_int(13),
92                })),
93            },
94        ]
95    }
96}
97
98fn stats(
99    engine_state: &EngineState,
100    call: &Call,
101    input: PipelineData,
102) -> Result<PipelineData, ShellError> {
103    let span = call.head;
104    // This doesn't match explicit nulls
105    if let PipelineData::Empty = input {
106        return Err(ShellError::PipelineEmpty { dst_span: span });
107    }
108    input.map(
109        move |v| {
110            let value_span = v.span();
111            let type_ = v.get_type();
112            // First, obtain the span. If this fails, propagate the error that results.
113            if let Value::Error { error, .. } = v {
114                return Value::error(*error, span);
115            }
116            // Now, check if it's a string.
117            match v.coerce_into_string() {
118                Ok(s) => counter(&s, span),
119                Err(_) => Value::error(
120                    ShellError::OnlySupportsThisInputType {
121                        exp_input_type: "string".into(),
122                        wrong_type: type_.to_string(),
123                        dst_span: span,
124                        src_span: value_span,
125                    },
126                    span,
127                ),
128            }
129        },
130        engine_state.signals(),
131    )
132}
133
134fn counter(contents: &str, span: Span) -> Value {
135    let counts = uwc_count(&ALL_COUNTERS[..], contents);
136
137    fn get_count(counts: &BTreeMap<Counter, usize>, counter: Counter, span: Span) -> Value {
138        Value::int(counts.get(&counter).copied().unwrap_or(0) as i64, span)
139    }
140
141    let record = record! {
142        "lines" => get_count(&counts, Counter::Lines, span),
143        "words" => get_count(&counts, Counter::Words, span),
144        "bytes" => get_count(&counts, Counter::Bytes, span),
145        "chars" => get_count(&counts, Counter::CodePoints, span),
146        "graphemes" => get_count(&counts, Counter::GraphemeClusters, span),
147        "unicode-width" => get_count(&counts, Counter::UnicodeWidth, span),
148    };
149
150    Value::record(record, span)
151}
152
153// /// Take all the counts in `other_counts` and sum them into `accum`.
154// pub fn sum_counts(accum: &mut Counted, other_counts: &Counted) {
155//     for (counter, count) in other_counts {
156//         let entry = accum.entry(*counter).or_insert(0);
157//         *entry += count;
158//     }
159// }
160
161// /// Sums all the `Counted` instances into a new one.
162// pub fn sum_all_counts<'a, I>(counts: I) -> Counted
163// where
164//     I: IntoIterator<Item = &'a Counted>,
165// {
166//     let mut totals = BTreeMap::new();
167//     for counts in counts {
168//         sum_counts(&mut totals, counts);
169//     }
170//     totals
171// }
172
/// Implemented by anything that can measure some quantity of a `&str`.
pub trait Count {
    /// Returns how many of this counter's units occur in `s`.
    fn count(&self, s: &str) -> usize;
}
178
179impl Count for Counter {
180    fn count(&self, s: &str) -> usize {
181        match *self {
182            Counter::GraphemeClusters => s.graphemes(true).count(),
183            Counter::Bytes => s.len(),
184            Counter::Lines => {
185                const LF: &str = "\n"; // 0xe0000a
186                const CR: &str = "\r"; // 0xe0000d
187                const CRLF: &str = "\r\n"; // 0xe00d0a
188                const NEL: &str = "\u{0085}"; // 0x00c285
189                const FF: &str = "\u{000C}"; // 0x00000c
190                const LS: &str = "\u{2028}"; // 0xe280a8
191                const PS: &str = "\u{2029}"; // 0xe280a9
192
193                // use regex here because it can search for CRLF first and not duplicate the count
194                let line_ending_types = [CRLF, LF, CR, NEL, FF, LS, PS];
195                let pattern = &line_ending_types.join("|");
196                let newline_pattern = Regex::new(pattern).expect("Unable to create regex");
197                let line_endings = newline_pattern
198                    .find_iter(s)
199                    .map(|f| match f {
200                        Ok(mat) => mat.as_str().to_string(),
201                        Err(_) => "".to_string(),
202                    })
203                    .collect::<Vec<String>>();
204
205                let has_line_ending_suffix =
206                    line_ending_types.iter().any(|&suffix| s.ends_with(suffix));
207                // eprintln!("suffix = {}", has_line_ending_suffix);
208
209                if has_line_ending_suffix {
210                    line_endings.len()
211                } else {
212                    line_endings.len() + 1
213                }
214            }
215            Counter::Words => s.unicode_words().count(),
216            Counter::CodePoints => s.chars().count(),
217            Counter::UnicodeWidth => unicode_width::UnicodeWidthStr::width(s),
218        }
219    }
220}
221
/// The quantities that can be measured for a piece of text.
///
/// The declaration order determines the derived ordering, and therefore the
/// key order inside a `BTreeMap` keyed by `Counter`.
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)]
pub enum Counter {
    /// Number of lines.
    Lines,

    /// Number of words.
    Words,

    /// Total number of bytes.
    Bytes,

    /// Number of grapheme clusters. The input is required to be valid UTF-8.
    GraphemeClusters,

    /// Number of Unicode code points.
    CodePoints,

    /// Width of the string.
    UnicodeWidth,
}
243
244/// A convenience array of all counter types.
245pub const ALL_COUNTERS: [Counter; 6] = [
246    Counter::GraphemeClusters,
247    Counter::Bytes,
248    Counter::Lines,
249    Counter::Words,
250    Counter::CodePoints,
251    Counter::UnicodeWidth,
252];
253
254impl fmt::Display for Counter {
255    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
256        let s = match *self {
257            Counter::GraphemeClusters => "graphemes",
258            Counter::Bytes => "bytes",
259            Counter::Lines => "lines",
260            Counter::Words => "words",
261            Counter::CodePoints => "codepoints",
262            Counter::UnicodeWidth => "unicode-width",
263        };
264
265        write!(f, "{s}")
266    }
267}
268
269/// Counts the given `Counter`s in the given `&str`.
270pub fn uwc_count<'a, I>(counters: I, s: &str) -> Counted
271where
272    I: IntoIterator<Item = &'a Counter>,
273{
274    let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect();
275    if let Some(lines) = counts.get_mut(&Counter::Lines) {
276        if s.is_empty() {
277            // If s is empty, indeed, the count is 0
278            *lines = 0;
279        } else if *lines == 0 && !s.is_empty() {
280            // If s is not empty and the count is 0, it means there
281            // is a line without a line ending, so let's make it 1
282            *lines = 1;
283        } else {
284            // no change, whatever the count is, is right
285        }
286    }
287    counts
288}
289
#[cfg(test)]
mod test {
    use super::*;

    /// Runs every documented example of `str stats` and checks its result.
    #[test]
    fn test_examples() {
        crate::test_examples(StrStats {})
    }
}
301
/// A lone newline is one line with no words and a single unit of everything else.
#[test]
fn test_one_newline() {
    let text = String::from("\n");
    let counts = uwc_count(&ALL_COUNTERS[..], &text);

    let correct_counts: Counted = BTreeMap::from([
        (Counter::Lines, 1),
        (Counter::Words, 0),
        (Counter::GraphemeClusters, 1),
        (Counter::Bytes, 1),
        (Counter::CodePoints, 1),
        (Counter::UnicodeWidth, 1),
    ]);

    assert_eq!(correct_counts, counts);
}
316
/// Exercises every recognised line ending in a single string.
#[test]
fn test_count_counts_lines() {
    // LF, CR and CRLF are written inline in the literals below.
    const NEL: &str = "\u{0085}"; // 2 bytes
    const FF: &str = "\u{000C}"; // 1 byte
    const LS: &str = "\u{2028}"; // 3 bytes
    const PS: &str = "\u{2029}"; // 3 bytes

    // * \r\n is a single grapheme cluster
    // * trailing newlines are counted
    let text = ["foo\r\nbar\n\nbaz", NEL, "quux", FF, LS, "xi", PS, "\n"].concat();

    let counts = uwc_count(&ALL_COUNTERS[..], &text);

    let correct_counts: Counted = BTreeMap::from([
        (Counter::Lines, 8),
        (Counter::Words, 5),
        (Counter::GraphemeClusters, 23),
        (Counter::Bytes, 29),
        // one more than grapheme clusters because of \r\n
        (Counter::CodePoints, 24),
        (Counter::UnicodeWidth, 23),
    ]);

    assert_eq!(correct_counts, counts);
}
356
/// Word counting on non-ASCII (Greek) text.
#[test]
fn test_count_counts_words() {
    let i_can_eat_glass = "Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.";

    let counts = uwc_count(&ALL_COUNTERS[..], i_can_eat_glass);

    let correct_counts: Counted = BTreeMap::from([
        (Counter::GraphemeClusters, 50),
        (Counter::Lines, 1),
        (Counter::Bytes, i_can_eat_glass.len()),
        (Counter::Words, 9),
        (Counter::CodePoints, 50),
        (Counter::UnicodeWidth, 50),
    ]);

    assert_eq!(correct_counts, counts);
}
374
/// Code point counting must distinguish precomposed from decomposed text.
#[test]
fn test_count_counts_codepoints() {
    // These render identically but are NOT the same string: `one` is the
    // single precomposed code point U+00E9, `two` is "e" followed by the
    // combining acute accent U+0301. Escapes are used instead of literal
    // characters so the difference survives any editor or tool that applies
    // Unicode normalization to the source file.
    let one = "\u{00e9}";
    let two = "e\u{0301}";

    let counters = [Counter::CodePoints];

    let counts = uwc_count(&counters[..], one);
    let correct_counts: Counted = BTreeMap::from([(Counter::CodePoints, 1)]);
    assert_eq!(correct_counts, counts);

    let counts = uwc_count(&counters[..], two);
    let correct_counts: Counted = BTreeMap::from([(Counter::CodePoints, 2)]);
    assert_eq!(correct_counts, counts);
}
396}