1use fancy_regex::Regex;
2use nu_engine::command_prelude::*;
3
4use std::collections::BTreeMap;
5use std::{fmt, str};
6use unicode_segmentation::UnicodeSegmentation;
7
/// Result of a counting pass: maps each requested [`Counter`] to the number it
/// produced for the input string.
pub type Counted = BTreeMap<Counter, usize>;
10
/// The `str stats` command, which reports line, word, byte, character,
/// grapheme and display-width counts for a string.
#[derive(Clone)]
pub struct StrStats;
13
14impl Command for StrStats {
15 fn name(&self) -> &str {
16 "str stats"
17 }
18
19 fn signature(&self) -> Signature {
20 Signature::build("str stats")
21 .category(Category::Strings)
22 .input_output_types(vec![(Type::String, Type::record())])
23 }
24
25 fn description(&self) -> &str {
26 "Gather word count statistics on the text."
27 }
28
29 fn search_terms(&self) -> Vec<&str> {
30 vec!["count", "word", "character", "unicode", "wc"]
31 }
32
33 fn is_const(&self) -> bool {
34 true
35 }
36
37 fn run(
38 &self,
39 engine_state: &EngineState,
40 _stack: &mut Stack,
41 call: &Call,
42 input: PipelineData,
43 ) -> Result<PipelineData, ShellError> {
44 stats(engine_state, call, input)
45 }
46
47 fn run_const(
48 &self,
49 working_set: &StateWorkingSet,
50 call: &Call,
51 input: PipelineData,
52 ) -> Result<PipelineData, ShellError> {
53 stats(working_set.permanent(), call, input)
54 }
55
56 fn examples(&self) -> Vec<Example<'_>> {
57 vec![
58 Example {
59 description: "Count the number of words in a string",
60 example: r#""There are seven words in this sentence" | str stats"#,
61 result: Some(Value::test_record(record! {
62 "lines" => Value::test_int(1),
63 "words" => Value::test_int(7),
64 "bytes" => Value::test_int(38),
65 "chars" => Value::test_int(38),
66 "graphemes" => Value::test_int(38),
67 "unicode-width" => Value::test_int(38),
68 })),
69 },
70 Example {
71 description: "Counts unicode characters",
72 example: r#"'今天天气真好' | str stats"#,
73 result: Some(Value::test_record(record! {
74 "lines" => Value::test_int(1),
75 "words" => Value::test_int(6),
76 "bytes" => Value::test_int(18),
77 "chars" => Value::test_int(6),
78 "graphemes" => Value::test_int(6),
79 "unicode-width" => Value::test_int(12),
80 })),
81 },
82 Example {
83 description: "Counts Unicode characters correctly in a string",
84 example: r#""Amélie Amelie" | str stats"#,
85 result: Some(Value::test_record(record! {
86 "lines" => Value::test_int(1),
87 "words" => Value::test_int(2),
88 "bytes" => Value::test_int(15),
89 "chars" => Value::test_int(14),
90 "graphemes" => Value::test_int(13),
91 "unicode-width" => Value::test_int(13),
92 })),
93 },
94 ]
95 }
96}
97
98fn stats(
99 engine_state: &EngineState,
100 call: &Call,
101 input: PipelineData,
102) -> Result<PipelineData, ShellError> {
103 let span = call.head;
104 if let PipelineData::Empty = input {
106 return Err(ShellError::PipelineEmpty { dst_span: span });
107 }
108 input.map(
109 move |v| {
110 let value_span = v.span();
111 let type_ = v.get_type();
112 if let Value::Error { error, .. } = v {
114 return Value::error(*error, span);
115 }
116 match v.coerce_into_string() {
118 Ok(s) => counter(&s, span),
119 Err(_) => Value::error(
120 ShellError::OnlySupportsThisInputType {
121 exp_input_type: "string".into(),
122 wrong_type: type_.to_string(),
123 dst_span: span,
124 src_span: value_span,
125 },
126 span,
127 ),
128 }
129 },
130 engine_state.signals(),
131 )
132}
133
134fn counter(contents: &str, span: Span) -> Value {
135 let counts = uwc_count(&ALL_COUNTERS[..], contents);
136
137 fn get_count(counts: &BTreeMap<Counter, usize>, counter: Counter, span: Span) -> Value {
138 Value::int(counts.get(&counter).copied().unwrap_or(0) as i64, span)
139 }
140
141 let record = record! {
142 "lines" => get_count(&counts, Counter::Lines, span),
143 "words" => get_count(&counts, Counter::Words, span),
144 "bytes" => get_count(&counts, Counter::Bytes, span),
145 "chars" => get_count(&counts, Counter::CodePoints, span),
146 "graphemes" => get_count(&counts, Counter::GraphemeClusters, span),
147 "unicode-width" => get_count(&counts, Counter::UnicodeWidth, span),
148 };
149
150 Value::record(record, span)
151}
152
/// Something that can compute a count over a string.
pub trait Count {
    /// Returns the number of items this counter finds in `s`.
    fn count(&self, s: &str) -> usize;
}
178
179impl Count for Counter {
180 fn count(&self, s: &str) -> usize {
181 match *self {
182 Counter::GraphemeClusters => s.graphemes(true).count(),
183 Counter::Bytes => s.len(),
184 Counter::Lines => {
185 const LF: &str = "\n"; const CR: &str = "\r"; const CRLF: &str = "\r\n"; const NEL: &str = "\u{0085}"; const FF: &str = "\u{000C}"; const LS: &str = "\u{2028}"; const PS: &str = "\u{2029}"; let line_ending_types = [CRLF, LF, CR, NEL, FF, LS, PS];
195 let pattern = &line_ending_types.join("|");
196 let newline_pattern = Regex::new(pattern).expect("Unable to create regex");
197 let line_endings = newline_pattern
198 .find_iter(s)
199 .map(|f| match f {
200 Ok(mat) => mat.as_str().to_string(),
201 Err(_) => "".to_string(),
202 })
203 .collect::<Vec<String>>();
204
205 let has_line_ending_suffix =
206 line_ending_types.iter().any(|&suffix| s.ends_with(suffix));
207 if has_line_ending_suffix {
210 line_endings.len()
211 } else {
212 line_endings.len() + 1
213 }
214 }
215 Counter::Words => s.unicode_words().count(),
216 Counter::CodePoints => s.chars().count(),
217 Counter::UnicodeWidth => unicode_width::UnicodeWidthStr::width(s),
218 }
219 }
220}
221
/// The kinds of things that can be counted in a string.
///
/// The derived `Ord` follows declaration order, which is also the key order
/// of a [`Counted`] map.
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)]
pub enum Counter {
    /// Lines, delimited by any of the recognized line terminators.
    Lines,

    /// Words, as produced by `unicode_words`.
    Words,

    /// Bytes of the string (`str::len`).
    Bytes,

    /// Extended grapheme clusters.
    GraphemeClusters,

    /// Unicode scalar values (`str::chars`).
    CodePoints,

    /// Display width as computed by the `unicode_width` crate.
    UnicodeWidth,
}
243
/// Every [`Counter`] variant, for callers that want all statistics at once.
pub const ALL_COUNTERS: [Counter; 6] = [
    Counter::GraphemeClusters,
    Counter::Bytes,
    Counter::Lines,
    Counter::Words,
    Counter::CodePoints,
    Counter::UnicodeWidth,
];
253
254impl fmt::Display for Counter {
255 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
256 let s = match *self {
257 Counter::GraphemeClusters => "graphemes",
258 Counter::Bytes => "bytes",
259 Counter::Lines => "lines",
260 Counter::Words => "words",
261 Counter::CodePoints => "codepoints",
262 Counter::UnicodeWidth => "unicode-width",
263 };
264
265 write!(f, "{s}")
266 }
267}
268
269pub fn uwc_count<'a, I>(counters: I, s: &str) -> Counted
271where
272 I: IntoIterator<Item = &'a Counter>,
273{
274 let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect();
275 if let Some(lines) = counts.get_mut(&Counter::Lines) {
276 if s.is_empty() {
277 *lines = 0;
279 } else if *lines == 0 && !s.is_empty() {
280 *lines = 1;
283 } else {
284 }
286 }
287 counts
288}
289
#[cfg(test)]
mod test {
    use super::*;

    /// Hands the command to the crate-level `test_examples` helper.
    #[test]
    fn test_examples() {
        crate::test_examples(StrStats)
    }
}
301
/// A lone newline is one line with no words.
#[test]
fn test_one_newline() {
    let counts = uwc_count(&ALL_COUNTERS[..], "\n");

    let correct_counts: Counted = BTreeMap::from([
        (Counter::Lines, 1),
        (Counter::Words, 0),
        (Counter::GraphemeClusters, 1),
        (Counter::Bytes, 1),
        (Counter::CodePoints, 1),
        (Counter::UnicodeWidth, 1),
    ]);

    assert_eq!(correct_counts, counts);
}
316
/// Exercises every recognized line terminator, including a CRLF pair and
/// terminators that directly follow one another.
#[test]
fn test_count_counts_lines() {
    const NEL: &str = "\u{0085}";
    const FF: &str = "\u{000C}";
    const LS: &str = "\u{2028}";
    const PS: &str = "\u{2029}";

    let s = ["foo\r\nbar\n\nbaz", NEL, "quux", FF, LS, "xi", PS, "\n"].concat();

    let counts = uwc_count(&ALL_COUNTERS[..], &s);

    let correct_counts: Counted = BTreeMap::from([
        (Counter::Lines, 8),
        (Counter::Words, 5),
        (Counter::GraphemeClusters, 23),
        (Counter::Bytes, 29),
        // One more than the grapheme count: "\r\n" is two code points but a
        // single grapheme cluster.
        (Counter::CodePoints, 24),
        (Counter::UnicodeWidth, 23),
    ]);

    assert_eq!(correct_counts, counts);
}
356
/// Word counting on a non-ASCII (Greek) sentence.
#[test]
fn test_count_counts_words() {
    let i_can_eat_glass = "Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.";

    let counts = uwc_count(&ALL_COUNTERS[..], i_can_eat_glass);

    let correct_counts: Counted = BTreeMap::from([
        (Counter::GraphemeClusters, 50),
        (Counter::Lines, 1),
        (Counter::Bytes, i_can_eat_glass.len()),
        (Counter::Words, 9),
        (Counter::CodePoints, 50),
        (Counter::UnicodeWidth, 50),
    ]);

    assert_eq!(correct_counts, counts);
}
374
/// Code points are counted individually, so the precomposed and decomposed
/// spellings of "é" give different results.
#[test]
fn test_count_counts_codepoints() {
    // The two strings render identically, so they are written with explicit
    // escapes: a literal "é" would silently change meaning if an editor or
    // tool normalized the file.
    // Precomposed: U+00E9 LATIN SMALL LETTER E WITH ACUTE.
    let one = "\u{00E9}";
    // Decomposed: `e` followed by U+0301 COMBINING ACUTE ACCENT.
    let two = "e\u{0301}";

    let counters = [Counter::CodePoints];

    let counts = uwc_count(&counters[..], one);

    let mut correct_counts = BTreeMap::new();
    correct_counts.insert(Counter::CodePoints, 1);

    assert_eq!(correct_counts, counts);

    let counts = uwc_count(&counters[..], two);

    let mut correct_counts = BTreeMap::new();
    correct_counts.insert(Counter::CodePoints, 2);

    assert_eq!(correct_counts, counts);
}