1use fancy_regex::Regex;
2use nu_engine::command_prelude::*;
3
4use std::collections::BTreeMap;
5use std::{fmt, str};
6use unicode_segmentation::UnicodeSegmentation;
7
/// Map from each requested [`Counter`] to the count it produced for a string.
pub type Counted = BTreeMap<Counter, usize>;
10
/// The `str stats` command: reports line, word, byte, char, grapheme and
/// unicode-width counts for string input.
#[derive(Clone)]
pub struct StrStats;
13
14impl Command for StrStats {
15 fn name(&self) -> &str {
16 "str stats"
17 }
18
19 fn signature(&self) -> Signature {
20 Signature::build("str stats")
21 .category(Category::Strings)
22 .input_output_types(vec![(Type::String, Type::record())])
23 }
24
25 fn description(&self) -> &str {
26 "Gather word count statistics on the text."
27 }
28
29 fn search_terms(&self) -> Vec<&str> {
30 vec!["count", "word", "character", "unicode", "wc"]
31 }
32
33 fn is_const(&self) -> bool {
34 true
35 }
36
37 fn run(
38 &self,
39 engine_state: &EngineState,
40 _stack: &mut Stack,
41 call: &Call,
42 input: PipelineData,
43 ) -> Result<PipelineData, ShellError> {
44 stats(engine_state, call, input)
45 }
46
47 fn run_const(
48 &self,
49 working_set: &StateWorkingSet,
50 call: &Call,
51 input: PipelineData,
52 ) -> Result<PipelineData, ShellError> {
53 stats(working_set.permanent(), call, input)
54 }
55
56 fn examples(&self) -> Vec<Example<'_>> {
57 vec![
58 Example {
59 description: "Count the number of words in a string.",
60 example: r#""There are seven words in this sentence" | str stats"#,
61 result: Some(Value::test_record(record! {
62 "lines" => Value::test_int(1),
63 "words" => Value::test_int(7),
64 "bytes" => Value::test_int(38),
65 "chars" => Value::test_int(38),
66 "graphemes" => Value::test_int(38),
67 "unicode-width" => Value::test_int(38),
68 })),
69 },
70 Example {
71 description: "Counts unicode characters.",
72 example: "'今天天气真好' | str stats",
73 result: Some(Value::test_record(record! {
74 "lines" => Value::test_int(1),
75 "words" => Value::test_int(6),
76 "bytes" => Value::test_int(18),
77 "chars" => Value::test_int(6),
78 "graphemes" => Value::test_int(6),
79 "unicode-width" => Value::test_int(12),
80 })),
81 },
82 Example {
83 description: "Counts Unicode characters correctly in a string.",
84 example: r#""Amélie Amelie" | str stats"#,
85 result: Some(Value::test_record(record! {
86 "lines" => Value::test_int(1),
87 "words" => Value::test_int(2),
88 "bytes" => Value::test_int(15),
89 "chars" => Value::test_int(14),
90 "graphemes" => Value::test_int(13),
91 "unicode-width" => Value::test_int(13),
92 })),
93 },
94 ]
95 }
96}
97
98fn stats(
99 engine_state: &EngineState,
100 call: &Call,
101 input: PipelineData,
102) -> Result<PipelineData, ShellError> {
103 let span = call.head;
104 if let PipelineData::Empty = input {
106 return Err(ShellError::PipelineEmpty { dst_span: span });
107 }
108 input.map(
109 move |v| {
110 let value_span = v.span();
111 let type_ = v.get_type();
112 if let Value::Error { error, .. } = v {
114 return Value::error(*error, span);
115 }
116 match v.coerce_into_string() {
118 Ok(s) => counter(&s, span),
119 Err(_) => Value::error(
120 ShellError::OnlySupportsThisInputType {
121 exp_input_type: "string".into(),
122 wrong_type: type_.to_string(),
123 dst_span: span,
124 src_span: value_span,
125 },
126 span,
127 ),
128 }
129 },
130 engine_state.signals(),
131 )
132}
133
134fn counter(contents: &str, span: Span) -> Value {
135 let counts = uwc_count(&ALL_COUNTERS[..], contents);
136
137 fn get_count(counts: &BTreeMap<Counter, usize>, counter: Counter, span: Span) -> Value {
138 Value::int(counts.get(&counter).copied().unwrap_or(0) as i64, span)
139 }
140
141 let record = record! {
142 "lines" => get_count(&counts, Counter::Lines, span),
143 "words" => get_count(&counts, Counter::Words, span),
144 "bytes" => get_count(&counts, Counter::Bytes, span),
145 "chars" => get_count(&counts, Counter::CodePoints, span),
146 "graphemes" => get_count(&counts, Counter::GraphemeClusters, span),
147 "unicode-width" => get_count(&counts, Counter::UnicodeWidth, span),
148 };
149
150 Value::record(record, span)
151}
152
/// Something that can produce a count from a string slice.
pub trait Count {
    /// Returns this counter's count for `s`.
    fn count(&self, s: &str) -> usize;
}
178
179impl Count for Counter {
180 fn count(&self, s: &str) -> usize {
181 match *self {
182 Counter::GraphemeClusters => s.graphemes(true).count(),
183 Counter::Bytes => s.len(),
184 Counter::Lines => {
185 const LF: &str = "\n"; const CR: &str = "\r"; const CRLF: &str = "\r\n"; const NEL: &str = "\u{0085}"; const FF: &str = "\u{000C}"; const LS: &str = "\u{2028}"; const PS: &str = "\u{2029}"; let line_ending_types = [CRLF, LF, CR, NEL, FF, LS, PS];
195 let pattern = &line_ending_types.join("|");
196 let newline_pattern = Regex::new(pattern).expect("Unable to create regex");
197 let line_endings = newline_pattern
198 .find_iter(s)
199 .map(|f| match f {
200 Ok(mat) => mat.as_str().to_string(),
201 Err(_) => "".to_string(),
202 })
203 .collect::<Vec<String>>();
204
205 let has_line_ending_suffix =
206 line_ending_types.iter().any(|&suffix| s.ends_with(suffix));
207 if has_line_ending_suffix {
210 line_endings.len()
211 } else {
212 line_endings.len() + 1
213 }
214 }
215 Counter::Words => s.unicode_words().count(),
216 Counter::CodePoints => s.chars().count(),
217 Counter::UnicodeWidth => unicode_width::UnicodeWidthStr::width(s),
218 }
219 }
220}
221
/// The kinds of things `str stats` can count in a string.
#[derive(Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Copy, Clone)]
pub enum Counter {
    /// Lines, delimited by CRLF, LF, CR, NEL, FF, LS or PS.
    Lines,

    /// Words, as yielded by `unicode_words`.
    Words,

    /// Bytes (`str::len`).
    Bytes,

    /// Extended grapheme clusters.
    GraphemeClusters,

    /// Unicode code points (`char`s).
    CodePoints,

    /// Width as reported by `unicode_width::UnicodeWidthStr::width`.
    UnicodeWidth,
}
243
/// Every [`Counter`] variant; pass a slice of this to [`uwc_count`] to
/// compute all statistics at once.
pub const ALL_COUNTERS: [Counter; 6] = [
    Counter::GraphemeClusters,
    Counter::Bytes,
    Counter::Lines,
    Counter::Words,
    Counter::CodePoints,
    Counter::UnicodeWidth,
];
253
254impl fmt::Display for Counter {
255 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
256 let s = match *self {
257 Counter::GraphemeClusters => "graphemes",
258 Counter::Bytes => "bytes",
259 Counter::Lines => "lines",
260 Counter::Words => "words",
261 Counter::CodePoints => "codepoints",
262 Counter::UnicodeWidth => "unicode-width",
263 };
264
265 write!(f, "{s}")
266 }
267}
268
269pub fn uwc_count<'a, I>(counters: I, s: &str) -> Counted
271where
272 I: IntoIterator<Item = &'a Counter>,
273{
274 let mut counts: Counted = counters.into_iter().map(|c| (*c, c.count(s))).collect();
275 if let Some(lines) = counts.get_mut(&Counter::Lines) {
276 if s.is_empty() {
277 *lines = 0;
279 } else if *lines == 0 && !s.is_empty() {
280 *lines = 1;
283 } else {
284 }
286 }
287 counts
288}
289
#[cfg(test)]
mod test {
    use super::*;

    /// Hands `StrStats` to the `nu_test_support` example runner.
    #[test]
    fn test_examples() -> nu_test_support::Result {
        nu_test_support::test().examples(StrStats)
    }
}
299
/// A lone "\n" is one line, zero words, and one of everything else.
#[test]
fn test_one_newline() {
    let expected = Counted::from([
        (Counter::Lines, 1),
        (Counter::Words, 0),
        (Counter::GraphemeClusters, 1),
        (Counter::Bytes, 1),
        (Counter::CodePoints, 1),
        (Counter::UnicodeWidth, 1),
    ]);

    let actual = uwc_count(&ALL_COUNTERS[..], "\n");

    assert_eq!(expected, actual);
}
314
/// Exercises every recognised line terminator in a single input.
#[test]
fn test_count_counts_lines() {
    const NEL: &str = "\u{0085}";
    const FF: &str = "\u{000C}";
    const LS: &str = "\u{2028}";
    const PS: &str = "\u{2029}";

    let text = [
        "foo\r\nbar\n\nbaz",
        NEL,
        "quux",
        FF,
        LS,
        "xi",
        PS,
        "\n",
    ]
    .concat();

    let expected = Counted::from([
        (Counter::Lines, 8),
        (Counter::Words, 5),
        (Counter::GraphemeClusters, 23),
        (Counter::Bytes, 29),
        // One more code point than graphemes: "\r\n" is two code points.
        (Counter::CodePoints, 24),
        (Counter::UnicodeWidth, 23),
    ]);

    let actual = uwc_count(&ALL_COUNTERS[..], &text);

    assert_eq!(expected, actual);
}
354
/// Word counting on a non-ASCII (Greek) sentence.
#[test]
fn test_count_counts_words() {
    let i_can_eat_glass = "Μπορῶ νὰ φάω σπασμένα γυαλιὰ χωρὶς νὰ πάθω τίποτα.";

    let expected = Counted::from([
        (Counter::GraphemeClusters, 50),
        (Counter::Lines, 1),
        (Counter::Bytes, i_can_eat_glass.len()),
        (Counter::Words, 9),
        (Counter::CodePoints, 50),
        (Counter::UnicodeWidth, 50),
    ]);

    let actual = uwc_count(&ALL_COUNTERS[..], i_can_eat_glass);

    assert_eq!(expected, actual);
}
372
/// Code-point counting must distinguish the precomposed and decomposed
/// spellings of "é".
#[test]
fn test_count_counts_codepoints() {
    // Both strings render as "é" but are NOT the same sequence. They are
    // written as escapes because the literal forms are visually
    // indistinguishable and would not survive Unicode normalization of this
    // file.
    // U+00E9 LATIN SMALL LETTER E WITH ACUTE: a single code point.
    let one = "\u{00E9}";
    // 'e' followed by U+0301 COMBINING ACUTE ACCENT: two code points.
    let two = "e\u{0301}";

    let counters = [Counter::CodePoints];

    let counts = uwc_count(&counters[..], one);

    let mut correct_counts = BTreeMap::new();
    correct_counts.insert(Counter::CodePoints, 1);

    assert_eq!(correct_counts, counts);

    let counts = uwc_count(&counters[..], two);

    let mut correct_counts = BTreeMap::new();
    correct_counts.insert(Counter::CodePoints, 2);

    assert_eq!(correct_counts, counts);
}