words_count/
lib.rs

1/*!
2# Words Count
3
4Count the words and characters, with or without whitespaces.
5
The algorithm is roughly aligned with the way LibreOffice counts words, so it does not exactly match the [Unicode Text Segmentation](https://unicode.org/reports/tr29/#Word_Boundaries) standard.
7
8## Examples
9
10```rust
11use words_count::WordsCount;
12
13assert_eq!(WordsCount {
14    words: 20,
15    characters: 31,
16    whitespaces: 2,
17    cjk: 18,
18}, words_count::count("Rust是由 Mozilla 主導開發的通用、編譯型程式語言。"));
19```
20
21```rust
22let result = words_count::count_separately("apple banana apple");
23
24assert_eq!(2, result.len());
25assert_eq!(Some(&2), result.get("apple"));
26```
27*/
28
29#![no_std]
30
31extern crate alloc;
32
33use alloc::collections::BTreeMap;
34use core::{
35    ops::{Add, AddAssign},
36    str::from_utf8_unchecked,
37};
38
/// Tallies produced by [`count`] for a piece of text.
///
/// Every counter is a plain `usize`, so the struct is `Copy` and can be compared,
/// hashed and passed around by value. Two tallies can be merged with `+` or `+=`.
#[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Hash)]
pub struct WordsCount {
    /// Number of words. Each CJK character is counted as a word of its own.
    pub words:       usize,
    /// Number of `char`s in the input, whitespace included.
    pub characters:  usize,
    /// Number of `char`s for which `char::is_whitespace` returns `true`.
    pub whitespaces: usize,
    /// Number of `char`s classified as CJK by `unicode_blocks::is_cjk`.
    pub cjk:         usize,
}
46
47/// A WordsCount equivalent to words_count::count("\n").
48///
49/// It is useful when processing files a line at a time.
50///
51/// ## Example
52///
53/// ```rust
54/// use words_count::{count, WordsCount, NEWLINE};
55///
56/// let mut total = WordsCount::default();
57/// for ln in std::io::stdin().lines() {
58///     total += count(ln.unwrap()) + NEWLINE;
59/// }
60/// println!("{total:?}");
61/// ```
62pub const NEWLINE: WordsCount =
63    WordsCount {
64        words: 0, characters: 1, whitespaces: 1, cjk: 0
65    };
66
67impl AddAssign for WordsCount {
68    #[inline]
69    fn add_assign(&mut self, other: Self) {
70        *self = Self {
71            words:       self.words + other.words,
72            characters:  self.characters + other.characters,
73            whitespaces: self.whitespaces + other.whitespaces,
74            cjk:         self.cjk + other.cjk,
75        }
76    }
77}
78
79impl Add for WordsCount {
80    type Output = Self;
81
82    #[inline]
83    fn add(mut self, other: Self) -> Self {
84        self += other;
85        self
86    }
87}
88
89/// Count the words in the given string. In general, every non-CJK string of characters between two whitespaces is a word. Dashes (at least two dashes) are word limit, too. A CJK character is considered to be an independent word.
90pub fn count<S: AsRef<str>>(s: S) -> WordsCount {
91    let mut in_word = false;
92    let mut consecutive_dashes = 0usize;
93
94    let mut count = WordsCount::default();
95
96    for c in s.as_ref().chars() {
97        count.characters += 1;
98
99        if c.is_whitespace() {
100            consecutive_dashes = 0;
101
102            count.whitespaces += 1;
103
104            if in_word {
105                count.words += 1;
106
107                in_word = false;
108            }
109        } else {
110            match c {
111                '-' => {
112                    consecutive_dashes += 1;
113
114                    if consecutive_dashes > 1 && in_word {
115                        if consecutive_dashes == 2 {
116                            count.words += 1;
117                        }
118
119                        in_word = false;
120
121                        continue;
122                    }
123                },
124                _ => {
125                    consecutive_dashes = 0;
126
127                    if unicode_blocks::is_cjk(c) {
128                        count.words += 1;
129                        count.cjk += 1;
130
131                        if in_word {
132                            count.words += 1;
133
134                            in_word = false;
135                        }
136
137                        continue;
138                    }
139                },
140            }
141
142            if !in_word {
143                in_word = true;
144            }
145        }
146    }
147
148    if in_word {
149        count.words += 1;
150    }
151
152    count
153}
154
155/// Count the words separately in the given string. In general, every non-CJK string of characters between two whitespaces is a word. Dashes (at least two dashes) are word limit, too. A CJK character is considered to be an independent word. Punctuations are not handled.
156pub fn count_separately<S: ?Sized + AsRef<str>>(s: &S) -> BTreeMap<&str, usize> {
157    let mut in_word = false;
158    let mut consecutive_dashes = 0usize;
159
160    let mut count = BTreeMap::new();
161
162    let mut p = 0;
163    let mut pp = 0;
164
165    let s = s.as_ref();
166    let bytes = s.as_bytes();
167
168    for c in s.chars() {
169        let cl = c.len_utf8();
170
171        if c.is_whitespace() {
172            if in_word {
173                inc_or_insert(&mut count, unsafe { from_utf8_unchecked(&bytes[p..pp]) });
174
175                in_word = false;
176            }
177
178            p = pp + cl;
179
180            consecutive_dashes = 0;
181        } else {
182            match c {
183                '-' => {
184                    consecutive_dashes += 1;
185
186                    if consecutive_dashes > 1 {
187                        if in_word {
188                            if consecutive_dashes == 2 {
189                                inc_or_insert(&mut count, unsafe {
190                                    from_utf8_unchecked(&bytes[p..(pp - 1)])
191                                });
192                            }
193
194                            in_word = false;
195
196                            pp += cl;
197                            p = pp;
198                            continue;
199                        } else {
200                            p = pp + cl;
201                        }
202                    }
203                },
204                _ => {
205                    if unicode_blocks::is_cjk(c) {
206                        inc_or_insert(&mut count, unsafe {
207                            from_utf8_unchecked(&bytes[pp..(pp + cl)])
208                        });
209
210                        if in_word {
211                            inc_or_insert(&mut count, unsafe {
212                                from_utf8_unchecked(&bytes[p..pp])
213                            });
214
215                            in_word = false;
216                        }
217
218                        consecutive_dashes = 0;
219                        pp += cl;
220                        p = pp;
221                        continue;
222                    }
223
224                    consecutive_dashes = 0;
225                },
226            }
227
228            if !in_word {
229                in_word = true;
230            }
231        }
232
233        pp += cl;
234    }
235
236    if in_word {
237        inc_or_insert(&mut count, unsafe { from_utf8_unchecked(&bytes[p..pp]) });
238    }
239
240    count
241}
242
/// Increments the counter stored under `s`, starting it at 1 when the key is new.
///
/// Uses the map's entry API so the tree is searched once per call, whether or not
/// the key already exists.
#[inline]
fn inc_or_insert<'a>(map: &mut BTreeMap<&'a str, usize>, s: &'a str) {
    *map.entry(s).or_insert(0) += 1;
}