freqdist/
lib.rs

1// Copyright 2016 rust-freqdist Developers
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8//! Implementation of a Frequency Distribution in Rust. Keeps track of how many
9//! times an object appears in a larger context (for example, how many times a
10//! word appears in a piece of text). The underlying data structure of the
11//! Frequency Distribution is a HashMap, so the object that is being counted
12//! must be hashable.
13//!
14//! # Example
15//!
16//! ```
17//! # use freqdist::FrequencyDistribution;
18//! #
19//! let mut fdist: FrequencyDistribution<&str> = FrequencyDistribution::new();
20//!
21//! fdist.insert("hello");
22//! fdist.insert("hello");
23//! fdist.insert("goodbye");
24//!
25//! assert_eq!(fdist.get(&"hello"), 2);
26//! ```
27
28#![warn(missing_docs)]
29#![cfg_attr(test, feature(test))]
30
31#[cfg(test)]
32extern crate test;
33
34use std::ops::Index;
35use std::default::Default;
36use std::hash::{Hasher, Hash, BuildHasher};
37use std::iter::{FromIterator, IntoIterator};
38use std::borrow::Borrow;
39use std::collections::HashMap;
40use std::collections::hash_map::{Keys, IntoIter, Iter, RandomState};
41
42
43static ZERO: usize = 0;
44
45
46#[allow(missing_docs)]
47pub struct FrequencyDistribution<K, S = RandomState> {
48  hashmap: HashMap<K, usize, S>,
49  sum_counts: usize,
50}
51
52impl<K, H, S> FrequencyDistribution<K, S>
53  where K: Eq + Hash,
54        H: Hasher,
55        S: BuildHasher<Hasher = H>
56{
57  /// Creates a new FrequencyDistrbution with a hasher and size, where
58  /// the size is known or can be estimated.
59  #[inline(always)]
60  pub fn with_capacity_and_hasher(size: usize, state: S) -> FrequencyDistribution<K, S> {
61    FrequencyDistribution {
62      hashmap: HashMap::with_capacity_and_hasher(size, state),
63      sum_counts: 0,
64    }
65  }
66
67  /// Creates a new FrequencyDistribution with a hasher and default size.
68  #[inline(always)]
69  pub fn with_hasher(state: S) -> FrequencyDistribution<K, S> {
70    FrequencyDistribution {
71      hashmap: HashMap::with_hasher(state),
72      sum_counts: 0,
73    }
74  }
75
76  /// Iterator over the keys.
77  #[inline(always)]
78  pub fn keys(&self) -> Keys<K, usize> {
79    self.hashmap.keys()
80  }
81
82  /// Iterator over the key, frequency pairs.
83  #[inline(always)]
84  pub fn iter(&self) -> Iter<K, usize> {
85    self.hashmap.iter()
86  }
87
88  /// Iterator over the non-zero frequency keys.
89  ///
90  /// # Example
91  ///
92  /// ```
93  /// # use std::iter::{FromIterator, IntoIterator};
94  /// # use freqdist::FrequencyDistribution;
95  /// #
96  /// let existing = vec![
97  ///   ("shoes", 1),
98  ///   ("scarf", 0),
99  ///   ("shirt", 13),
100  ///   ("pants", 4)
101  /// ];
102  ///
103  /// let fdist: FrequencyDistribution<&str> =
104  ///   FromIterator::from_iter(existing.into_iter());
105  /// let mut iter = fdist.iter_non_zero();
106  ///
107  /// assert!(iter.next().is_some());
108  /// assert!(iter.next().is_some());
109  /// assert!(iter.next().is_some());
110  /// assert!(iter.next().is_none());
111  /// ```
112  #[inline(always)]
113  pub fn iter_non_zero(&self) -> NonZeroKeysIter<K> {
114    NonZeroKeysIter { iter: self.iter() }
115  }
116
117  /// Sum of the total number of items counted thus far.
118  #[inline(always)]
119  pub fn sum_counts(&self) -> usize {
120    self.sum_counts
121  }
122
123  /// Returns the number of entries in the distribution
124  #[inline(always)]
125  pub fn len(&self) -> usize {
126    self.hashmap.len()
127  }
128
129  /// Gets the frequency in which the key occurs.
130  #[inline(always)]
131  pub fn get<Q: ?Sized>(&self, k: &Q) -> usize
132    where K: Borrow<Q>,
133          Q: Hash + Eq
134  {
135    self[k]
136  }
137
138  /// Clears the counts of all keys and clears all keys from
139  /// the distribution.
140  #[inline(always)]
141  pub fn clear(&mut self) {
142    self.hashmap.clear()
143  }
144
145  /// Updates the frequency of the value found with the key if it
146  /// already exists. Otherwise, inserts the key sizeo the hashmap,
147  /// and sets its frequency to 1.
148  #[inline(always)]
149  pub fn insert(&mut self, k: K) {
150    self.insert_or_incr_by(k, 1);
151  }
152
153  /// Removes an item and its associated counts.
154  #[inline(always)]
155  pub fn remove<Q: ?Sized>(&mut self, k: &Q)
156    where K: Borrow<Q>,
157          Q: Hash + Eq
158  {
159    match self.hashmap.remove(k) {
160      Some(count) => self.sum_counts -= count,
161      None => (),
162    }
163  }
164
165  /// Inserts a value sizeo the hashmap if it does not exist with a new quantity
166  /// specified by the increment. If the value already exists, increments by
167  /// the specified amount.
168  #[inline]
169  fn insert_or_incr_by(&mut self, k: K, incr: usize) {
170    if !self.hashmap.contains_key(&k) {
171      self.hashmap.insert(k, incr);
172    } else {
173      *self.hashmap.get_mut(&k).unwrap() += incr;
174    }
175
176    self.sum_counts += incr;
177  }
178}
179
180impl<K, H, S> FrequencyDistribution<K, S>
181  where K: Eq + Hash,
182        H: Hasher + Default,
183        S: BuildHasher<Hasher = H> + Default
184{
185  /// Creates a new FrequencyDistribution where the size of the
186  /// HashMap is unknown.
187  #[inline(always)]
188  pub fn new() -> FrequencyDistribution<K, S> {
189    FrequencyDistribution::with_hasher(Default::default())
190  }
191
192  /// Creates a new FrequencyDistribution where the size of the HashMap
193  /// is known, or a estimate can be made.
194  #[inline(always)]
195  pub fn with_capacity(size: usize) -> FrequencyDistribution<K, S> {
196    FrequencyDistribution::with_capacity_and_hasher(size, Default::default())
197  }
198}
199
200impl<K, H, S> Default for FrequencyDistribution<K, S>
201  where K: Eq + Hash,
202        H: Hasher + Default,
203        S: BuildHasher<Hasher = H> + Default
204{
205  /// Creates a default FrequencyDistribution.
206  #[inline(always)]
207  fn default() -> FrequencyDistribution<K, S> {
208    FrequencyDistribution::new()
209  }
210}
211
212impl<K, H, S> FromIterator<(K, usize)> for FrequencyDistribution<K, S>
213  where K: Eq + Hash,
214        H: Hasher,
215        S: BuildHasher<Hasher = H> + Default
216{
217  /// Iterates through an iterator, and creates a new FrequencyDistribution from
218  /// it. The iterator should be an iterator over keys and frequencies. If a
219  /// upper bounded `size_hsize` is available, then it is used, otherwise the lower
220  /// bounded `size_hsize` is used.
221  ///
222  /// # Example
223  ///
224  /// ```
225  /// # use std::iter::FromIterator;
226  /// # use freqdist::FrequencyDistribution;
227  /// #
228  /// let existing = vec![
229  ///   ("apples", 3),
230  ///   ("oranges", 4),
231  ///   ("bannana", 7)
232  /// ];
233  ///
234  /// let fdist: FrequencyDistribution<&str> =
235  ///   FromIterator::from_iter(existing.into_iter());
236  ///
237  /// assert_eq!(fdist.get(&"apples"), 3);
238  /// assert_eq!(fdist.get(&"oranges"), 4);
239  /// assert_eq!(fdist.get(&"bannana"), 7);
240  /// ```
241  fn from_iter<T>(iter: T) -> FrequencyDistribution<K, S>
242    where T: IntoIterator<Item = (K, usize)>
243  {
244    let iterator = iter.into_iter();
245    let mut fdist = if iterator.size_hint().1.is_some() {
246      FrequencyDistribution::with_capacity_and_hasher(iterator.size_hint().1.unwrap(),
247                                                      Default::default())
248    } else {
249      FrequencyDistribution::with_capacity_and_hasher(iterator.size_hint().0, Default::default())
250    };
251
252    for (k, freq) in iterator {
253      fdist.insert_or_incr_by(k, freq);
254    }
255
256    fdist
257  }
258}
259
260impl<K, H, S> Extend<(K, usize)> for FrequencyDistribution<K, S>
261  where K: Eq + Hash,
262        H: Hasher,
263        S: BuildHasher<Hasher = H>
264{
265  /// Extends the hashmap by adding the keys or updating the frequencies of the keys.
266  fn extend<T>(&mut self, iter: T)
267    where T: IntoIterator<Item = (K, usize)>
268  {
269    for (k, freq) in iter.into_iter() {
270      self.insert_or_incr_by(k, freq);
271    }
272  }
273}
274
275impl<K, H, S> IntoIterator for FrequencyDistribution<K, S>
276  where K: Eq + Hash,
277        H: Hasher,
278        S: BuildHasher<Hasher = H>
279{
280  type Item = (K, usize);
281  type IntoIter = IntoIter<K, usize>;
282
283  /// Consumes the distribution, and creates an iterator over the
284  /// (Key, Quantity: usize) pairs.
285  #[inline]
286  fn into_iter(self) -> IntoIter<K, usize> {
287    self.hashmap.into_iter()
288  }
289}
290
291impl<'a, K, H, S, Q: ?Sized> Index<&'a Q> for FrequencyDistribution<K, S>
292  where K: Eq + Hash + Borrow<Q>,
293        H: Hasher,
294        S: BuildHasher<Hasher = H>,
295        Q: Eq + Hash
296{
297  type Output = usize;
298
299  #[inline]
300  fn index<'b>(&'b self, index: &Q) -> &'b usize {
301    self.hashmap.get(index).unwrap_or(&ZERO)
302  }
303}
304
305/// Iterator over entries with non-zero quantities.
306pub struct NonZeroKeysIter<'a, K: 'a> {
307  iter: Iter<'a, K, usize>,
308}
309
310impl<'a, K: 'a> Iterator for NonZeroKeysIter<'a, K> {
311  type Item = &'a K;
312
313  #[inline(always)]
314  fn next(&mut self) -> Option<&'a K> {
315    loop {
316      match self.iter.next() {
317        Some((k, c)) if *c > 0 => return Some(k),
318        None => return None,
319        _ => (),
320      }
321    }
322  }
323}
324
325#[test]
326fn smoke_test_frequency_distribution_insert() {
327  let words = vec!["alpha", "beta"];
328  let mut dist: FrequencyDistribution<&str> = FrequencyDistribution::new();
329
330  dist.insert(words[0]);
331
332  assert_eq!(dist.get(&words[0]), 1);
333
334  dist.insert(words[1]);
335
336  assert_eq!(dist.get(&words[1]), 1);
337
338  for _ in 0..7u32 {
339    dist.insert(words[0]);
340  }
341
342  assert_eq!(dist.get(&words[0]), 8);
343}
344
345#[test]
346fn smoke_test_frequency_distribution_iter() {
347  let words = vec![("a", 50usize), ("b", 100usize), ("c", 75usize), ("d", 0usize)];
348  let dist: FrequencyDistribution<&str> = FromIterator::from_iter(words.into_iter());
349
350  assert_eq!(dist.get(&"a"), 50);
351  assert_eq!(dist.get(&"b"), 100);
352  assert_eq!(dist.get(&"c"), 75);
353
354  let mut iter = dist.iter_non_zero();
355
356  assert!(iter.next().is_some());
357  assert!(iter.next().is_some());
358  assert!(iter.next().is_some());
359  assert!(iter.next().is_none());
360
361  assert_eq!(dist.sum_counts(), 225);
362}
363
364#[test]
365fn smoke_test_frequency_distribution_remove() {
366  let words = vec![("a", 50usize), ("b", 100usize), ("c", 25usize)];
367  let mut dist: FrequencyDistribution<&str> = FromIterator::from_iter(words.into_iter());
368
369  assert_eq!(dist.get(&"a"), 50);
370
371  dist.remove(&"a");
372
373  assert_eq!(dist.get(&"a"), 0);
374  assert_eq!(dist.sum_counts(), 125);
375}
376
377#[test]
378fn smoke_test_frequency_sum_counts() {
379  let words = vec![("a", 7usize), ("b", 5usize), ("c", 8usize), ("d", 3usize)];
380  let mut dist: FrequencyDistribution<&str> = FromIterator::from_iter(words.into_iter());
381
382  assert_eq!(dist.sum_counts(), 23);
383
384  dist.insert("e");
385
386  assert_eq!(dist.sum_counts(), 24);
387}