tfidf/
lib.rs

1// Copyright 2016 rust-tfidf Developers
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8//! Library to calculate TF-IDF (Term Frequency - Inverse Document Frequency)
9//! for generic documents. The library provides strategies to act on objects
10//! that implement certain document traits (`NaiveDocument`, `ProcessedDocument`,
11//! `ExpandableDocument`).
12//!
13//! For more information on the strategies that were implemented, check out
14//! [Wikipedia](http://en.wikipedia.org/wiki/Tf%E2%80%93idf).
15//!
16//! # Document Types
17//!
18//! A document is defined as a collection of terms. The documents don't make
19//! assumptions about the term types (the terms are not normalized in any way).
20//!
//! These document types are of my design. The terminology isn't standard, but
//! they are fairly straightforward to understand.
23//!
//!   * `NaiveDocument` - A document is 'naive' if it only knows whether a term
//!     is contained within it or not, but does not know HOW MANY instances of
//!     the term it contains.
27//!
//!   * `ProcessedDocument` - A document is 'processed' if it knows how many
//!     instances of each term are contained within it.
30//!
//!   * `ExpandableDocument` - A document is 'expandable' if it provides a way
//!     to access each term contained within it.
33//!
34//! # Example
35//!
//! The simplest way to calculate the TfIdf of a document is with the default
//! implementation. Note that the library provides implementations of
//! `ProcessedDocument` for `Vec<(T, usize)>`, `HashMap<T, usize>`, and
//! `BTreeMap<T, usize>`.
39//!
40//! ```rust
41//! use tfidf::{TfIdf, TfIdfDefault};
42//!
43//! let mut docs = Vec::new();
44//! let doc1 = vec![("a", 3), ("b", 2), ("c", 4)];
45//! let doc2 = vec![("a", 2), ("d", 5)];
46//!
47//! docs.push(doc1);
48//! docs.push(doc2);
49//!
50//! assert_eq!(0f64, TfIdfDefault::tfidf("a", &docs[0], docs.iter()));
51//! assert!(TfIdfDefault::tfidf("c", &docs[0], docs.iter()) > 0.5);
52//! ```
53//!
54//! You can also roll your own strategies to calculate tf-idf using some strategies
55//! included in the library.
56//!
57//! ```rust
58//! use tfidf::{TfIdf, ProcessedDocument};
59//! use tfidf::tf::{RawFrequencyTf};
60//! use tfidf::idf::{InverseFrequencySmoothIdf};
61//!
62//! #[derive(Copy, Clone)] struct MyTfIdfStrategy;
63//!
64//! impl<T> TfIdf<T> for MyTfIdfStrategy where T : ProcessedDocument {
65//!   type Tf = RawFrequencyTf;
66//!   type Idf = InverseFrequencySmoothIdf;
67//! }
68//!
69//! # let mut docs = Vec::new();
70//! # let doc1 = vec![("a", 3), ("b", 2), ("c", 4)];
71//! # let doc2 = vec![("a", 2), ("d", 5)];
72//!
73//! # docs.push(doc1);
74//! # docs.push(doc2);
75//!
76//! assert!(MyTfIdfStrategy::tfidf("a", &docs[0], docs.iter()) > 0f64);
77//! assert!(MyTfIdfStrategy::tfidf("c", &docs[0], docs.iter()) > 0f64);
78//! ```
79
80#![deny(missing_docs)]
81
82pub use prelude::{
83  Document, ExpandableDocument, NaiveDocument, NormalizationFactor, ProcessedDocument,
84  SmoothingFactor, Tf, TfIdf, Idf,
85};
86
87use std::borrow::Borrow;
88use std::collections::{BTreeMap, HashMap};
89use std::hash::Hash;
90
91mod prelude;
92
93/// Implementations of different weighting schemes for term frequency (tf).
94/// For more information about which ones are implemented, check the Wiki
95/// link in the crate description.
96pub mod tf;
97
98/// Implementations of different weighting schemes for inverse document
99/// frequency (IDF). For more information about which ones are implemented,
100/// check the Wiki link in the crate description.
101pub mod idf;
102
/// Default scheme for calculating tf-idf.
///
/// This is a unit struct: it holds no data and serves only as the type on
/// which the default `TfIdf` implementation is hung, so the strategy can be
/// invoked by name, e.g. `TfIdfDefault::tfidf(term, doc, docs)`.
#[derive(Copy, Clone, Debug)]
pub struct TfIdfDefault;
106
107impl<T> TfIdf<T> for TfIdfDefault
108where
109  T: ProcessedDocument,
110{
111  type Tf = tf::DoubleHalfNormalizationTf;
112  type Idf = idf::InverseFrequencyIdf;
113}
114
115impl<T> Document for Vec<(T, usize)> {
116  type Term = T;
117}
118
119impl<T> ProcessedDocument for Vec<(T, usize)>
120where
121  T: PartialEq,
122{
123  fn term_frequency<K>(&self, term: K) -> usize
124  where
125    K: Borrow<T>,
126  {
127    match self.iter().find(|&&(ref t, _)| t == term.borrow()) {
128      Some(&(_, c)) => c,
129      None => 0,
130    }
131  }
132
133  fn max(&self) -> Option<&T> {
134    match self.iter().max_by_key(|&&(_, c)| c) {
135      Some(&(ref t, _)) => Some(t),
136      None => None,
137    }
138  }
139}
140
141impl<T> Document for HashMap<T, usize> {
142  type Term = T;
143}
144
145impl<T> ProcessedDocument for HashMap<T, usize>
146where
147  T: Eq + Hash,
148{
149  fn term_frequency<K>(&self, term: K) -> usize
150  where
151    K: Borrow<Self::Term>,
152  {
153    if let Some(v) = self.get(term.borrow()) {
154      *v
155    } else {
156      0
157    }
158  }
159
160  fn max(&self) -> Option<&Self::Term> {
161    self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k)
162  }
163}
164
165impl<T> Document for BTreeMap<T, usize> {
166  type Term = T;
167}
168
169impl<T> ProcessedDocument for BTreeMap<T, usize>
170where
171  T: Ord,
172{
173  fn term_frequency<K>(&self, term: K) -> usize
174  where
175    K: Borrow<Self::Term>,
176  {
177    if let Some(v) = self.get(term.borrow()) {
178      *v
179    } else {
180      0
181    }
182  }
183
184  fn max(&self) -> Option<&Self::Term> {
185    self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k)
186  }
187}
188
/// Runs the default strategy over a two-document corpus (the function name
/// points at the Wikipedia tf-idf example) and checks the same two properties
/// the crate-level doc example checks.
#[test]
fn tfidf_wiki_example_tests() {
  let docs = vec![
    vec![("this", 1), ("is", 1), ("a", 2), ("sample", 1)],
    vec![("this", 1), ("is", 1), ("another", 2), ("example", 3)],
  ];

  // "this" appears in every document of the corpus, so it carries no weight.
  assert_eq!(0f64, TfIdfDefault::tfidf("this", &docs[0], docs.iter()));

  // "example" appears only in the second document, where it also has the
  // highest count, so it must receive a clearly positive weight.
  assert!(TfIdfDefault::tfidf("example", &docs[1], docs.iter()) > 0.5);
}