tfidf/lib.rs
1// Copyright 2016 rust-tfidf Developers
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8//! Library to calculate TF-IDF (Term Frequency - Inverse Document Frequency)
9//! for generic documents. The library provides strategies to act on objects
10//! that implement certain document traits (`NaiveDocument`, `ProcessedDocument`,
11//! `ExpandableDocument`).
12//!
13//! For more information on the strategies that were implemented, check out
14//! [Wikipedia](http://en.wikipedia.org/wiki/Tf%E2%80%93idf).
15//!
16//! # Document Types
17//!
18//! A document is defined as a collection of terms. The documents don't make
19//! assumptions about the term types (the terms are not normalized in any way).
20//!
//! These document types are of my design. The terminology isn't standard, but
//! they are fairly straightforward to understand.
23//!
//! * `NaiveDocument` - A document is 'naive' if it only knows whether a term is
//!   contained within it or not, but does not know how many instances of the
//!   term it contains.
27//!
//! * `ProcessedDocument` - A document is 'processed' if it knows how many
//!   instances of each term are contained within it.
30//!
//! * `ExpandableDocument` - A document is 'expandable' if it provides a way to
//!   access each term contained within it.
33//!
34//! # Example
35//!
//! The simplest way to calculate the TfIdf of a document is with the default
//! implementation. Note that the library provides an implementation of
//! `ProcessedDocument` for `Vec<(T, usize)>`.
39//!
40//! ```rust
41//! use tfidf::{TfIdf, TfIdfDefault};
42//!
43//! let mut docs = Vec::new();
44//! let doc1 = vec![("a", 3), ("b", 2), ("c", 4)];
45//! let doc2 = vec![("a", 2), ("d", 5)];
46//!
47//! docs.push(doc1);
48//! docs.push(doc2);
49//!
50//! assert_eq!(0f64, TfIdfDefault::tfidf("a", &docs[0], docs.iter()));
51//! assert!(TfIdfDefault::tfidf("c", &docs[0], docs.iter()) > 0.5);
52//! ```
53//!
54//! You can also roll your own strategies to calculate tf-idf using some strategies
55//! included in the library.
56//!
57//! ```rust
58//! use tfidf::{TfIdf, ProcessedDocument};
59//! use tfidf::tf::{RawFrequencyTf};
60//! use tfidf::idf::{InverseFrequencySmoothIdf};
61//!
62//! #[derive(Copy, Clone)] struct MyTfIdfStrategy;
63//!
64//! impl<T> TfIdf<T> for MyTfIdfStrategy where T : ProcessedDocument {
65//! type Tf = RawFrequencyTf;
66//! type Idf = InverseFrequencySmoothIdf;
67//! }
68//!
69//! # let mut docs = Vec::new();
70//! # let doc1 = vec![("a", 3), ("b", 2), ("c", 4)];
71//! # let doc2 = vec![("a", 2), ("d", 5)];
72//!
73//! # docs.push(doc1);
74//! # docs.push(doc2);
75//!
76//! assert!(MyTfIdfStrategy::tfidf("a", &docs[0], docs.iter()) > 0f64);
77//! assert!(MyTfIdfStrategy::tfidf("c", &docs[0], docs.iter()) > 0f64);
78//! ```
79
80#![deny(missing_docs)]
81
82pub use prelude::{
83 Document, ExpandableDocument, NaiveDocument, NormalizationFactor, ProcessedDocument,
84 SmoothingFactor, Tf, TfIdf, Idf,
85};
86
87use std::borrow::Borrow;
88use std::collections::{BTreeMap, HashMap};
89use std::hash::Hash;
90
91mod prelude;
92
93/// Implementations of different weighting schemes for term frequency (tf).
94/// For more information about which ones are implemented, check the Wiki
95/// link in the crate description.
96pub mod tf;
97
98/// Implementations of different weighting schemes for inverse document
99/// frequency (IDF). For more information about which ones are implemented,
100/// check the Wiki link in the crate description.
101pub mod idf;
102
/// Default scheme for calculating tf-idf.
///
/// This is a zero-sized marker type: it carries no data and only selects the
/// term-frequency and inverse-document-frequency strategies through its
/// `TfIdf` implementation. The common standard traits are derived so the
/// marker can be printed, compared, hashed and default-constructed like any
/// other plain value.
#[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash)]
pub struct TfIdfDefault;
106
107impl<T> TfIdf<T> for TfIdfDefault
108where
109 T: ProcessedDocument,
110{
111 type Tf = tf::DoubleHalfNormalizationTf;
112 type Idf = idf::InverseFrequencyIdf;
113}
114
115impl<T> Document for Vec<(T, usize)> {
116 type Term = T;
117}
118
119impl<T> ProcessedDocument for Vec<(T, usize)>
120where
121 T: PartialEq,
122{
123 fn term_frequency<K>(&self, term: K) -> usize
124 where
125 K: Borrow<T>,
126 {
127 match self.iter().find(|&&(ref t, _)| t == term.borrow()) {
128 Some(&(_, c)) => c,
129 None => 0,
130 }
131 }
132
133 fn max(&self) -> Option<&T> {
134 match self.iter().max_by_key(|&&(_, c)| c) {
135 Some(&(ref t, _)) => Some(t),
136 None => None,
137 }
138 }
139}
140
141impl<T> Document for HashMap<T, usize> {
142 type Term = T;
143}
144
145impl<T> ProcessedDocument for HashMap<T, usize>
146where
147 T: Eq + Hash,
148{
149 fn term_frequency<K>(&self, term: K) -> usize
150 where
151 K: Borrow<Self::Term>,
152 {
153 if let Some(v) = self.get(term.borrow()) {
154 *v
155 } else {
156 0
157 }
158 }
159
160 fn max(&self) -> Option<&Self::Term> {
161 self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k)
162 }
163}
164
165impl<T> Document for BTreeMap<T, usize> {
166 type Term = T;
167}
168
169impl<T> ProcessedDocument for BTreeMap<T, usize>
170where
171 T: Ord,
172{
173 fn term_frequency<K>(&self, term: K) -> usize
174 where
175 K: Borrow<Self::Term>,
176 {
177 if let Some(v) = self.get(term.borrow()) {
178 *v
179 } else {
180 0
181 }
182 }
183
184 fn max(&self) -> Option<&Self::Term> {
185 self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k)
186 }
187}
188
/// Checks the default tf-idf scheme against a small two-document corpus
/// (presumably the worked example from the Wikipedia tf-idf article, going
/// by the test name).
#[test]
fn tfidf_wiki_example_tests() {
    // The count type is spelled out so the documents are unambiguously
    // `Vec<(&str, usize)>`, the vector type with a `ProcessedDocument` impl.
    let docs: Vec<Vec<(&str, usize)>> = vec![
        vec![("this", 1), ("is", 1), ("a", 2), ("sample", 1)],
        vec![("this", 1), ("is", 1), ("another", 2), ("example", 3)],
    ];

    // "this" occurs in every document, so the default scheme scores it zero,
    // mirroring the crate-level example for a term shared by all documents.
    assert_eq!(0f64, TfIdfDefault::tfidf("this", &docs[0], docs.iter()));
    assert_eq!(0f64, TfIdfDefault::tfidf("this", &docs[1], docs.iter()));

    // "example" occurs only in the second document, so it must score above zero.
    assert!(TfIdfDefault::tfidf("example", &docs[1], docs.iter()) > 0f64);
}