// Copyright 2016 rust-tfidf Developers
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

//! Library to calculate TF-IDF (Term Frequency - Inverse Document Frequency)
//! for generic documents. The library provides strategies to act on objects
//! that implement certain document traits (`NaiveDocument`, `ProcessedDocument`,
//! `ExpandableDocument`).
//!
//! For more information on the strategies that were implemented, check out
//! [Wikipedia](http://en.wikipedia.org/wiki/Tf%E2%80%93idf).
//!
//! # Document Types
//!
//! A document is defined as a collection of terms. The document types make no
//! assumptions about the term type (terms are not normalized in any way).
//!
//! These document types are of my design. The terminology isn't standard, but
//! the types are fairly straightforward to understand.
//!
//!   * `NaiveDocument` - A document is 'naive' if it only knows whether a term
//!     is contained within it or not, but not how many instances of the term
//!     it contains.
//!
//!   * `ProcessedDocument` - A document is 'processed' if it knows how many
//!     instances of each term are contained within it.
//!
//!   * `ExpandableDocument` - A document is 'expandable' if it provides a way
//!     to access each term contained within it.
//!
//! # Example
//!
//! The simplest way to calculate the tf-idf of a document is with the default
//! implementation. Note that the library provides implementations of
//! `ProcessedDocument` for `Vec<(T, usize)>`, `HashMap<T, usize>`, and
//! `BTreeMap<T, usize>`.
//!
//! ```rust
//! use tfidf::{TfIdf, TfIdfDefault};
//!
//! let mut docs = Vec::new();
//! let doc1 = vec![("a", 3), ("b", 2), ("c", 4)];
//! let doc2 = vec![("a", 2), ("d", 5)];
//!
//! docs.push(doc1);
//! docs.push(doc2);
//!
//! assert_eq!(0f64, TfIdfDefault::tfidf("a", &docs[0], docs.iter()));
//! assert!(TfIdfDefault::tfidf("c", &docs[0], docs.iter()) > 0.5);
//! ```
//!
//! You can also roll your own tf-idf strategy by combining the term frequency
//! and inverse document frequency weighting schemes included in the library.
//!
//! ```rust
//! use tfidf::{TfIdf, ProcessedDocument};
//! use tfidf::tf::{RawFrequencyTf};
//! use tfidf::idf::{InverseFrequencySmoothIdf};
//!
//! #[derive(Copy, Clone)] struct MyTfIdfStrategy;
//!
//! impl<T> TfIdf<T> for MyTfIdfStrategy where T : ProcessedDocument {
//!   type Tf = RawFrequencyTf;
//!   type Idf = InverseFrequencySmoothIdf;
//! }
//!
//! # let mut docs = Vec::new();
//! # let doc1 = vec![("a", 3), ("b", 2), ("c", 4)];
//! # let doc2 = vec![("a", 2), ("d", 5)];
//!
//! # docs.push(doc1);
//! # docs.push(doc2);
//!
//! assert!(MyTfIdfStrategy::tfidf("a", &docs[0], docs.iter()) > 0f64);
//! assert!(MyTfIdfStrategy::tfidf("c", &docs[0], docs.iter()) > 0f64);
//! ```
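//!
//! # Custom document types
//!
//! Any type can be used with these strategies once it implements the relevant
//! document traits. Below is a minimal sketch of a `ProcessedDocument`
//! implementation for a hypothetical `MyDoc` type (not part of the library),
//! mirroring the implementation provided for `Vec<(T, usize)>`.
//!
//! ```rust
//! use std::borrow::Borrow;
//! use tfidf::{Document, ProcessedDocument};
//!
//! // A toy document type that stores `(term, count)` pairs.
//! struct MyDoc(Vec<(String, usize)>);
//!
//! impl Document for MyDoc {
//!   type Term = String;
//! }
//!
//! impl ProcessedDocument for MyDoc {
//!   fn term_frequency<K>(&self, term: K) -> usize
//!   where
//!     K: Borrow<String>,
//!   {
//!     match self.0.iter().find(|&&(ref t, _)| t == term.borrow()) {
//!       Some(&(_, c)) => c,
//!       None => 0,
//!     }
//!   }
//!
//!   fn max(&self) -> Option<&String> {
//!     match self.0.iter().max_by_key(|&&(_, c)| c) {
//!       Some(&(ref t, _)) => Some(t),
//!       None => None,
//!     }
//!   }
//! }
//!
//! let doc = MyDoc(vec![("a".to_string(), 2), ("b".to_string(), 1)]);
//! assert_eq!(2, doc.term_frequency("a".to_string()));
//! assert_eq!(Some(&"a".to_string()), doc.max());
//! ```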

#![deny(missing_docs)]

pub use prelude::{
  Document, ExpandableDocument, NaiveDocument, NormalizationFactor, ProcessedDocument,
  SmoothingFactor, Tf, TfIdf, Idf,
};

use std::borrow::Borrow;
use std::collections::{BTreeMap, HashMap};
use std::hash::Hash;

mod prelude;

/// Implementations of different weighting schemes for term frequency (tf).
/// For more information about which ones are implemented, check the Wiki
/// link in the crate description.
pub mod tf;

/// Implementations of different weighting schemes for inverse document
/// frequency (IDF). For more information about which ones are implemented,
/// check the Wiki link in the crate description.
pub mod idf;

/// Default scheme for calculating tf-idf, combining a double (0.5) normalized
/// term frequency (`tf::DoubleHalfNormalizationTf`) with the plain inverse
/// document frequency (`idf::InverseFrequencyIdf`).
#[derive(Copy, Clone)]
pub struct TfIdfDefault;

impl<T> TfIdf<T> for TfIdfDefault
where
  T: ProcessedDocument,
{
  type Tf = tf::DoubleHalfNormalizationTf;
  type Idf = idf::InverseFrequencyIdf;
}

impl<T> Document for Vec<(T, usize)> {
  type Term = T;
}

impl<T> ProcessedDocument for Vec<(T, usize)>
where
  T: PartialEq,
{
  fn term_frequency<K>(&self, term: K) -> usize
  where
    K: Borrow<T>,
  {
    match self.iter().find(|&&(ref t, _)| t == term.borrow()) {
      Some(&(_, c)) => c,
      None => 0,
    }
  }

  fn max(&self) -> Option<&T> {
    match self.iter().max_by_key(|&&(_, c)| c) {
      Some(&(ref t, _)) => Some(t),
      None => None,
    }
  }
}

impl<T> Document for HashMap<T, usize> {
  type Term = T;
}

impl<T> ProcessedDocument for HashMap<T, usize>
where
  T: Eq + Hash,
{
  fn term_frequency<K>(&self, term: K) -> usize
  where
    K: Borrow<Self::Term>,
  {
    if let Some(v) = self.get(term.borrow()) {
      *v
    } else {
      0
    }
  }

  fn max(&self) -> Option<&Self::Term> {
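    // `max_by_key` keeps the last maximum it sees; since `HashMap` iteration
    // order is unspecified, ties between equally frequent terms resolve to an
    // arbitrary term.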
    self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k)
  }
}

impl<T> Document for BTreeMap<T, usize> {
  type Term = T;
}

impl<T> ProcessedDocument for BTreeMap<T, usize>
where
  T: Ord,
{
  fn term_frequency<K>(&self, term: K) -> usize
  where
    K: Borrow<Self::Term>,
  {
    if let Some(v) = self.get(term.borrow()) {
      *v
    } else {
      0
    }
  }

  fn max(&self) -> Option<&Self::Term> {
    self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k)
  }
}

#[test]
fn tfidf_wiki_example_tests() {
  // Term counts for the two example documents from the Wikipedia article.
  let mut docs = Vec::new();

  docs.push(vec![("this", 1), ("is", 1), ("a", 2), ("sample", 1)]);
  docs.push(vec![("this", 1), ("is", 1), ("another", 2), ("example", 3)]);

  // "this" occurs in every document, so its idf (and therefore its tf-idf) is 0.
  assert_eq!(0f64, TfIdfDefault::tfidf("this", &docs[0], docs.iter()));
  // "example" occurs only in the second document, so its tf-idf is positive.
  assert!(TfIdfDefault::tfidf("example", &docs[1], docs.iter()) > 0f64);
}
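
// An additional sanity check (a minimal sketch, not from the original test
// suite): the `HashMap`-backed `ProcessedDocument` implementation should give
// the same qualitative results as the `Vec`-backed doc examples above.
#[test]
fn tfidf_hashmap_document_test() {
  let doc1: HashMap<&str, usize> = vec![("a", 3), ("b", 2), ("c", 4)].into_iter().collect();
  let doc2: HashMap<&str, usize> = vec![("a", 2), ("d", 5)].into_iter().collect();
  let docs = vec![doc1, doc2];

  // "a" appears in every document, so its idf (and therefore its tf-idf) is 0.
  assert_eq!(0f64, TfIdfDefault::tfidf("a", &docs[0], docs.iter()));
  // "c" appears in only one of the two documents, so its tf-idf is positive.
  assert!(TfIdfDefault::tfidf("c", &docs[0], docs.iter()) > 0f64);
}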