tfidf/
prelude.rs

1// Copyright 2016 rust-tfidf Developers
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8use std::borrow::Borrow;
9
/// A body of terms.
///
/// This trait only names the kind of term a document is made of; it declares
/// no methods of its own. The other document traits in this file
/// (`NaiveDocument`, `ProcessedDocument`, `ExpandableDocument`) all require it
/// as a supertrait and refer to `Term` in their signatures.
pub trait Document {
  /// The type of term that the document consists of.
  type Term;
}
15
16/// A naive document with a simple function stating whether or not a
17/// term exists in the document or not. The document is naive , which
18/// means the frequencies of each term has yet to be determined. This
19/// type of document is useful for only some TF weighting schemes.
20pub trait NaiveDocument: Document {
21  /// Returns if a (non-normalized) term exists within the document.
22  fn term_exists<K>(&self, term: K) -> bool
23  where
24    K: Borrow<Self::Term>;
25}
26
27/// A document where the frequencies of each term is already calculated.
28pub trait ProcessedDocument: Document {
29  /// Returns the number of times a (non-normalized) term exists
30  /// within the document.
31  fn term_frequency<K>(&self, term: K) -> usize
32  where
33    K: Borrow<Self::Term>;
34
35  /// Returns the term with the highest frequency, or tied for the highest
36  /// frequency.
37  fn max(&self) -> Option<&Self::Term>;
38}
39
40/// A document that can be expanded to a collection of terms.
41pub trait ExpandableDocument<'a>: Document
42where
43  <Self as Document>::Term: 'a,
44{
45  /// The type of iterator that this implementor returns.
46  type TermIterator: Iterator<Item = &'a Self::Term>;
47
48  /// An iterator over the terms in the document.
49  fn terms(&self) -> Self::TermIterator;
50}
51
52impl<D, T> NaiveDocument for D
53where
54  D: ProcessedDocument<Term = T>,
55{
56  #[inline]
57  fn term_exists<K>(&self, term: K) -> bool
58  where
59    K: Borrow<T>,
60  {
61    self.term_frequency(term) > 0
62  }
63}
64
65/// A strategy to calculate a weighted or unweighted term frequency (tf)
66/// score of a term from a document.
67pub trait Tf<T>
68where
69  T: NaiveDocument,
70{
71  /// Returns the weighted or unweighted term frequency (tf) for a single
72  /// term within a document.
73  fn tf<K>(term: K, doc: &T) -> f64
74  where
75    K: Borrow<T::Term>;
76}
77
78/// A strategy to calculate a weighted or unweighted inverse document frequency
79/// (idf) for a single term within a corpus of documents.
80pub trait Idf<T>
81where
82  T: NaiveDocument,
83{
84  /// Returns the weighted or unweighted inverse document frequency (idf)
85  /// for a single term within a corpus of documents.
86  fn idf<'a, I, K>(term: K, docs: I) -> f64
87  where
88    I: Iterator<Item = &'a T>,
89    K: Borrow<T::Term>,
90    T: 'a;
91}
92
/// A strategy that uses a normalization factor.
///
/// The factor is exposed as an associated function without a receiver, so it
/// is a property of the implementing type rather than of a value. How the
/// factor enters a score is decided by the weighting scheme that reads it.
///
/// Note that `SmoothingFactor` declares a function with the same name; a type
/// implementing both traits must be called with fully-qualified syntax, e.g.
/// `<S as NormalizationFactor>::factor()`.
pub trait NormalizationFactor {
  /// Returns a normalization factor.
  fn factor() -> f64;
}
98
/// A strategy that uses a smoothing factor.
///
/// The factor is exposed as an associated function without a receiver, so it
/// is a property of the implementing type rather than of a value. How the
/// factor enters a score is decided by the weighting scheme that reads it.
///
/// Note that `NormalizationFactor` declares a function with the same name; a
/// type implementing both traits must be called with fully-qualified syntax,
/// e.g. `<S as SmoothingFactor>::factor()`.
pub trait SmoothingFactor {
  /// Returns a smoothing factor.
  fn factor() -> f64;
}
104
105/// Trait to create a strategy to calculate a tf-idf.
106pub trait TfIdf<T>
107where
108  T: NaiveDocument,
109{
110  /// The tf weighting scheme.
111  type Tf: Tf<T>;
112
113  /// The idf weighting scheme.
114  type Idf: Idf<T>;
115
116  /// Calculates the tf-idf using the two weighting schemes chosen.
117  fn tfidf<'a, K, I>(term: K, doc: &T, docs: I) -> f64
118  where
119    I: Iterator<Item = &'a T>,
120    K: Borrow<T::Term>,
121    T: 'a,
122  {
123    Self::Tf::tf(term.borrow(), doc) * Self::Idf::idf(term.borrow(), docs)
124  }
125}