// tfidf/prelude.rs
// Copyright 2016 rust-tfidf Developers
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

use std::borrow::Borrow;

/// A body of terms.
///
/// This trait only names the kind of term a document is made of; the other
/// document traits in this file build on it as a supertrait.
pub trait Document {
    /// The type of term that the document consists of.
    type Term;
}

16/// A naive document with a simple function stating whether or not a
17/// term exists in the document or not. The document is naive , which
18/// means the frequencies of each term has yet to be determined. This
19/// type of document is useful for only some TF weighting schemes.
20pub trait NaiveDocument: Document {
21 /// Returns if a (non-normalized) term exists within the document.
22 fn term_exists<K>(&self, term: K) -> bool
23 where
24 K: Borrow<Self::Term>;
25}
26
27/// A document where the frequencies of each term is already calculated.
28pub trait ProcessedDocument: Document {
29 /// Returns the number of times a (non-normalized) term exists
30 /// within the document.
31 fn term_frequency<K>(&self, term: K) -> usize
32 where
33 K: Borrow<Self::Term>;
34
35 /// Returns the term with the highest frequency, or tied for the highest
36 /// frequency.
37 fn max(&self) -> Option<&Self::Term>;
38}
39
40/// A document that can be expanded to a collection of terms.
41pub trait ExpandableDocument<'a>: Document
42where
43 <Self as Document>::Term: 'a,
44{
45 /// The type of iterator that this implementor returns.
46 type TermIterator: Iterator<Item = &'a Self::Term>;
47
48 /// An iterator over the terms in the document.
49 fn terms(&self) -> Self::TermIterator;
50}
51
52impl<D, T> NaiveDocument for D
53where
54 D: ProcessedDocument<Term = T>,
55{
56 #[inline]
57 fn term_exists<K>(&self, term: K) -> bool
58 where
59 K: Borrow<T>,
60 {
61 self.term_frequency(term) > 0
62 }
63}
64
65/// A strategy to calculate a weighted or unweighted term frequency (tf)
66/// score of a term from a document.
67pub trait Tf<T>
68where
69 T: NaiveDocument,
70{
71 /// Returns the weighted or unweighted term frequency (tf) for a single
72 /// term within a document.
73 fn tf<K>(term: K, doc: &T) -> f64
74 where
75 K: Borrow<T::Term>;
76}
77
78/// A strategy to calculate a weighted or unweighted inverse document frequency
79/// (idf) for a single term within a corpus of documents.
80pub trait Idf<T>
81where
82 T: NaiveDocument,
83{
84 /// Returns the weighted or unweighted inverse document frequency (idf)
85 /// for a single term within a corpus of documents.
86 fn idf<'a, I, K>(term: K, docs: I) -> f64
87 where
88 I: Iterator<Item = &'a T>,
89 K: Borrow<T::Term>,
90 T: 'a;
91}
92
/// A strategy that uses a normalization factor.
pub trait NormalizationFactor {
    /// Returns a normalization factor.
    ///
    /// This is an associated function, so the value is chosen by the
    /// implementing type rather than by an instance.
    fn factor() -> f64;
}

/// A strategy that uses a smoothing factor.
pub trait SmoothingFactor {
    /// Returns a smoothing factor.
    ///
    /// This is an associated function, so the value is chosen by the
    /// implementing type rather than by an instance.
    fn factor() -> f64;
}

105/// Trait to create a strategy to calculate a tf-idf.
106pub trait TfIdf<T>
107where
108 T: NaiveDocument,
109{
110 /// The tf weighting scheme.
111 type Tf: Tf<T>;
112
113 /// The idf weighting scheme.
114 type Idf: Idf<T>;
115
116 /// Calculates the tf-idf using the two weighting schemes chosen.
117 fn tfidf<'a, K, I>(term: K, doc: &T, docs: I) -> f64
118 where
119 I: Iterator<Item = &'a T>,
120 K: Borrow<T::Term>,
121 T: 'a,
122 {
123 Self::Tf::tf(term.borrow(), doc) * Self::Idf::idf(term.borrow(), docs)
124 }
125}