tfidf/
tf.rs

1// Copyright 2016 rust-tfidf Developers
2//
3// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
4// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
5// http://opensource.org/licenses/MIT>, at your option. This file may not be
6// copied, modified, or distributed except according to those terms.
7
8use std::borrow::Borrow;
9
10use prelude::{NaiveDocument, NormalizationFactor, ProcessedDocument, Tf};
11
12/// Binary weighting scheme for TF. If the document contains the term, returns 1,
13/// otherwise returns 0.
14#[derive(Copy, Clone)]
15pub struct BinaryTf;
16
17impl<T> Tf<T> for BinaryTf
18where
19  T: NaiveDocument,
20{
21  #[inline]
22  fn tf<K>(term: K, doc: &T) -> f64
23  where
24    K: Borrow<T::Term>,
25  {
26    if doc.term_exists(term) {
27      1f64
28    } else {
29      0f64
30    }
31  }
32}
33
34/// Raw frequency weighting scheme for TF. Returns the number of times a term occurs
35/// in the document.
36#[derive(Copy, Clone)]
37pub struct RawFrequencyTf(f64);
38
39impl<T> Tf<T> for RawFrequencyTf
40where
41  T: ProcessedDocument,
42{
43  #[inline]
44  fn tf<K>(term: K, doc: &T) -> f64
45  where
46    K: Borrow<T::Term>,
47  {
48    doc.term_frequency(term) as f64
49  }
50}
51
52/// Log normalized weighting scheme for TF. Computes `1 + log (f)` where `f` is the
53/// frequency of the term in the document.
54#[derive(Copy, Clone)]
55pub struct LogNormalizationTf;
56
57impl<T> Tf<T> for LogNormalizationTf
58where
59  T: ProcessedDocument,
60{
61  #[inline]
62  fn tf<K>(term: K, doc: &T) -> f64
63  where
64    K: Borrow<T::Term>,
65  {
66    1f64 + (doc.term_frequency(term) as f64).ln()
67  }
68}
69
70/// Double normalized weighting scheme for TF based on a factor, `K`.
71///
72/// # Example
73///
74/// To implement a custom Tf strategy, where the `K` factor is constant:
75///
76/// ```rust
77/// use tfidf::{Tf, NormalizationFactor};
78/// use tfidf::tf::{DoubleKNormalizationTf};
79///
80/// struct DoubleThirdNormalizationTf;
81///
82/// impl NormalizationFactor for DoubleThirdNormalizationTf {
83///   fn factor() -> f64 { 0.3f64 }
84/// }
85///
86/// impl DoubleKNormalizationTf for DoubleThirdNormalizationTf { }
87/// ```
88pub trait DoubleKNormalizationTf: NormalizationFactor {}
89
90impl<T, S> Tf<T> for S
91where
92  S: DoubleKNormalizationTf,
93  T: ProcessedDocument,
94{
95  #[inline]
96  fn tf<K>(term: K, doc: &T) -> f64
97  where
98    K: Borrow<T::Term>,
99  {
100    let max = match doc.max() {
101      Some(m) => doc.term_frequency(m) as f64,
102      None => 1f64,
103    };
104
105    // K + ((1 - K) * (f / max f))
106    S::factor() + ((1f64 - S::factor()) * ((doc.term_frequency(term) as f64) / max))
107  }
108}
109
110/// Double normalized weighting scheme for TF based on a factor, `K = 0.5`.
111#[derive(Copy, Clone)]
112pub struct DoubleHalfNormalizationTf;
113
114impl NormalizationFactor for DoubleHalfNormalizationTf {
115  #[inline]
116  fn factor() -> f64 {
117    0.5f64
118  }
119}
120
121impl DoubleKNormalizationTf for DoubleHalfNormalizationTf {}