// Copyright 2016 rust-tfidf Developers
//
// Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
// http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
// http://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

//! Library to calculate TF-IDF (Term Frequency - Inverse Document Frequency)
//! for generic documents. The library provides strategies to act on objects
//! that implement certain document traits (`NaiveDocument`, `ProcessedDocument`,
//! `ExpandableDocument`).
//!
//! For more information on the strategies that were implemented, check out
//! [Wikipedia](http://en.wikipedia.org/wiki/Tf%E2%80%93idf).
//!
//! # Document Types
//!
//! A document is defined as a collection of terms. The documents don't make
//! assumptions about the term types (the terms are not normalized in any way).
//!
//! These document types are of my design. The terminology isn't standard, but
//! they are fairly straightforward to understand.
//!
//! * `NaiveDocument` - A document is 'naive' if it only knows whether a term is
//! contained within it or not, but does not know HOW MANY instances
//! of the term it contains.
//!
//! * `ProcessedDocument` - A document is 'processed' if it knows how many
//! instances of each term are contained within it.
//!
//! * `ExpandableDocument` - A document is 'expandable' if it provides a way to
//! access each term contained within it.
//!
//! # Example
//!
//! The simplest way to calculate the TfIdf of a document is with the default
//! implementation. Note, the library provides an implementation of
//! `ProcessedDocument` for a `Vec<(T, usize)>`.
//!
//! ```rust
//! use tfidf::{TfIdf, TfIdfDefault};
//!
//! let mut docs = Vec::new();
//! let doc1 = vec![("a", 3), ("b", 2), ("c", 4)];
//! let doc2 = vec![("a", 2), ("d", 5)];
//!
//! docs.push(doc1);
//! docs.push(doc2);
//!
assert_eq!(0f64, TfIdfDefault::tfidf("a", &docs[0], docs.iter())); //! assert!(TfIdfDefault::tfidf("c", &docs[0], docs.iter()) > 0.5); //! ``` //! //! You can also roll your own strategies to calculate tf-idf using some strategies //! included in the library. //! //! ```rust //! use tfidf::{TfIdf, ProcessedDocument}; //! use tfidf::tf::{RawFrequencyTf}; //! use tfidf::idf::{InverseFrequencySmoothIdf}; //! //! #[derive(Copy, Clone)] struct MyTfIdfStrategy; //! //! impl<T> TfIdf<T> for MyTfIdfStrategy where T : ProcessedDocument { //! type Tf = RawFrequencyTf; //! type Idf = InverseFrequencySmoothIdf; //! } //! //! # let mut docs = Vec::new(); //! # let doc1 = vec![("a", 3), ("b", 2), ("c", 4)]; //! # let doc2 = vec![("a", 2), ("d", 5)]; //! //! # docs.push(doc1); //! # docs.push(doc2); //! //! assert!(MyTfIdfStrategy::tfidf("a", &docs[0], docs.iter()) > 0f64); //! assert!(MyTfIdfStrategy::tfidf("c", &docs[0], docs.iter()) > 0f64); //! ``` #![deny(missing_docs)] pub use prelude::{ Document, ExpandableDocument, NaiveDocument, NormalizationFactor, ProcessedDocument, SmoothingFactor, Tf, TfIdf, Idf, }; use std::borrow::Borrow; use std::collections::{BTreeMap, HashMap}; use std::hash::Hash; mod prelude; /// Implementations of different weighting schemes for term frequency (tf). /// For more information about which ones are implemented, check the Wiki /// link in the crate description. pub mod tf; /// Implementations of different weighting schemes for inverse document /// frequency (IDF). For more information about which ones are implemented, /// check the Wiki link in the crate description. pub mod idf; /// Default scheme for calculating tf-idf. 
#[derive(Copy, Clone)] pub struct TfIdfDefault; impl<T> TfIdf<T> for TfIdfDefault where T: ProcessedDocument, { type Tf = tf::DoubleHalfNormalizationTf; type Idf = idf::InverseFrequencyIdf; } impl<T> Document for Vec<(T, usize)> { type Term = T; } impl<T> ProcessedDocument for Vec<(T, usize)> where T: PartialEq, { fn term_frequency<K>(&self, term: K) -> usize where K: Borrow<T>, { match self.iter().find(|&&(ref t, _)| t == term.borrow()) { Some(&(_, c)) => c, None => 0, } } fn max(&self) -> Option<&T> { match self.iter().max_by_key(|&&(_, c)| c) { Some(&(ref t, _)) => Some(t), None => None, } } } impl<T> Document for HashMap<T, usize> { type Term = T; } impl<T> ProcessedDocument for HashMap<T, usize> where T: Eq + Hash, { fn term_frequency<K>(&self, term: K) -> usize where K: Borrow<Self::Term>, { if let Some(v) = self.get(term.borrow()) { *v } else { 0 } } fn max(&self) -> Option<&Self::Term> { self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k) } } impl<T> Document for BTreeMap<T, usize> { type Term = T; } impl<T> ProcessedDocument for BTreeMap<T, usize> where T: Ord, { fn term_frequency<K>(&self, term: K) -> usize where K: Borrow<Self::Term>, { if let Some(v) = self.get(term.borrow()) { *v } else { 0 } } fn max(&self) -> Option<&Self::Term> { self.iter().max_by_key(|(_k, v)| *v).map(|(k, _v)| k) } } #[test] fn tfidf_wiki_example_tests() { let mut docs = Vec::new(); docs.push(vec![("this", 1), ("is", 1), ("a", 2), ("sample", 1)]); docs.push(vec![("this", 1), ("is", 1), ("another", 2), ("example", 3)]); }