ducky_learn/feature_extraction.rs
1use std::collections::HashSet;
2
/// Converts a collection of text documents into a matrix of token counts.
///
/// The counts are stored densely: every document becomes one `Vec<f64>` with an
/// entry for each learned feature, including explicit `0.0` entries for words
/// the document does not contain.
///
/// # Fields
/// `feature_names`: A vector storing the unique words found across all documents,
/// in the order in which they were first seen. These are the 'features' that the
/// model has learned; the position of a word here is its column in the count matrix.
///
/// # Examples
///
/// ```
/// use ducky_learn::feature_extraction::CountVectorizer;
///
/// let mut count_vector = CountVectorizer::new();
/// let document = vec![
///     "hello this is a test".to_string(),
///     "this is another test".to_string(),
/// ];
/// count_vector.fit_transform(&document);
/// assert_eq!(count_vector.feature_names, vec!["hello", "this", "is", "a", "test", "another"]);
/// ```
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CountVectorizer {
    /// Learned vocabulary; index `i` names column `i` of every count row.
    pub feature_names: Vec<String>,
}
26
27impl CountVectorizer {
28 /// Creates a new instance of `CountVectorizer` with an empty list of feature names.
29 ///
30 /// # Returns
31 /// A new instance of `CountVectorizer`.
32 ///
33 /// # Examples
34 ///
35 /// ```
36 /// use ducky_learn::feature_extraction::CountVectorizer;
37 ///
38 /// let count_vector = CountVectorizer::new();
39 /// assert_eq!(count_vector.feature_names, Vec::<String>::new());
40 /// ```
41 pub fn new() -> Self {
42 Self {
43 feature_names: Vec::new(),
44 }
45 }
46
47 /// Fits the model according to the given training data and
48 /// then transforms the data into a matrix of token counts.
49 ///
50 /// This process involves learning the 'vocabulary' from the input data (i.e.,
51 /// all unique words across all documents) and then representing each document
52 /// as a vector of counts of the words in the learned vocabulary.
53 ///
54 /// # Arguments
55 /// * `input_document` - A vector of strings where each string represents a document.
56 ///
57 /// # Returns
58 /// A vector of vectors, where each inner vector represents a document and contains
59 /// the token counts for each word in the learned vocabulary.
60 ///
61 /// # Examples
62 ///
63 /// ```
64 /// use ducky_learn::feature_extraction::CountVectorizer;
65 ///
66 /// let mut count_vector = CountVectorizer::new();
67 /// let document = vec![
68 /// "hello this is a test".to_string(),
69 /// "this is another test".to_string(),
70 /// ];
71 /// let transformed_document = count_vector.fit_transform(&document);
72 /// assert_eq!(transformed_document, vec![
73 /// vec![1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
74 /// vec![0.0, 1.0, 1.0, 0.0, 1.0, 1.0],
75 /// ]);
76 /// ```
77 pub fn fit_transform(&mut self, input_document: &Vec<String>) -> Vec<Vec<f64>> {
78 // Adds words to the feature_names
79 for sentence in input_document {
80 for word in sentence.split(" ") {
81 let word = word.to_string();
82 if !self.feature_names.contains(&word) {
83 self.feature_names.push(word);
84 }
85 }
86 }
87
88 self.transform(input_document)
89 }
90
91 /// Transforms the data into a matrix of token counts using the learned vocabulary.
92 ///
93 /// This process involves representing each document as a vector of counts of the
94 /// words in the learned vocabulary. Note that this method does not learn the vocabulary
95 /// and assumes that `fit_transform` has already been called.
96 ///
97 /// # Arguments
98 /// * `input_document` - A vector of strings where each string represents a document.
99 ///
100 /// # Returns
101 /// A vector of vectors, where each inner vector represents a document and contains
102 /// the token counts for each word in the learned vocabulary.
103 ///
104 /// # Examples
105 ///
106 /// ```
107 /// use ducky_learn::feature_extraction::CountVectorizer;
108 ///
109 /// let mut count_vector = CountVectorizer::new();
110 /// let document = vec![
111 /// "hello this is a test".to_string(),
112 /// "this is another test".to_string(),
113 /// ];
114 /// count_vector.fit_transform(&document);
115 /// let new_document = vec![
116 /// "this another test".to_string(),
117 /// ];
118 /// let transformed_new_document = count_vector.transform(&new_document);
119 /// assert_eq!(transformed_new_document, vec![
120 /// vec![0.0, 1.0, 0.0, 0.0, 1.0, 1.0],
121 /// ]);
122 /// ```
123 pub fn transform(&self, input_document: &Vec<String>) -> Vec<Vec<f64>> {
124 let mut count_vector: Vec<Vec<f64>> = Vec::with_capacity(input_document.len());
125
126 for (idx, sentence) in input_document.iter().enumerate() {
127 count_vector.push(zeros(self.feature_names.len()));
128 for word in sentence.split(" ") {
129 let word = word.to_string();
130 let position_of_word = self.feature_names.iter().position(|x| x == &word).unwrap();
131 count_vector[idx][position_of_word] += 1f64;
132 }
133 }
134
135 count_vector
136 }
137}
138
/// Helper function that creates a new vector filled with zeros.
///
/// Used to initialise the count row of a document before its tokens are tallied.
///
/// # Arguments
/// * `size` - The desired length of the vector.
///
/// # Returns
/// A `Vec<f64>` of length `size` in which every element is `0.0`.
fn zeros(size: usize) -> Vec<f64> {
    vec![0.0; size]
}
153
#[cfg(test)]
mod feature_extraction_tests {
    use super::*;

    /// Turns string literals into the owned documents the vectorizer consumes.
    fn owned(lines: &[&str]) -> Vec<String> {
        lines.iter().map(|line| line.to_string()).collect()
    }

    /// Fitting a four-document corpus yields the vocabulary in first-seen order
    /// and one count row per document.
    #[test]
    fn test_count_vector_fit_transform() {
        let corpus = owned(&[
            "hello this is ducky duck",
            "chris don't wear wigs",
            "ducks taste nice",
            "duck duck goose",
        ]);

        let expected_names = owned(&[
            "hello", "this", "is", "ducky", "duck", "chris", "don't", "wear", "wigs", "ducks",
            "taste", "nice", "goose",
        ]);

        let expected_counts: Vec<Vec<f64>> = vec![
            vec![1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
            vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0],
            vec![0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        ];

        let mut vectorizer = CountVectorizer::new();
        let actual_counts = vectorizer.fit_transform(&corpus);

        assert_eq!(actual_counts, expected_counts);
        assert_eq!(vectorizer.feature_names, expected_names);
    }

    /// An empty document contributes the empty string as its single token.
    #[test]
    fn test_empty_string() {
        let corpus = owned(&[""]);
        let expected_names = owned(&[""]);
        let expected_counts: Vec<Vec<f64>> = vec![vec![1.0]];

        let mut vectorizer = CountVectorizer::new();
        let actual_counts = vectorizer.fit_transform(&corpus);

        assert_eq!(actual_counts, expected_counts);
        assert_eq!(vectorizer.feature_names, expected_names);
    }
}