// ducky_learn/feature_extraction.rs

use std::collections::{HashMap, HashSet};
2
/// Struct for converting a collection of text documents to a matrix of token counts.
/// This implementation produces a dense count matrix stored as nested `Vec`s.
///
/// # Fields
/// `feature_names`: A vector storing the unique words found across all documents,
///     in the order they were first encountered. These are the 'features' that
///     the model has learned.
///
/// # Examples
///
/// ```
/// use ducky_learn::feature_extraction::CountVectorizer;
///
/// let mut count_vector = CountVectorizer::new();
/// let document = vec![
///     "hello this is a test".to_string(),
///     "this is another test".to_string(),
/// ];
/// count_vector.fit_transform(&document);
/// assert_eq!(count_vector.feature_names, vec!["hello", "this", "is", "a", "test", "another"]);
/// ```
pub struct CountVectorizer {
    pub feature_names: Vec<String>,
}
26
27impl CountVectorizer {
28    /// Creates a new instance of `CountVectorizer` with an empty list of feature names.
29    ///
30    /// # Returns
31    /// A new instance of `CountVectorizer`.
32    ///
33    /// # Examples
34    ///
35    /// ```
36    /// use ducky_learn::feature_extraction::CountVectorizer;
37    ///
38    /// let count_vector = CountVectorizer::new();
39    /// assert_eq!(count_vector.feature_names, Vec::<String>::new());
40    /// ```
41    pub fn new() -> Self {
42        Self {
43            feature_names: Vec::new(),
44        }
45    }
46
47    /// Fits the model according to the given training data and
48    /// then transforms the data into a matrix of token counts.
49    ///
50    /// This process involves learning the 'vocabulary' from the input data (i.e.,
51    /// all unique words across all documents) and then representing each document
52    /// as a vector of counts of the words in the learned vocabulary.
53    ///
54    /// # Arguments
55    /// * `input_document` - A vector of strings where each string represents a document.
56    ///
57    /// # Returns
58    /// A vector of vectors, where each inner vector represents a document and contains
59    /// the token counts for each word in the learned vocabulary.
60    ///
61    /// # Examples
62    ///
63    /// ```
64    /// use ducky_learn::feature_extraction::CountVectorizer;
65    ///
66    /// let mut count_vector = CountVectorizer::new();
67    /// let document = vec![
68    ///     "hello this is a test".to_string(),
69    ///     "this is another test".to_string(),
70    /// ];
71    /// let transformed_document = count_vector.fit_transform(&document);
72    /// assert_eq!(transformed_document, vec![
73    ///     vec![1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
74    ///     vec![0.0, 1.0, 1.0, 0.0, 1.0, 1.0],
75    /// ]);
76    /// ```
77    pub fn fit_transform(&mut self, input_document: &Vec<String>) -> Vec<Vec<f64>> {
78        // Adds words to the feature_names
79        for sentence in input_document {
80            for word in sentence.split(" ") {
81                let word = word.to_string();
82                if !self.feature_names.contains(&word) {
83                    self.feature_names.push(word);
84                }
85            }
86        }
87
88        self.transform(input_document)
89    }
90
91    /// Transforms the data into a matrix of token counts using the learned vocabulary.
92    ///
93    /// This process involves representing each document as a vector of counts of the
94    /// words in the learned vocabulary. Note that this method does not learn the vocabulary
95    /// and assumes that `fit_transform` has already been called.
96    ///
97    /// # Arguments
98    /// * `input_document` - A vector of strings where each string represents a document.
99    ///
100    /// # Returns
101    /// A vector of vectors, where each inner vector represents a document and contains
102    /// the token counts for each word in the learned vocabulary.
103    ///
104    /// # Examples
105    ///
106    /// ```
107    /// use ducky_learn::feature_extraction::CountVectorizer;
108    ///
109    /// let mut count_vector = CountVectorizer::new();
110    /// let document = vec![
111    ///     "hello this is a test".to_string(),
112    ///     "this is another test".to_string(),
113    /// ];
114    /// count_vector.fit_transform(&document);
115    /// let new_document = vec![
116    ///     "this another test".to_string(),
117    /// ];
118    /// let transformed_new_document = count_vector.transform(&new_document);
119    /// assert_eq!(transformed_new_document, vec![
120    ///     vec![0.0, 1.0, 0.0, 0.0, 1.0, 1.0],
121    /// ]);
122    /// ```
123    pub fn transform(&self, input_document: &Vec<String>) -> Vec<Vec<f64>> {
124        let mut count_vector: Vec<Vec<f64>> = Vec::with_capacity(input_document.len());
125
126        for (idx, sentence) in input_document.iter().enumerate() {
127            count_vector.push(zeros(self.feature_names.len()));
128            for word in sentence.split(" ") {
129                let word = word.to_string();
130                let position_of_word = self.feature_names.iter().position(|x| x == &word).unwrap();
131                count_vector[idx][position_of_word] += 1f64;
132            }
133        }
134
135        count_vector
136    }
137}
138
/// Helper function that creates a new vector filled with zeros.
///
/// # Arguments
/// * `size` - The desired size of the vector.
///
/// # Returns
/// A new vector of the given size, filled with zeros.
fn zeros(size: usize) -> Vec<f64> {
    // `vec!` allocates and fills in one step; no manual push loop
    // (the old loop also had an unused index variable).
    vec![0.0; size]
}
153
#[cfg(test)]
mod feature_extraction_tests {
    use super::*;

    /// Fitting several documents learns features in first-seen order and
    /// counts repeated tokens ("duck duck goose" contributes 2.0 to "duck").
    #[test]
    fn test_count_vector_fit_transform() {
        let mut vectorizer = CountVectorizer::new();

        let corpus: Vec<String> = [
            "hello this is ducky duck",
            "chris don't wear wigs",
            "ducks taste nice",
            "duck duck goose",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect();

        let expected_counts = vec![
            vec![1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
            vec![0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
            vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0],
            vec![0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0],
        ];

        let expected_features: Vec<String> = [
            "hello", "this", "is", "ducky", "duck", "chris", "don't", "wear",
            "wigs", "ducks", "taste", "nice", "goose",
        ]
        .iter()
        .map(|s| s.to_string())
        .collect();

        assert_eq!(vectorizer.fit_transform(&corpus), expected_counts);
        assert_eq!(vectorizer.feature_names, expected_features)
    }

    /// Splitting an empty document on ' ' yields one empty token, so the
    /// empty string itself becomes a vocabulary entry with a count of 1.
    #[test]
    fn test_empty_string() {
        let mut vectorizer = CountVectorizer::new();

        let corpus = vec![String::new()];

        assert_eq!(vectorizer.fit_transform(&corpus), vec![vec![1.0]]);
        assert_eq!(vectorizer.feature_names, vec![String::new()])
    }
}
217}