recoreco/
stats.rs

//! ## Mapping between original string identifiers and internal indexes
//!
//! Many interaction datasets contain string identifiers for users and items. Internally,
//! however, we want to work with consecutive integer ids for memory efficiency. We therefore
//! keep track of the string identifiers of users and items, as well as the overall number of
//! interactions, in order to map back and forth between the two representations.
//!
8/**
9 * RecoReco
10 * Copyright (C) 2018 Sebastian Schelter
11 *
12 * This program is free software: you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation, either version 3 of the License, or
15 * (at your option) any later version.
16 *
17 * This program is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 * GNU General Public License for more details.
21 *
22 * You should have received a copy of the GNU General Public License
23 * along with this program. If not, see <http://www.gnu.org/licenses/>.
24 */
25
26extern crate fnv;
27extern crate csv;
28
29use fnv::FnvHashMap;
30
31/// Mapping from original string based identifiers to internal `u32` indexes.
32pub struct DataDictionary {
33    user_dict: FnvHashMap<String, u32>,
34    item_dict: FnvHashMap<String, u32>,
35    num_interactions: u64,
36}
37
38impl DataDictionary {
39
40    /// Returns the overall number of users in the dataset.
41    pub fn num_users(&self) -> usize {
42        self.user_dict.len()
43    }
44
45    /// Returns the overall number of items in the dataset.
46    pub fn num_items(&self) -> usize {
47        self.item_dict.len()
48    }
49
50    /// Returns the overall number of interactions in the dataset.
51    pub fn num_interactions(&self) -> u64 {
52        self.num_interactions
53    }
54
55    /// Returns the internal index for the user with the string identifier `name`
56    pub fn user_index(&self, name: &str) -> &u32 {
57        &self.user_dict[name]
58    }
59
60    /// Returns the internal index for the item with the string identifier `name`
61    pub fn item_index(&self, name: &str) -> &u32 {
62        &self.item_dict[name]
63    }
64
65    /// Builds up a `DataDictionary` by consuming an iterator over string tuples representing
66    /// user-item interactions. We assume that the first string in the tuple identifies a user and
67    /// the second string identifies an item
68    pub fn from_owned<T>(interactions: T) -> Self
69    where
70        T: Iterator<Item = (String, String)>
71    {
72        let mut user_index: u32 = 0;
73        let mut user_dict: FnvHashMap<String, u32> = FnvHashMap::default();
74
75        let mut item_index: u32 = 0;
76        let mut item_dict: FnvHashMap<String, u32> = FnvHashMap::default();
77
78        let mut num_interactions: u64 = 0;
79
80        for (user, item) in interactions {
81
82            user_dict.entry(user).or_insert_with(|| {
83                let current_user_index = user_index;
84                user_index += 1;
85                current_user_index
86            });
87
88            item_dict.entry(item).or_insert_with(|| {
89                let current_item_index = item_index;
90                item_index += 1;
91                current_item_index
92            });
93
94            num_interactions += 1;
95        }
96
97        DataDictionary { user_dict, item_dict, num_interactions }
98    }
99
100    /// Builds up a `DataDictionary` by reading an iterator over references to string tuples
101    /// representing user-item interactions. We assume that the first string in the tuple
102    /// identifies a user and the second string identifies an item
103    pub fn from<'a,T>(interactions: T) -> DataDictionary
104    where
105        T: Iterator<Item = &'a(String, String)>
106    {
107
108        let owned = interactions
109            .map(|(user, item)| (user.to_owned(), item.to_owned()));
110
111        DataDictionary::from_owned(owned)
112    }
113}
114
115/// Builds up a `DataDictionary` by reading an iterator over string tuples representing
116/// user-item interactions. We assume that the first string in the tuple identifies a user and
117/// the second string identifies an item
118impl <T> From<T> for DataDictionary
119where
120    T: Iterator<Item = (String, String)>
121{
122    fn from(iter: T) -> Self {
123        let mut user_index: u32 = 0;
124        let mut user_dict: FnvHashMap<String, u32> = FnvHashMap::default();
125
126        let mut item_index: u32 = 0;
127        let mut item_dict: FnvHashMap<String, u32> = FnvHashMap::default();
128
129        let mut num_interactions: u64 = 0;
130
131        for (user, item) in iter {
132
133            user_dict.entry(user).or_insert_with(|| {
134                let current_user_index = user_index;
135                user_index += 1;
136                current_user_index
137            });
138
139            item_dict.entry(item).or_insert_with(|| {
140                let current_item_index = item_index;
141                item_index += 1;
142                current_item_index
143            });
144
145            num_interactions += 1;
146        }
147
148        DataDictionary { user_dict, item_dict, num_interactions }
149    }
150}
151
152/// Allows to remap the internal item indexes to the original string identifiers
153pub struct Renaming {
154    item_names: FnvHashMap<u32, String>,
155}
156
157impl Renaming {
158    /// Return original string identifier for the internal index `item_index`
159    pub fn item_name(&self, item_index: u32) -> &str {
160        &self.item_names[&item_index]
161    }
162}
163
164/// Consume a DataDictionary to produce a Renaming for the reverse mapping
165impl From<DataDictionary> for Renaming {
166
167    fn from(data_dict: DataDictionary) -> Self {
168        let item_names: FnvHashMap<u32, String> = data_dict
169            .item_dict
170            .into_iter()
171            .map(|(name, item_id)| (item_id, name))
172            .collect(); // Checked that size_hint() gives correct bounds
173
174        Renaming { item_names }
175    }
176}
177
178
#[cfg(test)]
mod tests {

    extern crate fnv;

    use fnv::FnvHashMap;
    use stats::{DataDictionary, Renaming};

    /// Four interactions between three distinct users and two distinct items.
    fn sample_interactions() -> Vec<(String, String)> {
        [
            ("user_a", "item_a"),
            ("user_a", "item_b"),
            ("user_b", "item_b"),
            ("user_c", "item_a"),
        ]
        .iter()
        .map(|&(user, item)| (String::from(user), String::from(item)))
        .collect()
    }

    /// Checks the counts and index assignments expected for `sample_interactions()`.
    fn assert_sample_dictionary(data_dict: &DataDictionary) {
        assert_eq!(data_dict.num_users(), 3);
        assert_eq!(data_dict.num_items(), 2);
        assert_eq!(data_dict.num_interactions(), 4);

        assert_eq!(*data_dict.user_index("user_a"), 0);
        assert_eq!(*data_dict.user_index("user_c"), 2);

        assert_eq!(*data_dict.item_index("item_a"), 0);
        assert_eq!(*data_dict.item_index("item_b"), 1);
    }

    #[test]
    fn dict_from_tuple_iterator() {
        let interactions = sample_interactions();

        let data_dict = DataDictionary::from(interactions.iter());

        assert_sample_dictionary(&data_dict);

        // Make sure we don't lose ownership of interactions
        assert_eq!(interactions.len(), 4);
    }

    #[test]
    fn dict_from_owned_tuple_iterator() {
        let interactions = sample_interactions();

        let data_dict = DataDictionary::from_owned(interactions.into_iter());

        assert_sample_dictionary(&data_dict);
    }

    #[test]
    fn renaming_from_dict() {
        let mut user_dict: FnvHashMap<String, u32> = FnvHashMap::default();
        user_dict.insert(String::from("user_a"), 0);
        user_dict.insert(String::from("user_b"), 1);

        let mut item_dict: FnvHashMap<String, u32> = FnvHashMap::default();
        item_dict.insert(String::from("item_a"), 0);
        item_dict.insert(String::from("item_b"), 1);
        item_dict.insert(String::from("item_c"), 2);

        let data_dict = DataDictionary { user_dict, item_dict, num_interactions: 10 };

        let renaming = Renaming::from(data_dict);

        assert_eq!(renaming.item_name(0), "item_a");
        assert_eq!(renaming.item_name(1), "item_b");
        assert_eq!(renaming.item_name(2), "item_c");
    }
}