tf_idf_vectorizer/vectorizer/evaluate/
query.rs

1use crate::{TokenFrequency, utils::datastruct::map::{IndexMap, IndexSet}, vectorizer::KeyRc};
2
/// Evaluates boolean token expressions ([`OrderDocIdx`]) against a reverse
/// index and produces a [`Query`].
///
/// The builder only borrows the index structures; it does not own them.
pub struct QueryBuilder<'a, K> {
    /// Reverse index: for each token, the document keys stored under it.
    /// Those keys are resolved to positions through `documents`.
    pub token_dim_rev_index: &'a IndexMap<Box<str>, Vec<KeyRc<K>>>,
    /// Set of document keys. A key's position in this set is the document
    /// index reported in query results, and its length bounds `Not`.
    pub documents: &'a IndexSet<KeyRc<K>>,
    /// Initialized empty by `new`; not read by any code in this file.
    pub scan_doc_indices: Vec<usize>,
}
8
/// Boolean expression tree over tokens, evaluated by `QueryBuilder` into a
/// sorted list of document indices.
pub enum OrderDocIdx {
    /// Leaf: the documents the reverse index lists for this token.
    Nop(Box<str>),
    /// Complement: every document index in `0..documents.len()` that the
    /// inner expression does not select.
    Not(Box<OrderDocIdx>),
    /// Intersection: documents selected by both sub-expressions.
    And(Box<OrderDocIdx>, Box<OrderDocIdx>),
    /// Union: documents selected by at least one sub-expression.
    Or(Box<OrderDocIdx>, Box<OrderDocIdx>),
}
15
/// Result of evaluating an [`OrderDocIdx`] expression with `QueryBuilder`.
pub struct Query {
    /// Indices (positions in the builder's `documents` set) of the documents
    /// the expression selected.
    pub(crate) doc_indices: Vec<usize>,
    /// Token frequencies attached to the query: either collected from the
    /// expression's leaf tokens or supplied by the caller, depending on which
    /// `build*` method produced this value.
    pub(crate) token_freq: TokenFrequency,
}
20
21impl<'a, K> QueryBuilder<'a, K>
22where
23    K: Clone + Eq + std::hash::Hash,
24{
25    pub fn new(
26        token_dim_rev_index: &'a IndexMap<Box<str>, Vec<KeyRc<K>>>,
27        documents: &'a IndexSet<KeyRc<K>>,
28    ) -> Self {
29        Self {
30            token_dim_rev_index,
31            documents,
32            scan_doc_indices: Vec::new(),
33        }
34    }
35
36    fn list_of_contains_docs(&self, token: &str) -> Vec<usize> {
37        self.token_dim_rev_index.get(token).map(|keys| {
38            keys.iter().filter_map(|key| {
39                self.documents.get_index(key)
40            }).collect::<Vec<usize>>()
41        }).unwrap_or_else(Vec::new)
42    }
43
44    pub fn token(&self, token: &str) -> OrderDocIdx {
45        OrderDocIdx::Nop(Box::from(token))
46    }
47
48    pub fn not(&self, order: OrderDocIdx) -> OrderDocIdx {
49        OrderDocIdx::Not(Box::new(order))
50    }
51
52    pub fn and(&self, left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
53        OrderDocIdx::And(Box::new(left), Box::new(right))
54    }
55
56    pub fn or(&self, left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
57        OrderDocIdx::Or(Box::new(left), Box::new(right))
58    }
59    fn build_ref(&self, order: OrderDocIdx, freq: &mut TokenFrequency) -> Vec<usize> {
60        match order {
61            OrderDocIdx::Nop(token) => {
62                freq.add_token(token.as_ref());
63                // token is Box<str>, so we need to look up the doc indices
64                let mut indices = self.list_of_contains_docs(&token);
65                indices.sort_unstable();
66                indices.dedup();
67                indices
68            }
69            OrderDocIdx::Not(inner) => {
70                let inner_indices = self.build_ref(*inner, freq);
71                let mut result = Vec::new();
72                let mut inner_iter = inner_indices.into_iter().peekable();
73                let mut next_inner = inner_iter.peek().copied();
74                for idx in 0..self.documents.len() {
75                    match next_inner {
76                        Some(inner_idx) if inner_idx == idx => {
77                            inner_iter.next();
78                            next_inner = inner_iter.peek().copied();
79                        }
80                        _ => result.push(idx),
81                    }
82                }
83                result
84            }
85            OrderDocIdx::And(left, right) => {
86                let left_indices = self.build_ref(*left, freq);
87                let right_indices = self.build_ref(*right, freq);
88                let mut result = Vec::new();
89                let mut l = 0;
90                let mut r = 0;
91                while l < left_indices.len() && r < right_indices.len() {
92                    match left_indices[l].cmp(&right_indices[r]) {
93                        std::cmp::Ordering::Less => l += 1,
94                        std::cmp::Ordering::Greater => r += 1,
95                        std::cmp::Ordering::Equal => {
96                            result.push(left_indices[l]);
97                            l += 1;
98                            r += 1;
99                        }
100                    }
101                }
102                result
103            }
104            OrderDocIdx::Or(left, right) => {
105                let left_indices = self.build_ref(*left, freq);
106                let right_indices = self.build_ref(*right, freq);
107                let mut result = Vec::with_capacity(left_indices.len() + right_indices.len());
108                let mut l = 0;
109                let mut r = 0;
110                while l < left_indices.len() && r < right_indices.len() {
111                    match left_indices[l].cmp(&right_indices[r]) {
112                        std::cmp::Ordering::Less => {
113                            result.push(left_indices[l]);
114                            l += 1;
115                        }
116                        std::cmp::Ordering::Greater => {
117                            result.push(right_indices[r]);
118                            r += 1;
119                        }
120                        std::cmp::Ordering::Equal => {
121                            result.push(left_indices[l]);
122                            l += 1;
123                            r += 1;
124                        }
125                    }
126                }
127                while l < left_indices.len() {
128                    result.push(left_indices[l]);
129                    l += 1;
130                }
131                while r < right_indices.len() {
132                    result.push(right_indices[r]);
133                    r += 1;
134                }
135                result
136            }
137        }
138    }
139
140    pub fn build<F>(self, order: F) -> Query 
141    where 
142        F: FnOnce(&QueryBuilder<'a, K>) -> OrderDocIdx
143    {
144        let mut freq = TokenFrequency::new();
145        let order = order(&self);
146        let doc_indices = self.build_ref(order, &mut freq);
147        Query {
148            doc_indices,
149            token_freq: freq,
150        }
151    }
152
153    pub fn build_with_freq<F>(self, order: F, freq: TokenFrequency) -> Query 
154    where 
155        F: FnOnce(&QueryBuilder<'a, K>) -> OrderDocIdx
156    {
157        let doc_indices = self.build_ref(order(&self), &mut freq.clone()); // this freq is not used
158        Query {
159            doc_indices,
160            token_freq: freq,
161        }
162    }
163    
164    pub fn build_with_order(self, order: OrderDocIdx) -> Query 
165    where 
166        K: Clone + Eq + std::hash::Hash,
167    {
168        let mut freq = TokenFrequency::new();
169        let doc_indices = self.build_ref(order, &mut freq);
170        Query {
171            doc_indices,
172            token_freq: freq,
173        }
174    }
175
176    pub fn build_with_order_and_freq(self, order: OrderDocIdx, freq: TokenFrequency) -> Query 
177    where 
178        K: Clone + Eq + std::hash::Hash,
179    {
180        let doc_indices = self.build_ref(order, &mut freq.clone()); // this freq is not used
181        Query {
182            doc_indices,
183            token_freq: freq,
184        }
185    }
186}
187
188pub mod q {
189    use crate::vectorizer::evaluate::query::OrderDocIdx;
190
191    pub fn token(token: &str) -> OrderDocIdx {
192        OrderDocIdx::Nop(Box::from(token))
193    }
194
195    pub fn not(order: OrderDocIdx) -> OrderDocIdx {
196        OrderDocIdx::Not(Box::new(order))
197    }
198
199    pub fn and(left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
200        OrderDocIdx::And(Box::new(left), Box::new(right))
201    }
202
203    pub fn or(left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
204        OrderDocIdx::Or(Box::new(left), Box::new(right))
205    }
206}