tf_idf_vectorizer/vectorizer/evaluate/
query.rs1use crate::{TokenFrequency, utils::datastruct::map::{IndexMap, IndexSet}, vectorizer::KeyRc};
2
3pub struct QueryBuilder<'a, K> {
4 pub token_dim_rev_index: &'a IndexMap<Box<str>, Vec<KeyRc<K>>>,
5 pub documents: &'a IndexSet<KeyRc<K>>,
6 pub scan_doc_indices: Vec<usize>,
7}
8
/// Boolean expression tree describing which documents a query selects.
///
/// Leaves name a single token; inner nodes combine sub-expressions.
pub enum OrderDocIdx {
    /// Leaf: documents listed under this token.
    Nop(Box<str>),
    /// Negation: documents *not* selected by the inner expression.
    Not(Box<OrderDocIdx>),
    /// Conjunction: documents selected by both sub-expressions.
    And(Box<OrderDocIdx>, Box<OrderDocIdx>),
    /// Disjunction: documents selected by at least one sub-expression.
    Or(Box<OrderDocIdx>, Box<OrderDocIdx>),
}
15
16pub struct Query {
17 pub(crate) doc_indices: Vec<usize>,
18 pub(crate) token_freq: TokenFrequency,
19}
20
21impl<'a, K> QueryBuilder<'a, K>
22where
23 K: Clone + Eq + std::hash::Hash,
24{
25 pub fn new(
26 token_dim_rev_index: &'a IndexMap<Box<str>, Vec<KeyRc<K>>>,
27 documents: &'a IndexSet<KeyRc<K>>,
28 ) -> Self {
29 Self {
30 token_dim_rev_index,
31 documents,
32 scan_doc_indices: Vec::new(),
33 }
34 }
35
36 fn list_of_contains_docs(&self, token: &str) -> Vec<usize> {
37 self.token_dim_rev_index.get(token).map(|keys| {
38 keys.iter().filter_map(|key| {
39 self.documents.get_index(key)
40 }).collect::<Vec<usize>>()
41 }).unwrap_or_else(Vec::new)
42 }
43
44 pub fn token(&self, token: &str) -> OrderDocIdx {
45 OrderDocIdx::Nop(Box::from(token))
46 }
47
48 pub fn not(&self, order: OrderDocIdx) -> OrderDocIdx {
49 OrderDocIdx::Not(Box::new(order))
50 }
51
52 pub fn and(&self, left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
53 OrderDocIdx::And(Box::new(left), Box::new(right))
54 }
55
56 pub fn or(&self, left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
57 OrderDocIdx::Or(Box::new(left), Box::new(right))
58 }
59 fn build_ref(&self, order: OrderDocIdx, freq: &mut TokenFrequency) -> Vec<usize> {
60 match order {
61 OrderDocIdx::Nop(token) => {
62 freq.add_token(token.as_ref());
63 let mut indices = self.list_of_contains_docs(&token);
65 indices.sort_unstable();
66 indices.dedup();
67 indices
68 }
69 OrderDocIdx::Not(inner) => {
70 let inner_indices = self.build_ref(*inner, freq);
71 let mut result = Vec::new();
72 let mut inner_iter = inner_indices.into_iter().peekable();
73 let mut next_inner = inner_iter.peek().copied();
74 for idx in 0..self.documents.len() {
75 match next_inner {
76 Some(inner_idx) if inner_idx == idx => {
77 inner_iter.next();
78 next_inner = inner_iter.peek().copied();
79 }
80 _ => result.push(idx),
81 }
82 }
83 result
84 }
85 OrderDocIdx::And(left, right) => {
86 let left_indices = self.build_ref(*left, freq);
87 let right_indices = self.build_ref(*right, freq);
88 let mut result = Vec::new();
89 let mut l = 0;
90 let mut r = 0;
91 while l < left_indices.len() && r < right_indices.len() {
92 match left_indices[l].cmp(&right_indices[r]) {
93 std::cmp::Ordering::Less => l += 1,
94 std::cmp::Ordering::Greater => r += 1,
95 std::cmp::Ordering::Equal => {
96 result.push(left_indices[l]);
97 l += 1;
98 r += 1;
99 }
100 }
101 }
102 result
103 }
104 OrderDocIdx::Or(left, right) => {
105 let left_indices = self.build_ref(*left, freq);
106 let right_indices = self.build_ref(*right, freq);
107 let mut result = Vec::with_capacity(left_indices.len() + right_indices.len());
108 let mut l = 0;
109 let mut r = 0;
110 while l < left_indices.len() && r < right_indices.len() {
111 match left_indices[l].cmp(&right_indices[r]) {
112 std::cmp::Ordering::Less => {
113 result.push(left_indices[l]);
114 l += 1;
115 }
116 std::cmp::Ordering::Greater => {
117 result.push(right_indices[r]);
118 r += 1;
119 }
120 std::cmp::Ordering::Equal => {
121 result.push(left_indices[l]);
122 l += 1;
123 r += 1;
124 }
125 }
126 }
127 while l < left_indices.len() {
128 result.push(left_indices[l]);
129 l += 1;
130 }
131 while r < right_indices.len() {
132 result.push(right_indices[r]);
133 r += 1;
134 }
135 result
136 }
137 }
138 }
139
140 pub fn build<F>(self, order: F) -> Query
141 where
142 F: FnOnce(&QueryBuilder<'a, K>) -> OrderDocIdx
143 {
144 let mut freq = TokenFrequency::new();
145 let order = order(&self);
146 let doc_indices = self.build_ref(order, &mut freq);
147 Query {
148 doc_indices,
149 token_freq: freq,
150 }
151 }
152
153 pub fn build_with_freq<F>(self, order: F, freq: TokenFrequency) -> Query
154 where
155 F: FnOnce(&QueryBuilder<'a, K>) -> OrderDocIdx
156 {
157 let doc_indices = self.build_ref(order(&self), &mut freq.clone()); Query {
159 doc_indices,
160 token_freq: freq,
161 }
162 }
163
164 pub fn build_with_order(self, order: OrderDocIdx) -> Query
165 where
166 K: Clone + Eq + std::hash::Hash,
167 {
168 let mut freq = TokenFrequency::new();
169 let doc_indices = self.build_ref(order, &mut freq);
170 Query {
171 doc_indices,
172 token_freq: freq,
173 }
174 }
175
176 pub fn build_with_order_and_freq(self, order: OrderDocIdx, freq: TokenFrequency) -> Query
177 where
178 K: Clone + Eq + std::hash::Hash,
179 {
180 let doc_indices = self.build_ref(order, &mut freq.clone()); Query {
182 doc_indices,
183 token_freq: freq,
184 }
185 }
186}
187
188pub mod q {
189 use crate::vectorizer::evaluate::query::OrderDocIdx;
190
191 pub fn token(token: &str) -> OrderDocIdx {
192 OrderDocIdx::Nop(Box::from(token))
193 }
194
195 pub fn not(order: OrderDocIdx) -> OrderDocIdx {
196 OrderDocIdx::Not(Box::new(order))
197 }
198
199 pub fn and(left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
200 OrderDocIdx::And(Box::new(left), Box::new(right))
201 }
202
203 pub fn or(left: OrderDocIdx, right: OrderDocIdx) -> OrderDocIdx {
204 OrderDocIdx::Or(Box::new(left), Box::new(right))
205 }
206}