// tf_idf_vectorizer/vectorizer/evaluate/query.rs
use crate::{TermFrequency, utils::datastruct::map::{IndexMap, IndexSet}, vectorizer::KeyRc};
2
/// Boolean expression tree that backs a [`Query`].
///
/// Structural equality and hashing are derived so that queries can be
/// compared in tests or used as cache keys.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum QueryInner {
    /// Matches no document.
    None,
    /// Matches every document.
    All,
    /// Leaf node: matches the documents listed for this term in the reverse index.
    Nop(Box<str>),
    /// Matches every document the wrapped expression does not match.
    Not(Box<QueryInner>),
    /// Matches documents matched by both sub-expressions.
    And(Box<QueryInner>, Box<QueryInner>),
    /// Matches documents matched by at least one sub-expression.
    Or(Box<QueryInner>, Box<QueryInner>),
}

/// Public handle around a [`QueryInner`] expression tree.
///
/// The tree itself is crate-private; callers assemble queries through the
/// constructor functions on `Query`.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct Query {
    /// Root of the boolean expression.
    pub(crate) inner: QueryInner,
}
20
21impl Query {
22 pub fn none() -> Self {
23 Query { inner: QueryInner::None }
24 }
25
26 pub fn all() -> Self {
27 Query { inner: QueryInner::All }
28 }
29
30 pub fn term<S>(term: &S) -> Self
31 where
32 S: AsRef<str> + ?Sized,
33 {
34 Query { inner: QueryInner::Nop(Box::from(term.as_ref())) }
35 }
36
37 pub fn not(order: Query) -> Self {
38 Query { inner: QueryInner::Not(Box::new(order.inner)) }
39 }
40
41 pub fn and(left: Query, right: Query) -> Self {
42 Query { inner: QueryInner::And(Box::new(left.inner), Box::new(right.inner)) }
43 }
44
45 pub fn or(left: Query, right: Query) -> Self {
46 Query { inner: QueryInner::Or(Box::new(left.inner), Box::new(right.inner)) }
47 }
48
49 pub fn from_freq_or(freq: &TermFrequency) -> Self {
50 let mut iter = freq.term_set_iter();
51 if let Some(first_term) = iter.next() {
52 let mut query = Query::term(first_term);
53 for term in iter {
54 let term_query = Query::term(term);
55 query = Query::or(query, term_query);
56 }
57 query
58 } else {
59 Query::none()
60 }
61 }
62
63 pub fn from_freq_and(freq: &TermFrequency) -> Self {
64 let mut iter = freq.term_set_iter();
65 if let Some(first_term) = iter.next() {
66 let mut query = Query::term(first_term);
67 for term in iter {
68 let term_query = Query::term(term);
69 query = Query::and(query, term_query);
70 }
71 query
72 } else {
73 Query::none()
74 }
75 }
76
77 pub fn get_all_terms(&self) -> Vec<&str> {
78 let mut terms = Vec::new();
79 Self::collect_terms_ref(&self.inner, &mut terms);
80 terms
81 }
82
83 pub(crate) fn collect_terms_ref<'a>(query: &'a QueryInner, terms: &mut Vec<&'a str>) {
84 match query {
85 QueryInner::All => {
86 }
88 QueryInner::None => {}
89 QueryInner::Nop(term) => {
90 terms.push(term);
91 }
92 QueryInner::Not(inner) => {
93 Self::collect_terms_ref(inner, terms);
94 }
95 QueryInner::And(left, right) => {
96 Self::collect_terms_ref(left, terms);
97 Self::collect_terms_ref(right, terms);
98 }
99 QueryInner::Or(left, right) => {
100 Self::collect_terms_ref(left, terms);
101 Self::collect_terms_ref(right, terms);
102 }
103 }
104 }
105
106 pub(crate) fn build_ref<K>(query: &QueryInner, term_dim_rev_index: &IndexMap<Box<str>, Vec<KeyRc<K>>>, documents: &IndexSet<KeyRc<K>>) -> Vec<usize>
107 where
108 K: Eq + std::hash::Hash,
109 {
110 match query {
111 QueryInner::All => {
112 let mut result = Vec::with_capacity(documents.len());
113 for (idx, _) in documents.iter().enumerate() {
114 result.push(idx);
115 }
116 result
117 }
118 QueryInner::None => Vec::new(),
119 QueryInner::Nop(term) => {
120 if let Some(doc_keys) = term_dim_rev_index.get(term) {
121 let mut result = Vec::with_capacity(doc_keys.len());
122 for doc_key in doc_keys {
123 if let Some(idx) = documents.get_index(doc_key) {
124 result.push(idx);
125 }
126 }
127 result.sort_unstable();
128 result
129 } else {
130 Vec::new()
131 }
132 }
133 QueryInner::Not(inner) => {
134 let inner_indices = Self::build_ref(inner, term_dim_rev_index, documents);
135 let mut result = Vec::with_capacity(documents.len() - inner_indices.len());
136 let mut inner_iter = inner_indices.iter().peekable();
137 for (idx, _) in documents.iter().enumerate() {
138 match inner_iter.peek() {
139 Some(&&inner_idx) if inner_idx == idx => {
140 inner_iter.next();
141 }
142 _ => {
143 result.push(idx);
144 }
145 }
146 }
147 result
148 }
149 QueryInner::And(left, right) => {
150 let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
151 let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
152 let mut result = Vec::with_capacity(std::cmp::min(left_indices.len(), right_indices.len()));
153 let mut l = 0;
154 let mut r = 0;
155 while l < left_indices.len() && r < right_indices.len() {
156 match left_indices[l].cmp(&right_indices[r]) {
157 std::cmp::Ordering::Less => {
158 l += 1;
159 }
160 std::cmp::Ordering::Greater => {
161 r += 1;
162 }
163 std::cmp::Ordering::Equal => {
164 result.push(left_indices[l]);
165 l += 1;
166 r += 1;
167 }
168 }
169 }
170 result
171 }
172 QueryInner::Or(left, right) => {
173 let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
174 let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
175 let mut result = Vec::with_capacity(left_indices.len() + right_indices.len());
176 let mut l = 0;
177 let mut r = 0;
178 while l < left_indices.len() || r < right_indices.len() {
179 if l >= left_indices.len() {
180 result.push(right_indices[r]);
181 r += 1;
182 } else if r >= right_indices.len() {
183 result.push(left_indices[l]);
184 l += 1;
185 } else {
186 match left_indices[l].cmp(&right_indices[r]) {
187 std::cmp::Ordering::Less => {
188 result.push(left_indices[l]);
189 l += 1;
190 }
191 std::cmp::Ordering::Greater => {
192 result.push(right_indices[r]);
193 r += 1;
194 }
195 std::cmp::Ordering::Equal => {
196 result.push(left_indices[l]);
197 l += 1;
198 r += 1;
199 }
200 }
201 }
202 }
203 result
204 }
205 }
206 }
207
208 pub fn build<K>(&self, term_dim_rev_index: &IndexMap<Box<str>, Vec<KeyRc<K>>>, documents: &IndexSet<KeyRc<K>>) -> Vec<usize>
209 where
210 K: Eq + std::hash::Hash,
211 {
212 let mut res = Self::build_ref(&self.inner, term_dim_rev_index, documents);
213 res.sort_unstable();
214 res.dedup();
215 res
216 }
217}