tf_idf_vectorizer/vectorizer/evaluate/
query.rs1use crate::{TermFrequency, utils::datastruct::map::{IndexMap, IndexSet}};
2
/// Expression tree behind a [`Query`].
///
/// Leaves are `None`, `All` and `Nop`; `Not`, `And` and `Or` combine boxed
/// sub-expressions.
#[derive(Debug, Clone)]
pub enum QueryInner {
    /// Matches no document.
    None,
    /// Matches every document.
    All,
    /// Leaf holding a single term.
    Nop(Box<str>),
    /// Negation of the wrapped expression.
    Not(Box<QueryInner>),
    /// Conjunction of the left and right expressions.
    And(Box<QueryInner>, Box<QueryInner>),
    /// Disjunction of the left and right expressions.
    Or(Box<QueryInner>, Box<QueryInner>),
}
12
13#[derive(Clone, Debug)]
17pub struct Query {
18 pub(crate) inner: QueryInner,
19}
20
21impl Query {
22 pub fn none() -> Self {
23 Query { inner: QueryInner::None }
24 }
25
26 pub fn all() -> Self {
27 Query { inner: QueryInner::All }
28 }
29
30 pub fn term<S>(term: &S) -> Self
31 where
32 S: AsRef<str> + ?Sized,
33 {
34 Query { inner: QueryInner::Nop(Box::from(term.as_ref())) }
35 }
36
37 pub fn not(order: Query) -> Self {
38 Query { inner: QueryInner::Not(Box::new(order.inner)) }
39 }
40
41 pub fn and(left: Query, right: Query) -> Self {
42 Query { inner: QueryInner::And(Box::new(left.inner), Box::new(right.inner)) }
43 }
44
45 pub fn or(left: Query, right: Query) -> Self {
46 Query { inner: QueryInner::Or(Box::new(left.inner), Box::new(right.inner)) }
47 }
48
49 pub fn from_freq_or(freq: &TermFrequency) -> Self {
50 let mut iter = freq.term_set_iter();
51 if let Some(first_term) = iter.next() {
52 let mut query = Query::term(first_term);
53 for term in iter {
54 let term_query = Query::term(term);
55 query = Query::or(query, term_query);
56 }
57 query
58 } else {
59 Query::none()
60 }
61 }
62
63 pub fn from_freq_and(freq: &TermFrequency) -> Self {
64 let mut iter = freq.term_set_iter();
65 if let Some(first_term) = iter.next() {
66 let mut query = Query::term(first_term);
67 for term in iter {
68 let term_query = Query::term(term);
69 query = Query::and(query, term_query);
70 }
71 query
72 } else {
73 Query::none()
74 }
75 }
76
77 pub fn get_all_terms(&self) -> Vec<&str> {
78 let mut terms = Vec::new();
79 Self::collect_terms_ref(&self.inner, &mut terms);
80 terms
81 }
82
83 pub(crate) fn collect_terms_ref<'a>(query: &'a QueryInner, terms: &mut Vec<&'a str>) {
84 match query {
85 QueryInner::All => {
86 }
88 QueryInner::None => {}
89 QueryInner::Nop(term) => {
90 terms.push(term);
91 }
92 QueryInner::Not(inner) => {
93 Self::collect_terms_ref(inner, terms);
94 }
95 QueryInner::And(left, right) => {
96 Self::collect_terms_ref(left, terms);
97 Self::collect_terms_ref(right, terms);
98 }
99 QueryInner::Or(left, right) => {
100 Self::collect_terms_ref(left, terms);
101 Self::collect_terms_ref(right, terms);
102 }
103 }
104 }
105
106 pub(crate) fn build_ref<K>(query: &QueryInner, term_dim_rev_index: &IndexMap<Box<str>, Vec<u32>>, documents: &IndexSet<K>) -> Vec<usize>
107 where
108 K: Eq + std::hash::Hash,
109 {
110 match query {
111 QueryInner::All => {
112 let mut result = Vec::with_capacity(documents.len());
113 for (idx, _) in documents.iter().enumerate() {
114 result.push(idx);
115 }
116 result
117 }
118 QueryInner::None => Vec::new(),
119 QueryInner::Nop(term) => {
120 if let Some(doc_keys) = term_dim_rev_index.get(term) {
121 let mut result = doc_keys.iter().map(|&id| id as usize).collect::<Vec<usize>>();
122 result.sort_unstable();
123 result
124 } else {
125 Vec::new()
126 }
127 }
128 QueryInner::Not(inner) => {
129 let inner_indices = Self::build_ref(inner, term_dim_rev_index, documents);
130 let mut result = Vec::with_capacity(documents.len() - inner_indices.len());
131 let mut inner_iter = inner_indices.iter().peekable();
132 for (idx, _) in documents.iter().enumerate() {
133 match inner_iter.peek() {
134 Some(&&inner_idx) if inner_idx == idx => {
135 inner_iter.next();
136 }
137 _ => {
138 result.push(idx);
139 }
140 }
141 }
142 result
143 }
144 QueryInner::And(left, right) => {
145 let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
146 let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
147 let mut result = Vec::with_capacity(std::cmp::min(left_indices.len(), right_indices.len()));
148 let mut l = 0;
149 let mut r = 0;
150 while l < left_indices.len() && r < right_indices.len() {
151 match left_indices[l].cmp(&right_indices[r]) {
152 std::cmp::Ordering::Less => {
153 l += 1;
154 }
155 std::cmp::Ordering::Greater => {
156 r += 1;
157 }
158 std::cmp::Ordering::Equal => {
159 result.push(left_indices[l]);
160 l += 1;
161 r += 1;
162 }
163 }
164 }
165 result
166 }
167 QueryInner::Or(left, right) => {
168 let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
169 let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
170 let mut result = Vec::with_capacity(left_indices.len() + right_indices.len());
171 let mut l = 0;
172 let mut r = 0;
173 while l < left_indices.len() || r < right_indices.len() {
174 if l >= left_indices.len() {
175 result.push(right_indices[r]);
176 r += 1;
177 } else if r >= right_indices.len() {
178 result.push(left_indices[l]);
179 l += 1;
180 } else {
181 match left_indices[l].cmp(&right_indices[r]) {
182 std::cmp::Ordering::Less => {
183 result.push(left_indices[l]);
184 l += 1;
185 }
186 std::cmp::Ordering::Greater => {
187 result.push(right_indices[r]);
188 r += 1;
189 }
190 std::cmp::Ordering::Equal => {
191 result.push(left_indices[l]);
192 l += 1;
193 r += 1;
194 }
195 }
196 }
197 }
198 result
199 }
200 }
201 }
202
203 pub fn build<K>(&self, term_dim_rev_index: &IndexMap<Box<str>, Vec<u32>>, documents: &IndexSet<K>) -> Vec<usize>
204 where
205 K: Eq + std::hash::Hash,
206 {
207 let mut res = Self::build_ref(&self.inner, term_dim_rev_index, documents);
208 res.sort_unstable();
209 res.dedup();
210 res
211 }
212}