tf_idf_vectorizer/vectorizer/evaluate/
query.rs

1use crate::{TermFrequency, utils::datastruct::map::{IndexMap, IndexSet}, vectorizer::KeyRc};
2
/// Expression tree that a query is made of.
///
/// Leaves select a base set of documents; the remaining variants combine
/// sub-expressions with boolean logic.
#[derive(Debug, Clone)]
pub enum QueryInner {
    /// Selects no documents.
    None,
    /// Selects every document.
    All,
    /// Leaf holding a single term; selects the documents listed for that term.
    Nop(Box<str>),
    /// Selects every document the wrapped expression does not select.
    Not(Box<QueryInner>),
    /// Selects documents selected by both sub-expressions.
    And(Box<QueryInner>, Box<QueryInner>),
    /// Selects documents selected by at least one sub-expression.
    Or(Box<QueryInner>, Box<QueryInner>),
}
12
13/// Query Structure
14///
15/// Represents a search query with logical filtering conditions.
16#[derive(Clone, Debug)]
17pub struct Query {
18    pub(crate) inner: QueryInner,
19}
20
21impl Query {
22    pub fn none() -> Self {
23        Query { inner: QueryInner::None }
24    }
25
26    pub fn all() -> Self {
27        Query { inner: QueryInner::All }
28    }
29
30    pub fn term<S>(term: &S) -> Self 
31    where
32        S: AsRef<str> + ?Sized,
33    {
34        Query { inner: QueryInner::Nop(Box::from(term.as_ref())) }
35    }
36
37    pub fn not(order: Query) -> Self {
38        Query { inner: QueryInner::Not(Box::new(order.inner)) }
39    }
40
41    pub fn and(left: Query, right: Query) -> Self {
42        Query { inner: QueryInner::And(Box::new(left.inner), Box::new(right.inner)) }
43    }
44
45    pub fn or(left: Query, right: Query) -> Self {
46        Query { inner: QueryInner::Or(Box::new(left.inner), Box::new(right.inner)) }
47    }
48
49    pub fn from_freq_or(freq: &TermFrequency) -> Self {
50        let mut iter = freq.term_set_iter();
51        if let Some(first_term) = iter.next() {
52            let mut query = Query::term(first_term);
53            for term in iter {
54                let term_query = Query::term(term);
55                query = Query::or(query, term_query);
56            }
57            query
58        } else {
59            Query::none()
60        }
61    }
62
63    pub fn from_freq_and(freq: &TermFrequency) -> Self {
64        let mut iter = freq.term_set_iter();
65        if let Some(first_term) = iter.next() {
66            let mut query = Query::term(first_term);
67            for term in iter {
68                let term_query = Query::term(term);
69                query = Query::and(query, term_query);
70            }
71            query
72        } else {
73            Query::none()
74        }
75    }
76
77    pub fn get_all_terms(&self) -> Vec<&str> {
78        let mut terms = Vec::new();
79        Self::collect_terms_ref(&self.inner, &mut terms);
80        terms
81    }
82
83    pub(crate) fn collect_terms_ref<'a>(query: &'a QueryInner, terms: &mut Vec<&'a str>) {
84        match query {
85            QueryInner::All => {
86                // do nothing
87            }
88            QueryInner::None => {}
89            QueryInner::Nop(term) => {
90                terms.push(term);
91            }
92            QueryInner::Not(inner) => {
93                Self::collect_terms_ref(inner, terms);
94            }
95            QueryInner::And(left, right) => {
96                Self::collect_terms_ref(left, terms);
97                Self::collect_terms_ref(right, terms);
98            }
99            QueryInner::Or(left, right) => {
100                Self::collect_terms_ref(left, terms);
101                Self::collect_terms_ref(right, terms);
102            }
103        }
104    }
105
106    pub(crate) fn build_ref<K>(query: &QueryInner, term_dim_rev_index: &IndexMap<Box<str>, Vec<KeyRc<K>>>, documents: &IndexSet<KeyRc<K>>) -> Vec<usize> 
107    where 
108        K: Eq + std::hash::Hash,
109    {
110        match query {
111            QueryInner::All => {
112                let mut result = Vec::with_capacity(documents.len());
113                for (idx, _) in documents.iter().enumerate() {
114                    result.push(idx);
115                }
116                result
117            }
118            QueryInner::None => Vec::new(),
119            QueryInner::Nop(term) => {
120                if let Some(doc_keys) = term_dim_rev_index.get(term) {
121                    let mut result = Vec::with_capacity(doc_keys.len());
122                    for doc_key in doc_keys {
123                        if let Some(idx) = documents.get_index(doc_key) {
124                            result.push(idx);
125                        }
126                    }
127                    result.sort_unstable();
128                    result
129                } else {
130                    Vec::new()
131                }
132            }
133            QueryInner::Not(inner) => {
134                let inner_indices = Self::build_ref(inner, term_dim_rev_index, documents);
135                let mut result = Vec::with_capacity(documents.len() - inner_indices.len());
136                let mut inner_iter = inner_indices.iter().peekable();
137                for (idx, _) in documents.iter().enumerate() {
138                    match inner_iter.peek() {
139                        Some(&&inner_idx) if inner_idx == idx => {
140                            inner_iter.next();
141                        }
142                        _ => {
143                            result.push(idx);
144                        }
145                    }
146                }
147                result
148            }
149            QueryInner::And(left, right) => {
150                let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
151                let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
152                let mut result = Vec::with_capacity(std::cmp::min(left_indices.len(), right_indices.len()));
153                let mut l = 0;
154                let mut r = 0;
155                while l < left_indices.len() && r < right_indices.len() {
156                    match left_indices[l].cmp(&right_indices[r]) {
157                        std::cmp::Ordering::Less => {
158                            l += 1;
159                        }
160                        std::cmp::Ordering::Greater => {
161                            r += 1;
162                        }
163                        std::cmp::Ordering::Equal => {
164                            result.push(left_indices[l]);
165                            l += 1;
166                            r += 1;
167                        }
168                    }
169                }
170                result
171            }
172            QueryInner::Or(left, right) => {
173                let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
174                let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
175                let mut result = Vec::with_capacity(left_indices.len() + right_indices.len());
176                let mut l = 0;
177                let mut r = 0;
178                while l < left_indices.len() || r < right_indices.len() {
179                    if l >= left_indices.len() {
180                        result.push(right_indices[r]);
181                        r += 1;
182                    } else if r >= right_indices.len() {
183                        result.push(left_indices[l]);
184                        l += 1;
185                    } else {
186                        match left_indices[l].cmp(&right_indices[r]) {
187                            std::cmp::Ordering::Less => {
188                                result.push(left_indices[l]);
189                                l += 1;
190                            }
191                            std::cmp::Ordering::Greater => {
192                                result.push(right_indices[r]);
193                                r += 1;
194                            }
195                            std::cmp::Ordering::Equal => {
196                                result.push(left_indices[l]);
197                                l += 1;
198                                r += 1;
199                            }
200                        }
201                    }
202                }
203                result
204            }
205        }
206    }
207
208    pub fn build<K>(&self, term_dim_rev_index: &IndexMap<Box<str>, Vec<KeyRc<K>>>, documents: &IndexSet<KeyRc<K>>) -> Vec<usize> 
209    where 
210        K: Eq + std::hash::Hash,
211    {
212        let mut res = Self::build_ref(&self.inner, term_dim_rev_index, documents);
213        res.sort_unstable();
214        res.dedup();
215        res
216    }
217}