Skip to main content

tf_idf_vectorizer/vectorizer/evaluate/
query.rs

1use crate::{TermFrequency, utils::datastruct::map::{IndexMap, IndexSet}};
2
/// Internal expression tree behind a [`Query`].
///
/// Leaves select a set of documents; the remaining variants combine the
/// sets selected by their children.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum QueryInner {
    /// Matches no documents.
    None,
    /// Matches every document.
    All,
    /// Matches the documents that contain the given term.
    Nop(Box<str>),
    /// Matches the documents that the inner query does not match.
    Not(Box<QueryInner>),
    /// Matches the documents matched by both sub-queries.
    And(Box<QueryInner>, Box<QueryInner>),
    /// Matches the documents matched by at least one sub-query.
    Or(Box<QueryInner>, Box<QueryInner>),
}

/// Query Structure
///
/// Represents a search query with logical filtering conditions.
///
/// Two queries compare equal only when their expression trees are
/// structurally identical; logically equivalent but differently shaped
/// trees (for example `a AND b` versus `b AND a`) are not equal.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Query {
    pub(crate) inner: QueryInner,
}
20
21impl Query {
22    pub fn none() -> Self {
23        Query { inner: QueryInner::None }
24    }
25
26    pub fn all() -> Self {
27        Query { inner: QueryInner::All }
28    }
29
30    pub fn term<S>(term: &S) -> Self 
31    where
32        S: AsRef<str> + ?Sized,
33    {
34        Query { inner: QueryInner::Nop(Box::from(term.as_ref())) }
35    }
36
37    pub fn not(order: Query) -> Self {
38        Query { inner: QueryInner::Not(Box::new(order.inner)) }
39    }
40
41    pub fn and(left: Query, right: Query) -> Self {
42        Query { inner: QueryInner::And(Box::new(left.inner), Box::new(right.inner)) }
43    }
44
45    pub fn or(left: Query, right: Query) -> Self {
46        Query { inner: QueryInner::Or(Box::new(left.inner), Box::new(right.inner)) }
47    }
48
49    pub fn from_freq_or(freq: &TermFrequency) -> Self {
50        let mut iter = freq.term_set_iter();
51        if let Some(first_term) = iter.next() {
52            let mut query = Query::term(first_term);
53            for term in iter {
54                let term_query = Query::term(term);
55                query = Query::or(query, term_query);
56            }
57            query
58        } else {
59            Query::none()
60        }
61    }
62
63    pub fn from_freq_and(freq: &TermFrequency) -> Self {
64        let mut iter = freq.term_set_iter();
65        if let Some(first_term) = iter.next() {
66            let mut query = Query::term(first_term);
67            for term in iter {
68                let term_query = Query::term(term);
69                query = Query::and(query, term_query);
70            }
71            query
72        } else {
73            Query::none()
74        }
75    }
76
77    pub fn get_all_terms(&self) -> Vec<&str> {
78        let mut terms = Vec::new();
79        Self::collect_terms_ref(&self.inner, &mut terms);
80        terms
81    }
82
83    pub(crate) fn collect_terms_ref<'a>(query: &'a QueryInner, terms: &mut Vec<&'a str>) {
84        match query {
85            QueryInner::All => {
86                // do nothing
87            }
88            QueryInner::None => {}
89            QueryInner::Nop(term) => {
90                terms.push(term);
91            }
92            QueryInner::Not(inner) => {
93                Self::collect_terms_ref(inner, terms);
94            }
95            QueryInner::And(left, right) => {
96                Self::collect_terms_ref(left, terms);
97                Self::collect_terms_ref(right, terms);
98            }
99            QueryInner::Or(left, right) => {
100                Self::collect_terms_ref(left, terms);
101                Self::collect_terms_ref(right, terms);
102            }
103        }
104    }
105
106    pub(crate) fn build_ref<K>(query: &QueryInner, term_dim_rev_index: &IndexMap<Box<str>, Vec<u32>>, documents: &IndexSet<K>) -> Vec<usize> 
107    where 
108        K: Eq + std::hash::Hash,
109    {
110        match query {
111            QueryInner::All => {
112                let mut result = Vec::with_capacity(documents.len());
113                for (idx, _) in documents.iter().enumerate() {
114                    result.push(idx);
115                }
116                result
117            }
118            QueryInner::None => Vec::new(),
119            QueryInner::Nop(term) => {
120                if let Some(doc_keys) = term_dim_rev_index.get(term) {
121                    let mut result = doc_keys.iter().map(|&id| id as usize).collect::<Vec<usize>>();
122                    result.sort_unstable();
123                    result
124                } else {
125                    Vec::new()
126                }
127            }
128            QueryInner::Not(inner) => {
129                let inner_indices = Self::build_ref(inner, term_dim_rev_index, documents);
130                let mut result = Vec::with_capacity(documents.len() - inner_indices.len());
131                let mut inner_iter = inner_indices.iter().peekable();
132                for (idx, _) in documents.iter().enumerate() {
133                    match inner_iter.peek() {
134                        Some(&&inner_idx) if inner_idx == idx => {
135                            inner_iter.next();
136                        }
137                        _ => {
138                            result.push(idx);
139                        }
140                    }
141                }
142                result
143            }
144            QueryInner::And(left, right) => {
145                let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
146                let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
147                let mut result = Vec::with_capacity(std::cmp::min(left_indices.len(), right_indices.len()));
148                let mut l = 0;
149                let mut r = 0;
150                while l < left_indices.len() && r < right_indices.len() {
151                    match left_indices[l].cmp(&right_indices[r]) {
152                        std::cmp::Ordering::Less => {
153                            l += 1;
154                        }
155                        std::cmp::Ordering::Greater => {
156                            r += 1;
157                        }
158                        std::cmp::Ordering::Equal => {
159                            result.push(left_indices[l]);
160                            l += 1;
161                            r += 1;
162                        }
163                    }
164                }
165                result
166            }
167            QueryInner::Or(left, right) => {
168                let left_indices = Self::build_ref(left, term_dim_rev_index, documents);
169                let right_indices = Self::build_ref(right, term_dim_rev_index, documents);
170                let mut result = Vec::with_capacity(left_indices.len() + right_indices.len());
171                let mut l = 0;
172                let mut r = 0;
173                while l < left_indices.len() || r < right_indices.len() {
174                    if l >= left_indices.len() {
175                        result.push(right_indices[r]);
176                        r += 1;
177                    } else if r >= right_indices.len() {
178                        result.push(left_indices[l]);
179                        l += 1;
180                    } else {
181                        match left_indices[l].cmp(&right_indices[r]) {
182                            std::cmp::Ordering::Less => {
183                                result.push(left_indices[l]);
184                                l += 1;
185                            }
186                            std::cmp::Ordering::Greater => {
187                                result.push(right_indices[r]);
188                                r += 1;
189                            }
190                            std::cmp::Ordering::Equal => {
191                                result.push(left_indices[l]);
192                                l += 1;
193                                r += 1;
194                            }
195                        }
196                    }
197                }
198                result
199            }
200        }
201    }
202
203    pub fn build<K>(&self, term_dim_rev_index: &IndexMap<Box<str>, Vec<u32>>, documents: &IndexSet<K>) -> Vec<usize> 
204    where 
205        K: Eq + std::hash::Hash,
206    {
207        let mut res = Self::build_ref(&self.inner, term_dim_rev_index, documents);
208        res.sort_unstable();
209        res.dedup();
210        res
211    }
212}