1use std::collections::BTreeSet;
2
/// Filler words that are dropped from a query.
///
/// Tokens are ASCII-lowercased before being compared against this list, so
/// every entry must be lowercase to ever match.
const STOP_WORDS: &[&str] = &[
    "the", "a", "an", "is", "are", "where", "what", "how", "for", "to", "before", "after", "and",
    "or", "in", "of", "with", "on",
];

/// Maximum number of terms kept in a compacted query.
const MAX_QUERY_TERMS: usize = 10;

/// Reduces a free-form question to a short, space-separated list of terms.
///
/// The input is split on every character that is not an ASCII letter, an
/// ASCII digit, `_` or `\`, so identifiers such as `user_id` and
/// backslash-separated names such as `App\Models` stay in one piece, while
/// any other character (including non-ASCII letters) acts as a separator.
///
/// A token is then discarded when it
/// * contains no ASCII letter or digit (empty, or only `_` / `\`),
/// * is one of [`STOP_WORDS`], compared case-insensitively, or
/// * has already been seen, compared case-insensitively.
///
/// The surviving tokens keep their original spelling and order. At most
/// [`MAX_QUERY_TERMS`] of them are returned, joined by single spaces; the
/// result is an empty string when nothing survives.
pub fn compact_query(input: &str) -> String {
    // ASCII-lowercased form of every term accepted so far, used for de-duplication.
    let mut seen = BTreeSet::new();

    input
        .split(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '\\'))
        // Also rejects empty tokens, which contain no characters at all.
        .filter(|token| token.chars().any(|c| c.is_ascii_alphanumeric()))
        .filter(|token| {
            let lower = token.to_ascii_lowercase();
            // `insert` returns false for a repeat, so only the first spelling is kept.
            !STOP_WORDS.contains(&lower.as_str()) && seen.insert(lower)
        })
        .take(MAX_QUERY_TERMS)
        .collect::<Vec<_>>()
        .join(" ")
}
30
#[cfg(test)]
mod tests {
    use super::compact_query;

    /// Stop words are removed, and no more than ten terms are returned.
    #[test]
    fn strips_noise_and_limits_words() {
        let query = compact_query(
            "where is consent checked before discharge export in the patient service and controller",
        );
        assert_eq!(
            query,
            "consent checked discharge export patient service controller"
        );

        // Twelve distinct terms go in; only the first ten come out.
        let capped = compact_query("t1 t2 t3 t4 t5 t6 t7 t8 t9 t10 t11 t12");
        assert_eq!(capped, "t1 t2 t3 t4 t5 t6 t7 t8 t9 t10");
    }

    /// Repeats are detected case-insensitively and the first spelling is kept.
    #[test]
    fn deduplicates_case_insensitively() {
        assert_eq!(
            compact_query("Consent consent CONSENT export"),
            "Consent export"
        );
    }

    /// `_` and `\` do not split a term when they appear inside it.
    #[test]
    fn keeps_underscores_and_backslashes_inside_terms() {
        assert_eq!(
            compact_query("user_id in App\\Models"),
            "user_id App\\Models"
        );
    }

    /// Input with nothing worth keeping produces an empty string.
    #[test]
    fn returns_empty_string_when_nothing_survives() {
        assert_eq!(compact_query(""), "");
        assert_eq!(compact_query("where is the, and of?"), "");
    }
}