1use adk_core::Part;
4use std::collections::HashSet;
5
6pub fn extract_text(content: &adk_core::Content) -> String {
11 content
12 .parts
13 .iter()
14 .filter_map(|part| match part {
15 Part::Text { text } => Some(text.as_str()),
16 _ => None,
17 })
18 .collect::<Vec<_>>()
19 .join(" ")
20}
21
/// Returns `true` when `c` lies in one of the CJK-related Unicode blocks
/// listed in the table below (bounds are inclusive).
fn is_cjk_char(c: char) -> bool {
    const CJK_BLOCKS: [(char, char); 8] = [
        ('\u{2e80}', '\u{2eff}'), // CJK Radicals Supplement
        ('\u{3000}', '\u{303f}'), // CJK Symbols and Punctuation
        ('\u{3040}', '\u{309f}'), // Hiragana
        ('\u{30a0}', '\u{30ff}'), // Katakana
        ('\u{3400}', '\u{4dbf}'), // CJK Unified Ideographs Extension A
        ('\u{4e00}', '\u{9fff}'), // CJK Unified Ideographs
        ('\u{ac00}', '\u{d7af}'), // Hangul Syllables
        ('\u{f900}', '\u{faff}'), // CJK Compatibility Ideographs
    ];

    CJK_BLOCKS
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
35
36pub fn extract_words(text: &str) -> HashSet<String> {
43 let mut words = HashSet::new();
44
45 for token in text.split_whitespace() {
46 if token.is_empty() {
47 continue;
48 }
49 let lower = token.to_lowercase();
50
51 let has_cjk = lower.chars().any(is_cjk_char);
53
54 if has_cjk {
55 let chars: Vec<char> = lower.chars().collect();
58 for c in &chars {
59 if is_cjk_char(*c) {
60 words.insert(c.to_string());
61 }
62 }
63 for window in chars.windows(2) {
64 if window.iter().any(|c| is_cjk_char(*c)) {
65 let bigram: String = window.iter().collect();
66 words.insert(bigram);
67 }
68 }
69 words.insert(lower);
71 } else {
72 words.insert(lower);
73 }
74 }
75
76 if !text.contains(char::is_whitespace) && text.chars().any(is_cjk_char) {
78 let lower = text.to_lowercase();
79 let chars: Vec<char> = lower.chars().collect();
80 for c in &chars {
81 if is_cjk_char(*c) {
82 words.insert(c.to_string());
83 }
84 }
85 for window in chars.windows(2) {
86 if window.iter().any(|c| is_cjk_char(*c)) {
87 let bigram: String = window.iter().collect();
88 words.insert(bigram);
89 }
90 }
91 words.insert(lower);
93 }
94
95 words
96}
97
98pub fn extract_words_from_content(content: &adk_core::Content) -> HashSet<String> {
100 let mut words = HashSet::new();
101 for part in &content.parts {
102 if let Part::Text { text } = part {
103 words.extend(extract_words(text));
104 }
105 }
106 words
107}
108
#[cfg(test)]
mod tests {
    use super::*;

    /// Whitespace-separated Latin words are lowercased and kept whole.
    #[test]
    fn test_extract_words_english() {
        let words = extract_words("Hello World foo bar");

        assert!(words.contains("hello"));
        assert!(words.contains("world"));
        assert!(words.contains("foo"));
        assert!(words.contains("bar"));
    }

    /// A short CJK query shares at least one term with a longer stored
    /// sentence that mixes CJK and Latin characters.
    #[test]
    fn test_extract_words_cjk_bigram_matching() {
        let stored = extract_words("用户喜欢用Rust编程");
        let query = extract_words("编程");

        assert!(
            !stored.is_disjoint(&query),
            "CJK search should find matches. Stored: {stored:?}, Query: {query:?}"
        );
    }

    /// A two-character query shares at least one term with an unsegmented
    /// CJK sentence that contains it.
    #[test]
    fn test_extract_words_cjk_single_char() {
        let stored = extract_words("今天天气很好");
        let query = extract_words("天气");

        assert!(
            !stored.is_disjoint(&query),
            "CJK bigram '天气' should match. Stored: {stored:?}, Query: {query:?}"
        );
    }

    /// Latin tokens stay whole while a CJK token also yields its single
    /// characters alongside the full token.
    #[test]
    fn test_extract_words_mixed_cjk_english() {
        let words = extract_words("Hello 你好 World");

        assert!(words.contains("hello"));
        assert!(words.contains("world"));
        assert!(words.contains("你"));
        assert!(words.contains("好"));
        assert!(words.contains("你好"));
    }

    /// Kanji and katakana are handled like other CJK text.
    #[test]
    fn test_extract_words_japanese() {
        let stored = extract_words("東京タワー");
        let query = extract_words("東京");

        assert!(!stored.is_disjoint(&query), "Japanese bigram should match");
    }
}