intent_engine/search.rs
1//! CJK (Chinese, Japanese, Korean) search utilities
2//!
3//! This module provides utilities for detecting CJK characters and determining
4//! when to use LIKE fallback vs FTS5 trigram search.
5//!
6//! **Background**: SQLite FTS5 with trigram tokenizer requires at least 3 consecutive
7//! characters to match. This is problematic for CJK languages where single-character
8//! or two-character searches are common (e.g., "用户", "认证").
9//!
//! **Solution**: For short CJK queries, we fall back to LIKE search, which supports
//! substring matching of any length, albeit more slowly.
12
/// Check whether a character belongs to a CJK (Chinese, Japanese, Korean) script.
///
/// Matching is done by Unicode block. Returns `true` for:
/// - Han ideographs: CJK Unified Ideographs, Extensions A through I, both
///   Compatibility Ideographs blocks, and the ideographic marks `々`, `〆`, `〇`;
/// - Japanese kana: Hiragana, Katakana, Katakana Phonetic Extensions and
///   halfwidth Katakana;
/// - Korean Hangul: syllables, Jamo (including Extended-A/B), Compatibility
///   Jamo (e.g. `ㅋ`) and halfwidth Hangul.
///
/// Everything else returns `false`, including ASCII, fullwidth Latin forms and
/// the rest of the CJK Symbols and Punctuation block (for example `。`).
pub fn is_cjk_char(c: char) -> bool {
    matches!(
        c,
        // --- Han ideographs ---
        // Ideographic iteration mark, closing mark and number zero (々 〆 〇)
        '\u{3005}'..='\u{3007}'
        // CJK Unified Ideographs Extension A
        | '\u{3400}'..='\u{4DBF}'
        // CJK Unified Ideographs (most common Chinese characters)
        | '\u{4E00}'..='\u{9FFF}'
        // CJK Compatibility Ideographs
        | '\u{F900}'..='\u{FAFF}'
        // CJK Unified Ideographs Extensions B, C, D, E, F
        | '\u{20000}'..='\u{2A6DF}'
        | '\u{2A700}'..='\u{2B73F}'
        | '\u{2B740}'..='\u{2B81F}'
        | '\u{2B820}'..='\u{2CEAF}'
        | '\u{2CEB0}'..='\u{2EBEF}'
        // CJK Unified Ideographs Extension I
        | '\u{2EBF0}'..='\u{2EE5F}'
        // CJK Compatibility Ideographs Supplement
        | '\u{2F800}'..='\u{2FA1F}'
        // CJK Unified Ideographs Extensions G and H
        | '\u{30000}'..='\u{3134F}'
        | '\u{31350}'..='\u{323AF}'
        // --- Japanese kana ---
        // Hiragana
        | '\u{3040}'..='\u{309F}'
        // Katakana
        | '\u{30A0}'..='\u{30FF}'
        // Katakana Phonetic Extensions
        | '\u{31F0}'..='\u{31FF}'
        // Halfwidth Katakana
        | '\u{FF66}'..='\u{FF9F}'
        // --- Korean Hangul ---
        // Hangul Jamo
        | '\u{1100}'..='\u{11FF}'
        // Hangul Compatibility Jamo (standalone consonants/vowels such as ㅋ)
        | '\u{3130}'..='\u{318F}'
        // Hangul Jamo Extended-A
        | '\u{A960}'..='\u{A97F}'
        // Hangul Syllables
        | '\u{AC00}'..='\u{D7AF}'
        // Hangul Jamo Extended-B
        | '\u{D7B0}'..='\u{D7FF}'
        // Halfwidth Hangul
        | '\u{FFA0}'..='\u{FFDC}'
    )
}
35
36/// Determine if a query should use LIKE fallback instead of FTS5 trigram
37///
38/// Returns `true` if:
39/// - Query is a single CJK character, OR
40/// - Query is two CJK characters
41///
42/// Trigram tokenizer requires 3+ characters for matching, so we use LIKE
43/// for shorter CJK queries to ensure they work.
44pub fn needs_like_fallback(query: &str) -> bool {
45 let chars: Vec<char> = query.chars().collect();
46
47 // Single-character CJK
48 if chars.len() == 1 && is_cjk_char(chars[0]) {
49 return true;
50 }
51
52 // Two-character all-CJK
53 // This is optional - could also let trigram handle it, but trigram
54 // needs minimum 3 chars so two-char CJK won't work well
55 if chars.len() == 2 && chars.iter().all(|c| is_cjk_char(*c)) {
56 return true;
57 }
58
59 false
60}
61
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `is_cjk_char` returns `expected` for every character,
    /// naming the offending character on failure.
    fn assert_cjk(chars: &[char], expected: bool) {
        for &c in chars {
            assert_eq!(
                is_cjk_char(c),
                expected,
                "is_cjk_char({:?}) (U+{:04X})",
                c,
                c as u32
            );
        }
    }

    /// Asserts that `needs_like_fallback` returns `expected` for every query,
    /// naming the offending query on failure.
    fn assert_fallback(queries: &[&str], expected: bool) {
        for &q in queries {
            assert_eq!(needs_like_fallback(q), expected, "needs_like_fallback({:?})", q);
        }
    }

    #[test]
    fn test_is_cjk_char() {
        // Chinese characters
        assert_cjk(&['中', '文', '认', '证'], true);
        // Japanese Hiragana
        assert_cjk(&['あ', 'い'], true);
        // Japanese Katakana
        assert_cjk(&['ア', 'イ'], true);
        // Korean Hangul
        assert_cjk(&['가', '나'], true);
        // Non-CJK
        assert_cjk(&['a', 'A', '1', ' ', '.'], false);
    }

    #[test]
    fn test_is_cjk_char_block_boundaries() {
        // First and last code points of the Unified Ideographs block, the
        // first code point of Extension A, and the first of Extension B
        // (outside the Basic Multilingual Plane).
        assert_cjk(&['\u{4E00}', '\u{9FFF}', '\u{3400}', '\u{20000}'], true);
        // Code points just outside those blocks.
        assert_cjk(&['\u{33FF}', '\u{4DC0}', '\u{A000}'], false);
    }

    #[test]
    fn test_needs_like_fallback() {
        // Single CJK character - needs fallback
        assert_fallback(&["中", "认", "あ", "가"], true);
        // Two CJK characters - needs fallback
        assert_fallback(&["中文", "认证", "用户"], true);
        // Three+ CJK characters - can use FTS5
        assert_fallback(&["用户认", "用户认证"], false);
        // English - can use FTS5 (including a single ASCII char, which is not CJK)
        assert_fallback(&["JWT", "auth", "a"], false);
        // Mixed - can use FTS5
        assert_fallback(&["JWT认证", "API接口"], false);
    }

    #[test]
    fn test_needs_like_fallback_edge_cases() {
        // Empty query never needs the fallback.
        assert_fallback(&[""], false);
        // Two-character queries that are not entirely CJK.
        assert_fallback(&["ab", "a中", "中a"], false);
        // Length is counted in chars, not bytes: supplementary-plane
        // ideographs are four bytes each but still one character.
        assert_fallback(&["\u{20000}", "\u{20000}\u{20001}"], true);
        assert_fallback(&["\u{20000}\u{20001}\u{20002}"], false);
    }
}
120}